2 * Copyright (c) 2010-2012 Citrix Inc.
3 * Copyright (c) 2009-2012,2016 Microsoft Corp.
4 * Copyright (c) 2012 NetApp Inc.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice unmodified, this list of conditions, and the following
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * Copyright (c) 2004-2006 Kip Macy
31 * All rights reserved.
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
58 #include "opt_inet6.h"
62 #include <sys/param.h>
64 #include <sys/kernel.h>
65 #include <sys/limits.h>
66 #include <sys/malloc.h>
68 #include <sys/module.h>
70 #include <sys/queue.h>
73 #include <sys/socket.h>
74 #include <sys/sockio.h>
76 #include <sys/sysctl.h>
77 #include <sys/systm.h>
78 #include <sys/taskqueue.h>
79 #include <sys/buf_ring.h>
81 #include <machine/atomic.h>
82 #include <machine/in_cksum.h>
85 #include <net/ethernet.h>
87 #include <net/if_arp.h>
88 #include <net/if_media.h>
89 #include <net/if_types.h>
90 #include <net/if_var.h>
91 #include <net/if_vlan_var.h>
92 #include <net/rndis.h>
94 #include <netinet/in_systm.h>
95 #include <netinet/in.h>
96 #include <netinet/ip.h>
97 #include <netinet/ip6.h>
98 #include <netinet/tcp.h>
99 #include <netinet/tcp_lro.h>
100 #include <netinet/udp.h>
102 #include <dev/hyperv/include/hyperv.h>
103 #include <dev/hyperv/include/hyperv_busdma.h>
104 #include <dev/hyperv/include/vmbus.h>
105 #include <dev/hyperv/include/vmbus_xact.h>
107 #include <dev/hyperv/netvsc/ndis.h>
108 #include <dev/hyperv/netvsc/if_hnreg.h>
109 #include <dev/hyperv/netvsc/if_hnvar.h>
110 #include <dev/hyperv/netvsc/hn_nvs.h>
111 #include <dev/hyperv/netvsc/hn_rndis.h>
113 #include "vmbus_if.h"
115 #define HN_IFSTART_SUPPORT
117 #define HN_RING_CNT_DEF_MAX 8
119 /* YYY should get it from the underlying channel */
120 #define HN_TX_DESC_CNT 512
122 #define HN_RNDIS_PKT_LEN \
123 (sizeof(struct rndis_packet_msg) + \
124 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \
125 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \
126 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \
127 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
128 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE
129 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE
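/*
 * Illustrative sketch (layout inferred from hn_encap() below):
 * HN_RNDIS_PKT_LEN is the worst-case size of the RNDIS packet message,
 * i.e. the fixed header plus every per-packet-info this driver may
 * append (hash value, VLAN, LSOv2, TX checksum).  In the chimney send
 * path the payload is copied right behind it:
 *
 *	+------------------+ <- HN_RNDIS_PKT_ALIGN aligned
 *	| rndis_packet_msg |
 *	| pktinfo ...      |  <= HN_RNDIS_PKT_LEN in total
 *	+------------------+
 *	| packet data      |
 *	+------------------+
 *
 * while the sglist path leaves the payload in the mbuf and describes
 * it with separate GPA entries.
 */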
131 #define HN_TX_DATA_BOUNDARY PAGE_SIZE
132 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET
133 #define HN_TX_DATA_SEGSIZE PAGE_SIZE
134 /* -1 for RNDIS packet message */
135 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1)
137 #define HN_DIRECT_TX_SIZE_DEF 128
139 #define HN_EARLY_TXEOF_THRESH 8
141 #define HN_PKTBUF_LEN_DEF (16 * 1024)
143 #define HN_LROENT_CNT_DEF 128
145 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU)
146 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU)
147 /* YYY 2*MTU is a bit rough, but should be good enough. */
148 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu)
150 #define HN_LRO_ACKCNT_DEF 1
152 #define HN_LOCK_INIT(sc) \
153 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
154 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock)
155 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED)
156 #define HN_LOCK(sc) \
158 while (sx_try_xlock(&(sc)->hn_lock) == 0) \
161 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock)
163 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
164 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP)
165 #define HN_CSUM_IP_HWASSIST(sc) \
166 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
167 #define HN_CSUM_IP6_HWASSIST(sc) \
168 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
170 #define HN_PKTSIZE_MIN(align) \
171 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
172 HN_RNDIS_PKT_LEN, (align))
173 #define HN_PKTSIZE(m, align) \
174 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
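/*
 * Worked example with hypothetical numbers: for (align) == 32 and a
 * 1514-byte frame, HN_PKTSIZE() yields roundup2(1514 + HN_RNDIS_PKT_LEN,
 * 32); if HN_RNDIS_PKT_LEN were 102, that would be roundup2(1616, 32)
 * == 1632.  HN_PKTSIZE_MIN() applies the same formula to the smallest
 * VLAN-tagged frame (sans CRC); the aggregation code uses it to decide
 * whether one more packet could still fit into a chimney buffer.
 */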
176 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus)
179 #ifndef HN_USE_TXDESC_BUFRING
180 SLIST_ENTRY(hn_txdesc) link;
182 STAILQ_ENTRY(hn_txdesc) agg_link;
184 /* Aggregated txdescs, in sending order. */
185 STAILQ_HEAD(, hn_txdesc) agg_list;
187 /* The oldest packet, if transmission aggregation happens. */
189 struct hn_tx_ring *txr;
191 uint32_t flags; /* HN_TXD_FLAG_ */
192 struct hn_nvs_sendctx send_ctx;
196 bus_dmamap_t data_dmap;
198 bus_addr_t rndis_pkt_paddr;
199 struct rndis_packet_msg *rndis_pkt;
200 bus_dmamap_t rndis_pkt_dmap;
203 #define HN_TXD_FLAG_ONLIST 0x0001
204 #define HN_TXD_FLAG_DMAMAP 0x0002
205 #define HN_TXD_FLAG_ONAGG 0x0004
214 #define HN_RXINFO_VLAN 0x0001
215 #define HN_RXINFO_CSUM 0x0002
216 #define HN_RXINFO_HASHINF 0x0004
217 #define HN_RXINFO_HASHVAL 0x0008
218 #define HN_RXINFO_ALL \
221 HN_RXINFO_HASHINF | \
224 #define HN_NDIS_VLAN_INFO_INVALID 0xffffffff
225 #define HN_NDIS_RXCSUM_INFO_INVALID 0
226 #define HN_NDIS_HASH_INFO_INVALID 0
228 static int hn_probe(device_t);
229 static int hn_attach(device_t);
230 static int hn_detach(device_t);
231 static int hn_shutdown(device_t);
232 static void hn_chan_callback(struct vmbus_channel *,
235 static void hn_init(void *);
236 static int hn_ioctl(struct ifnet *, u_long, caddr_t);
237 #ifdef HN_IFSTART_SUPPORT
238 static void hn_start(struct ifnet *);
240 static int hn_transmit(struct ifnet *, struct mbuf *);
241 static void hn_xmit_qflush(struct ifnet *);
242 static int hn_ifmedia_upd(struct ifnet *);
243 static void hn_ifmedia_sts(struct ifnet *,
244 struct ifmediareq *);
246 static int hn_rndis_rxinfo(const void *, int,
248 static void hn_rndis_rx_data(struct hn_rx_ring *,
250 static void hn_rndis_rx_status(struct hn_softc *,
253 static void hn_nvs_handle_notify(struct hn_softc *,
254 const struct vmbus_chanpkt_hdr *);
255 static void hn_nvs_handle_comp(struct hn_softc *,
256 struct vmbus_channel *,
257 const struct vmbus_chanpkt_hdr *);
258 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *,
259 struct vmbus_channel *,
260 const struct vmbus_chanpkt_hdr *);
261 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *,
262 struct vmbus_channel *, uint64_t);
264 #if __FreeBSD_version >= 1100099
265 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
266 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
268 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
269 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
270 #if __FreeBSD_version < 1100095
271 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
273 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
275 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
276 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
277 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
278 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
279 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
280 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
281 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
282 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
283 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
284 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
285 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
286 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
287 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
288 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
289 static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
291 static void hn_stop(struct hn_softc *);
292 static void hn_init_locked(struct hn_softc *);
293 static int hn_chan_attach(struct hn_softc *,
294 struct vmbus_channel *);
295 static void hn_chan_detach(struct hn_softc *,
296 struct vmbus_channel *);
297 static int hn_attach_subchans(struct hn_softc *);
298 static void hn_detach_allchans(struct hn_softc *);
299 static void hn_chan_rollup(struct hn_rx_ring *,
300 struct hn_tx_ring *);
301 static void hn_set_ring_inuse(struct hn_softc *, int);
302 static int hn_synth_attach(struct hn_softc *, int);
303 static void hn_synth_detach(struct hn_softc *);
304 static int hn_synth_alloc_subchans(struct hn_softc *,
306 static bool hn_synth_attachable(const struct hn_softc *);
307 static void hn_suspend(struct hn_softc *);
308 static void hn_suspend_data(struct hn_softc *);
309 static void hn_suspend_mgmt(struct hn_softc *);
310 static void hn_resume(struct hn_softc *);
311 static void hn_resume_data(struct hn_softc *);
312 static void hn_resume_mgmt(struct hn_softc *);
313 static void hn_suspend_mgmt_taskfunc(void *, int);
314 static void hn_chan_drain(struct hn_softc *,
315 struct vmbus_channel *);
316 static void hn_polling(struct hn_softc *, u_int);
317 static void hn_chan_polling(struct vmbus_channel *, u_int);
319 static void hn_update_link_status(struct hn_softc *);
320 static void hn_change_network(struct hn_softc *);
321 static void hn_link_taskfunc(void *, int);
322 static void hn_netchg_init_taskfunc(void *, int);
323 static void hn_netchg_status_taskfunc(void *, int);
324 static void hn_link_status(struct hn_softc *);
326 static int hn_create_rx_data(struct hn_softc *, int);
327 static void hn_destroy_rx_data(struct hn_softc *);
328 static int hn_check_iplen(const struct mbuf *, int);
329 static int hn_set_rxfilter(struct hn_softc *, uint32_t);
330 static int hn_rxfilter_config(struct hn_softc *);
331 static int hn_rss_reconfig(struct hn_softc *);
332 static void hn_rss_ind_fixup(struct hn_softc *);
333 static int hn_rxpkt(struct hn_rx_ring *, const void *,
334 int, const struct hn_rxinfo *);
336 static int hn_tx_ring_create(struct hn_softc *, int);
337 static void hn_tx_ring_destroy(struct hn_tx_ring *);
338 static int hn_create_tx_data(struct hn_softc *, int);
339 static void hn_fixup_tx_data(struct hn_softc *);
340 static void hn_destroy_tx_data(struct hn_softc *);
341 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *);
342 static void hn_txdesc_gc(struct hn_tx_ring *,
344 static int hn_encap(struct ifnet *, struct hn_tx_ring *,
345 struct hn_txdesc *, struct mbuf **);
346 static int hn_txpkt(struct ifnet *, struct hn_tx_ring *,
348 static void hn_set_chim_size(struct hn_softc *, int);
349 static void hn_set_tso_maxsize(struct hn_softc *, int, int);
350 static bool hn_tx_ring_pending(struct hn_tx_ring *);
351 static void hn_tx_ring_qflush(struct hn_tx_ring *);
352 static void hn_resume_tx(struct hn_softc *, int);
353 static void hn_set_txagg(struct hn_softc *);
354 static void *hn_try_txagg(struct ifnet *,
355 struct hn_tx_ring *, struct hn_txdesc *,
357 static int hn_get_txswq_depth(const struct hn_tx_ring *);
358 static void hn_txpkt_done(struct hn_nvs_sendctx *,
359 struct hn_softc *, struct vmbus_channel *,
361 static int hn_txpkt_sglist(struct hn_tx_ring *,
363 static int hn_txpkt_chim(struct hn_tx_ring *,
365 static int hn_xmit(struct hn_tx_ring *, int);
366 static void hn_xmit_taskfunc(void *, int);
367 static void hn_xmit_txeof(struct hn_tx_ring *);
368 static void hn_xmit_txeof_taskfunc(void *, int);
369 #ifdef HN_IFSTART_SUPPORT
370 static int hn_start_locked(struct hn_tx_ring *, int);
371 static void hn_start_taskfunc(void *, int);
372 static void hn_start_txeof(struct hn_tx_ring *);
373 static void hn_start_txeof_taskfunc(void *, int);
376 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
377 "Hyper-V network interface");
379 /* Trust tcp segment verification on host side. */
380 static int hn_trust_hosttcp = 1;
381 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
382 &hn_trust_hosttcp, 0,
383 "Trust tcp segement verification on host side, "
384 "when csum info is missing (global setting)");
386 /* Trust udp datagram verification on host side. */
387 static int hn_trust_hostudp = 1;
388 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
389 &hn_trust_hostudp, 0,
390 "Trust udp datagram verification on host side, "
391 "when csum info is missing (global setting)");
393 /* Trust ip packet verification on host side. */
394 static int hn_trust_hostip = 1;
395 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
397 "Trust ip packet verification on host side, "
398 "when csum info is missing (global setting)");
400 /* Limit TSO burst size */
401 static int hn_tso_maxlen = IP_MAXPACKET;
402 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
403 &hn_tso_maxlen, 0, "TSO burst limit");
405 /* Limit chimney send size */
406 static int hn_tx_chimney_size = 0;
407 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
408 &hn_tx_chimney_size, 0, "Chimney send packet size limit");
410 /* Limit the size of packet for direct transmission */
411 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
412 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
413 &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
415 /* # of LRO entries per RX ring */
416 #if defined(INET) || defined(INET6)
417 #if __FreeBSD_version >= 1100095
418 static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
419 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
420 &hn_lro_entry_count, 0, "LRO entry count");
424 static int hn_tx_taskq_cnt = 1;
425 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
426 &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
428 #define HN_TX_TASKQ_M_INDEP 0
429 #define HN_TX_TASKQ_M_GLOBAL 1
430 #define HN_TX_TASKQ_M_EVTTQ 2
432 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
433 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
434 &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
435 "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
437 #ifndef HN_USE_TXDESC_BUFRING
438 static int hn_use_txdesc_bufring = 0;
440 static int hn_use_txdesc_bufring = 1;
442 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
443 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
445 #ifdef HN_IFSTART_SUPPORT
446 /* Use ifnet.if_start instead of ifnet.if_transmit */
447 static int hn_use_if_start = 0;
448 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
449 &hn_use_if_start, 0, "Use if_start TX method");
452 /* # of channels to use */
453 static int hn_chan_cnt = 0;
454 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
456 "# of channels to use; each channel has one RX ring and one TX ring");
458 /* # of transmit rings to use */
459 static int hn_tx_ring_cnt = 0;
460 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
461 &hn_tx_ring_cnt, 0, "# of TX rings to use");
463 /* Software TX ring depth */
464 static int hn_tx_swq_depth = 0;
465 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
466 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
468 /* Enable sorted LRO, and set the depth of the per-channel mbuf queue */
469 #if __FreeBSD_version >= 1100095
470 static u_int hn_lro_mbufq_depth = 0;
471 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
472 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
475 /* Packet transmission aggregation size limit */
476 static int hn_tx_agg_size = -1;
477 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
478 &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
480 /* Packet transmission aggregation count limit */
481 static int hn_tx_agg_pkts = -1;
482 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
483 &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
485 static u_int hn_cpu_index; /* next CPU for channel */
486 static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */
489 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
490 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
491 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
492 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
493 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
494 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
497 static device_method_t hn_methods[] = {
498 /* Device interface */
499 DEVMETHOD(device_probe, hn_probe),
500 DEVMETHOD(device_attach, hn_attach),
501 DEVMETHOD(device_detach, hn_detach),
502 DEVMETHOD(device_shutdown, hn_shutdown),
506 static driver_t hn_driver = {
509 sizeof(struct hn_softc)
512 static devclass_t hn_devclass;
514 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
515 MODULE_VERSION(hn, 1);
516 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
518 #if __FreeBSD_version >= 1100099
520 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
524 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
525 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
530 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
533 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
534 txd->chim_size == 0, ("invalid rndis sglist txd"));
535 return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
536 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
540 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
542 struct hn_nvs_rndis rndis;
544 KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
545 txd->chim_size > 0, ("invalid rndis chim txd"));
547 rndis.nvs_type = HN_NVS_TYPE_RNDIS;
548 rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
549 rndis.nvs_chim_idx = txd->chim_index;
550 rndis.nvs_chim_sz = txd->chim_size;
552 return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
553 &rndis, sizeof(rndis), &txd->send_ctx));
556 static __inline uint32_t
557 hn_chim_alloc(struct hn_softc *sc)
559 int i, bmap_cnt = sc->hn_chim_bmap_cnt;
560 u_long *bmap = sc->hn_chim_bmap;
561 uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
563 for (i = 0; i < bmap_cnt; ++i) {
566 idx = ffsl(~bmap[i]);
570 --idx; /* ffsl is 1-based */
571 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
572 ("invalid i %d and idx %d", i, idx));
574 if (atomic_testandset_long(&bmap[i], idx))
577 ret = i * LONG_BIT + idx;
584 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
589 idx = chim_idx / LONG_BIT;
590 KASSERT(idx < sc->hn_chim_bmap_cnt,
591 ("invalid chimney index 0x%x", chim_idx));
593 mask = 1UL << (chim_idx % LONG_BIT);
594 KASSERT(sc->hn_chim_bmap[idx] & mask,
595 ("index bitmap 0x%lx, chimney index %u, "
596 "bitmap idx %d, bitmask 0x%lx",
597 sc->hn_chim_bmap[idx], chim_idx, idx, mask));
599 atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
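/*
 * Usage sketch (see hn_try_txagg() and hn_txdesc_put() for the real
 * call sites); chimney slots are handed out lock-free from a bitmap:
 *
 *	idx = hn_chim_alloc(sc);
 *	if (idx != HN_NVS_CHIM_IDX_INVALID) {
 *		copy packet to sc->hn_chim + idx * sc->hn_chim_szmax;
 *		...
 *		hn_chim_free(sc, idx);	(once the send completes)
 *	}
 *
 * atomic_testandset_long() makes the allocation safe against racing TX
 * rings; on a lost race the loop simply moves on to another slot.
 */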
602 #if defined(INET6) || defined(INET)
604 * NOTE: If this function fails, m_head will be freed.
606 static __inline struct mbuf *
607 hn_tso_fixup(struct mbuf *m_head)
609 struct ether_vlan_header *evl;
613 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
615 #define PULLUP_HDR(m, len) \
617 if (__predict_false((m)->m_len < (len))) { \
618 (m) = m_pullup((m), (len)); \
624 PULLUP_HDR(m_head, sizeof(*evl));
625 evl = mtod(m_head, struct ether_vlan_header *);
626 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
627 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
629 ehlen = ETHER_HDR_LEN;
632 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
636 PULLUP_HDR(m_head, ehlen + sizeof(*ip));
637 ip = mtodo(m_head, ehlen);
638 iphlen = ip->ip_hl << 2;
640 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
641 th = mtodo(m_head, ehlen + iphlen);
645 th->th_sum = in_pseudo(ip->ip_src.s_addr,
646 ip->ip_dst.s_addr, htons(IPPROTO_TCP));
649 #if defined(INET6) && defined(INET)
656 PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
657 ip6 = mtodo(m_head, ehlen);
658 if (ip6->ip6_nxt != IPPROTO_TCP) {
663 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
664 th = mtodo(m_head, ehlen + sizeof(*ip6));
667 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
674 #endif /* INET6 || INET */
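/*
 * Usage sketch (assumed caller pattern; the real call site lives in the
 * transmit path, before hn_encap() runs):
 *
 *	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
 *		m_head = hn_tso_fixup(m_head);
 *		if (__predict_false(m_head == NULL)) {
 *			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 *			return (EIO);
 *		}
 *	}
 *
 * The TCP checksum is seeded with the pseudo header checksum _without_
 * the TCP length, which is what the host's LSOv2 offload expects.
 */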
677 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
683 if (sc->hn_rx_filter != filter) {
684 error = hn_rndis_set_rxfilter(sc, filter);
686 sc->hn_rx_filter = filter;
692 hn_rxfilter_config(struct hn_softc *sc)
694 struct ifnet *ifp = sc->hn_ifp;
699 if (ifp->if_flags & IFF_PROMISC) {
700 filter = NDIS_PACKET_TYPE_PROMISCUOUS;
702 filter = NDIS_PACKET_TYPE_DIRECTED;
703 if (ifp->if_flags & IFF_BROADCAST)
704 filter |= NDIS_PACKET_TYPE_BROADCAST;
705 /* TODO: support multicast list */
706 if ((ifp->if_flags & IFF_ALLMULTI) ||
707 !TAILQ_EMPTY(&ifp->if_multiaddrs))
708 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
710 return (hn_set_rxfilter(sc, filter));
714 hn_set_txagg(struct hn_softc *sc)
720 * Setup aggregation size.
722 if (sc->hn_agg_size < 0)
725 size = sc->hn_agg_size;
727 if (sc->hn_rndis_agg_size < size)
728 size = sc->hn_rndis_agg_size;
730 /* NOTE: We only aggregate packets using chimney sending buffers. */
731 if (size > (uint32_t)sc->hn_chim_szmax)
732 size = sc->hn_chim_szmax;
734 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
741 /* NOTE: Type of the per TX ring setting is 'int'. */
746 * Setup aggregation packet count.
748 if (sc->hn_agg_pkts < 0)
751 pkts = sc->hn_agg_pkts;
753 if (sc->hn_rndis_agg_pkts < pkts)
754 pkts = sc->hn_rndis_agg_pkts;
763 /* NOTE: Type of the per TX ring setting is 'short'. */
768 /* NOTE: Type of the per TX ring setting is 'short'. */
769 if (sc->hn_rndis_agg_align > SHRT_MAX) {
776 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
777 size, pkts, sc->hn_rndis_agg_align);
780 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
781 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
783 mtx_lock(&txr->hn_tx_lock);
784 txr->hn_agg_szmax = size;
785 txr->hn_agg_pktmax = pkts;
786 txr->hn_agg_align = sc->hn_rndis_agg_align;
787 mtx_unlock(&txr->hn_tx_lock);
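/*
 * In short, the effective aggregation limits end up as
 *
 *	size = min(hn_agg_size tunable, RNDIS offered size, chim szmax)
 *	pkts = min(hn_agg_pkts tunable, RNDIS offered count)
 *
 * (negative tunables mean "auto"), published to every TX ring under
 * hn_tx_lock, because the per-ring fields are narrower types
 * (int/short) than their softc counterparts.
 */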
792 hn_get_txswq_depth(const struct hn_tx_ring *txr)
795 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
796 if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
797 return txr->hn_txdesc_cnt;
798 return hn_tx_swq_depth;
802 hn_rss_reconfig(struct hn_softc *sc)
808 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
815 * Direct reconfiguration by setting the UNCHG flags does
816 * _not_ work properly.
819 if_printf(sc->hn_ifp, "disable RSS\n");
820 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
822 if_printf(sc->hn_ifp, "RSS disable failed\n");
827 * Reenable the RSS w/ the updated RSS key or indirect
831 if_printf(sc->hn_ifp, "reconfig RSS\n");
832 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
834 if_printf(sc->hn_ifp, "RSS reconfig failed\n");
841 hn_rss_ind_fixup(struct hn_softc *sc)
843 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
846 nchan = sc->hn_rx_ring_inuse;
847 KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
850 * Check indirect table to make sure that all channels in it
853 for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
854 if (rss->rss_ind[i] >= nchan) {
855 if_printf(sc->hn_ifp,
856 "RSS indirect table %d fixup: %u -> %d\n",
857 i, rss->rss_ind[i], nchan - 1);
858 rss->rss_ind[i] = nchan - 1;
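/*
 * Example: if the channel count drops from 4 to 2, any indirect table
 * entry still pointing at ring 2 or 3 is rewritten to ring 1
 * (nchan - 1), so no entry references a ring that is out of use.
 */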
864 hn_ifmedia_upd(struct ifnet *ifp __unused)
871 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
873 struct hn_softc *sc = ifp->if_softc;
875 ifmr->ifm_status = IFM_AVALID;
876 ifmr->ifm_active = IFM_ETHER;
878 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
879 ifmr->ifm_active |= IFM_NONE;
882 ifmr->ifm_status |= IFM_ACTIVE;
883 ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
886 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
887 static const struct hyperv_guid g_net_vsc_device_type = {
888 .hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
889 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
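/*
 * The first three GUID fields are stored little-endian, so the classic
 * form above maps to hv_guid[] as 63 51 61 F8 | 3E DF | c5 46, followed
 * by the remaining eight bytes in order.
 */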
893 hn_probe(device_t dev)
896 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
897 &g_net_vsc_device_type) == 0) {
898 device_set_desc(dev, "Hyper-V Network Interface");
899 return BUS_PROBE_DEFAULT;
905 hn_attach(device_t dev)
907 struct hn_softc *sc = device_get_softc(dev);
908 struct sysctl_oid_list *child;
909 struct sysctl_ctx_list *ctx;
910 uint8_t eaddr[ETHER_ADDR_LEN];
911 struct ifnet *ifp = NULL;
912 int error, ring_cnt, tx_ring_cnt;
915 sc->hn_prichan = vmbus_get_channel(dev);
919 * Initialize these tunables once.
921 sc->hn_agg_size = hn_tx_agg_size;
922 sc->hn_agg_pkts = hn_tx_agg_pkts;
925 * Setup taskqueue for transmission.
927 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
931 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
933 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
934 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
935 M_WAITOK, taskqueue_thread_enqueue,
936 &sc->hn_tx_taskqs[i]);
937 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
938 "%s tx%d", device_get_nameunit(dev), i);
940 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
941 sc->hn_tx_taskqs = hn_tx_taskque;
945 * Setup taskqueue for management tasks, e.g. link status.
947 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
948 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
949 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
950 device_get_nameunit(dev));
951 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
952 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
953 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
954 hn_netchg_status_taskfunc, sc);
957 * Allocate ifnet and set up its name early, so that if_printf
958 * can be used by functions that will be called after
961 ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
963 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
966 * Initialize ifmedia earlier so that it can be unconditionally
967 * destroyed, if an error happens later on.
969 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
972 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
973 * to use (tx_ring_cnt).
976 * The # of RX rings to use is same as the # of channels to use.
978 ring_cnt = hn_chan_cnt;
982 if (ring_cnt > HN_RING_CNT_DEF_MAX)
983 ring_cnt = HN_RING_CNT_DEF_MAX;
984 } else if (ring_cnt > mp_ncpus) {
988 tx_ring_cnt = hn_tx_ring_cnt;
989 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
990 tx_ring_cnt = ring_cnt;
991 #ifdef HN_IFSTART_SUPPORT
992 if (hn_use_if_start) {
993 /* ifnet.if_start only needs one TX ring. */
999 * Set the leader CPU for channels.
1001 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
1004 * Create enough TX/RX rings, even if only a limited number of
1005 * channels can be allocated.
1007 error = hn_create_tx_data(sc, tx_ring_cnt);
1010 error = hn_create_rx_data(sc, ring_cnt);
1015 * Create transaction context for NVS and RNDIS transactions.
1017 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
1018 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
1019 if (sc->hn_xact == NULL) {
1025 * Install orphan handler for the revocation of this device's
1029 * The processing order is critical here:
1030 * Install the orphan handler, _before_ testing whether this
1031 * device's primary channel has been revoked or not.
1033 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
1034 if (vmbus_chan_is_revoked(sc->hn_prichan)) {
1040 * Attach the synthetic parts, i.e. NVS and RNDIS.
1042 error = hn_synth_attach(sc, ETHERMTU);
1046 error = hn_rndis_get_eaddr(sc, eaddr);
1050 #if __FreeBSD_version >= 1100099
1051 if (sc->hn_rx_ring_inuse > 1) {
1053 * Reduce TCP segment aggregation limit for multiple
1054 * RX rings to increase ACK timeliness.
1056 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
1061 * Fixup TX stuffs after synthetic parts are attached.
1063 hn_fixup_tx_data(sc);
1065 ctx = device_get_sysctl_ctx(dev);
1066 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
1067 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
1068 &sc->hn_nvs_ver, 0, "NVS version");
1069 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
1070 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1071 hn_ndis_version_sysctl, "A", "NDIS version");
1072 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
1073 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1074 hn_caps_sysctl, "A", "capabilities");
1075 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
1076 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1077 hn_hwassist_sysctl, "A", "hwassist");
1078 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
1079 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1080 hn_rxfilter_sysctl, "A", "rxfilter");
1081 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
1082 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1083 hn_rss_hash_sysctl, "A", "RSS hash");
1084 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
1085 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
1086 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
1087 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1088 hn_rss_key_sysctl, "IU", "RSS key");
1089 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
1090 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1091 hn_rss_ind_sysctl, "IU", "RSS indirect table");
1092 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
1093 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
1094 "RNDIS offered packet transmission aggregation size limit");
1095 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
1096 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
1097 "RNDIS offered packet transmission aggregation count limit");
1098 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
1099 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
1100 "RNDIS packet transmission aggregation alignment");
1101 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
1102 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1103 hn_txagg_size_sysctl, "I",
1104 "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
1105 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
1106 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1107 hn_txagg_pkts_sysctl, "I",
1108 "Packet transmission aggregation packets, "
1109 "0 -- disable, -1 -- auto");
1110 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
1111 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1112 hn_polling_sysctl, "I",
1113 "Polling frequency: [100,1000000], 0 disable polling");
1116 * Setup the ifmedia, which has been initialized earlier.
1118 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
1119 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
1120 /* XXX ifmedia_set really should do this for us */
1121 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
1124 * Setup the ifnet for this interface.
1128 ifp->if_baudrate = IF_Gbps(10);
1130 /* if_baudrate is 32 bits on a 32-bit system. */
1131 ifp->if_baudrate = IF_Gbps(1);
1133 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1134 ifp->if_ioctl = hn_ioctl;
1135 ifp->if_init = hn_init;
1136 #ifdef HN_IFSTART_SUPPORT
1137 if (hn_use_if_start) {
1138 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
1140 ifp->if_start = hn_start;
1141 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
1142 ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
1143 IFQ_SET_READY(&ifp->if_snd);
1147 ifp->if_transmit = hn_transmit;
1148 ifp->if_qflush = hn_xmit_qflush;
1151 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
1153 /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
1154 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
1156 if (sc->hn_caps & HN_CAP_VLAN) {
1157 /* XXX not sure about VLAN_MTU. */
1158 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
1161 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
1162 if (ifp->if_hwassist & HN_CSUM_IP_MASK)
1163 ifp->if_capabilities |= IFCAP_TXCSUM;
1164 if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
1165 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
1166 if (sc->hn_caps & HN_CAP_TSO4) {
1167 ifp->if_capabilities |= IFCAP_TSO4;
1168 ifp->if_hwassist |= CSUM_IP_TSO;
1170 if (sc->hn_caps & HN_CAP_TSO6) {
1171 ifp->if_capabilities |= IFCAP_TSO6;
1172 ifp->if_hwassist |= CSUM_IP6_TSO;
1175 /* Enable all available capabilities by default. */
1176 ifp->if_capenable = ifp->if_capabilities;
1179 * Disable IPv6 TSO and TXCSUM by default, they still can
1180 * be enabled through SIOCSIFCAP.
1182 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
1183 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
1185 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
1186 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
1187 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
1188 ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
1191 ether_ifattach(ifp, eaddr);
1193 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
1194 if_printf(ifp, "TSO segcnt %u segsz %u\n",
1195 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
1198 /* Inform the upper layer about the long frame support. */
1199 ifp->if_hdrlen = sizeof(struct ether_vlan_header);
1202 * Kick off link status check.
1204 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
1205 hn_update_link_status(sc);
1209 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
1210 hn_synth_detach(sc);
1216 hn_detach(device_t dev)
1218 struct hn_softc *sc = device_get_softc(dev);
1219 struct ifnet *ifp = sc->hn_ifp;
1221 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
1223 * In case the vmbus missed the orphan handler
1226 vmbus_xact_ctx_orphan(sc->hn_xact);
1229 if (device_is_attached(dev)) {
1231 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1232 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1236 * hn_stop() only suspends data, so management
1237 * stuff has to be suspended manually here.
1239 hn_suspend_mgmt(sc);
1240 hn_synth_detach(sc);
1243 ether_ifdetach(ifp);
1246 ifmedia_removeall(&sc->hn_media);
1247 hn_destroy_rx_data(sc);
1248 hn_destroy_tx_data(sc);
1250 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
1253 for (i = 0; i < hn_tx_taskq_cnt; ++i)
1254 taskqueue_free(sc->hn_tx_taskqs[i]);
1255 free(sc->hn_tx_taskqs, M_DEVBUF);
1257 taskqueue_free(sc->hn_mgmt_taskq0);
1259 if (sc->hn_xact != NULL) {
1261 * Uninstall the orphan handler _before_ the xact is
1264 vmbus_chan_unset_orphan(sc->hn_prichan);
1265 vmbus_xact_ctx_destroy(sc->hn_xact);
1270 HN_LOCK_DESTROY(sc);
1275 hn_shutdown(device_t dev)
1282 hn_link_status(struct hn_softc *sc)
1284 uint32_t link_status;
1287 error = hn_rndis_get_linkstatus(sc, &link_status);
1289 /* XXX what to do? */
1293 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
1294 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
1296 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1297 if_link_state_change(sc->hn_ifp,
1298 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
1299 LINK_STATE_UP : LINK_STATE_DOWN);
1303 hn_link_taskfunc(void *xsc, int pending __unused)
1305 struct hn_softc *sc = xsc;
1307 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
1313 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
1315 struct hn_softc *sc = xsc;
1317 /* Prevent any link status checks from running. */
1318 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
1321 * Fake up a [link down --> link up] state change; a 5 second
1322 * delay is used, which closely simulates miibus reaction
1323 * upon link down event.
1325 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1326 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1327 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
1328 &sc->hn_netchg_status, 5 * hz);
1332 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
1334 struct hn_softc *sc = xsc;
1336 /* Re-allow link status checks. */
1337 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
1342 hn_update_link_status(struct hn_softc *sc)
1345 if (sc->hn_mgmt_taskq != NULL)
1346 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
1350 hn_change_network(struct hn_softc *sc)
1353 if (sc->hn_mgmt_taskq != NULL)
1354 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
1358 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
1359 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
1361 struct mbuf *m = *m_head;
1364 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
1366 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
1367 m, segs, nsegs, BUS_DMA_NOWAIT);
1368 if (error == EFBIG) {
1371 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
1375 *m_head = m = m_new;
1376 txr->hn_tx_collapsed++;
1378 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
1379 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
1382 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
1383 BUS_DMASYNC_PREWRITE);
1384 txd->flags |= HN_TXD_FLAG_DMAMAP;
1390 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
1393 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
1394 ("put an onlist txd %#x", txd->flags));
1395 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1396 ("put an onagg txd %#x", txd->flags));
1398 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1399 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
1402 if (!STAILQ_EMPTY(&txd->agg_list)) {
1403 struct hn_txdesc *tmp_txd;
1405 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
1408 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
1409 ("resursive aggregation on aggregated txdesc"));
1410 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
1411 ("not aggregated txdesc"));
1412 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1413 ("aggregated txdesc uses dmamap"));
1414 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1415 ("aggregated txdesc consumes "
1416 "chimney sending buffer"));
1417 KASSERT(tmp_txd->chim_size == 0,
1418 ("aggregated txdesc has non-zero "
1419 "chimney sending size"));
1421 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
1422 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
1423 freed = hn_txdesc_put(txr, tmp_txd);
1424 KASSERT(freed, ("failed to free aggregated txdesc"));
1428 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1429 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1430 ("chim txd uses dmamap"));
1431 hn_chim_free(txr->hn_sc, txd->chim_index);
1432 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1434 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
1435 bus_dmamap_sync(txr->hn_tx_data_dtag,
1436 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
1437 bus_dmamap_unload(txr->hn_tx_data_dtag,
1439 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
1442 if (txd->m != NULL) {
1447 txd->flags |= HN_TXD_FLAG_ONLIST;
1448 #ifndef HN_USE_TXDESC_BUFRING
1449 mtx_lock_spin(&txr->hn_txlist_spin);
1450 KASSERT(txr->hn_txdesc_avail >= 0 &&
1451 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
1452 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
1453 txr->hn_txdesc_avail++;
1454 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
1455 mtx_unlock_spin(&txr->hn_txlist_spin);
1456 #else /* HN_USE_TXDESC_BUFRING */
1458 atomic_add_int(&txr->hn_txdesc_avail, 1);
1460 buf_ring_enqueue(txr->hn_txdesc_br, txd);
1461 #endif /* !HN_USE_TXDESC_BUFRING */
1466 static __inline struct hn_txdesc *
1467 hn_txdesc_get(struct hn_tx_ring *txr)
1469 struct hn_txdesc *txd;
1471 #ifndef HN_USE_TXDESC_BUFRING
1472 mtx_lock_spin(&txr->hn_txlist_spin);
1473 txd = SLIST_FIRST(&txr->hn_txlist);
1475 KASSERT(txr->hn_txdesc_avail > 0,
1476 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
1477 txr->hn_txdesc_avail--;
1478 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
1480 mtx_unlock_spin(&txr->hn_txlist_spin);
1482 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
1486 #ifdef HN_USE_TXDESC_BUFRING
1488 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
1490 #endif /* HN_USE_TXDESC_BUFRING */
1491 KASSERT(txd->m == NULL && txd->refs == 0 &&
1492 STAILQ_EMPTY(&txd->agg_list) &&
1493 txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
1494 txd->chim_size == 0 &&
1495 (txd->flags & HN_TXD_FLAG_ONLIST) &&
1496 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
1497 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
1498 txd->flags &= ~HN_TXD_FLAG_ONLIST;
1504 static __inline void
1505 hn_txdesc_hold(struct hn_txdesc *txd)
1508 /* 0->1 transition will never work */
1509 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
1510 atomic_add_int(&txd->refs, 1);
1513 static __inline void
1514 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
1517 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1518 ("recursive aggregation on aggregating txdesc"));
1520 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1521 ("already aggregated"));
1522 KASSERT(STAILQ_EMPTY(&txd->agg_list),
1523 ("recursive aggregation on to-be-aggregated txdesc"));
1525 txd->flags |= HN_TXD_FLAG_ONAGG;
1526 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
1530 hn_tx_ring_pending(struct hn_tx_ring *txr)
1532 bool pending = false;
1534 #ifndef HN_USE_TXDESC_BUFRING
1535 mtx_lock_spin(&txr->hn_txlist_spin);
1536 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
1538 mtx_unlock_spin(&txr->hn_txlist_spin);
1540 if (!buf_ring_full(txr->hn_txdesc_br))
1546 static __inline void
1547 hn_txeof(struct hn_tx_ring *txr)
1549 txr->hn_has_txeof = 0;
1554 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
1555 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
1557 struct hn_txdesc *txd = sndc->hn_cbarg;
1558 struct hn_tx_ring *txr;
1561 KASSERT(txr->hn_chan == chan,
1562 ("channel mismatch, on chan%u, should be chan%u",
1563 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
1565 txr->hn_has_txeof = 1;
1566 hn_txdesc_put(txr, txd);
1568 ++txr->hn_txdone_cnt;
1569 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
1570 txr->hn_txdone_cnt = 0;
1571 if (txr->hn_oactive)
1577 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
1579 #if defined(INET) || defined(INET6)
1580 struct lro_ctrl *lro = &rxr->hn_lro;
1581 struct lro_entry *queued;
1583 while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
1584 SLIST_REMOVE_HEAD(&lro->lro_active, next);
1585 tcp_lro_flush(lro, queued);
1591 * 'txr' could be NULL, if multiple channels are used and
1592 * the ifnet.if_start method is enabled.
1594 if (txr == NULL || !txr->hn_has_txeof)
1597 txr->hn_txdone_cnt = 0;
1601 static __inline uint32_t
1602 hn_rndis_pktmsg_offset(uint32_t ofs)
1605 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1606 ("invalid RNDIS packet msg offset %u", ofs));
1607 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
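/*
 * RNDIS expresses in-message offsets relative to the end of the 8-byte
 * message header (rm_type + rm_len), i.e. from where rm_dataoffset
 * itself sits.  The driver builds offsets from the start of
 * rndis_packet_msg and converts them with this helper just before the
 * message is handed to the host (see hn_encap()).
 */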
1610 static __inline void *
1611 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1612 size_t pi_dlen, uint32_t pi_type)
1614 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
1615 struct rndis_pktinfo *pi;
1617 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
1618 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
1621 * Per-packet-info does not move; it only grows.
1624 * rm_pktinfooffset in this phase counts from the beginning
1625 * of rndis_packet_msg.
1627 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
1628 ("%u pktinfo overflows RNDIS packet msg", pi_type));
1629 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
1630 pkt->rm_pktinfolen);
1631 pkt->rm_pktinfolen += pi_size;
1633 pi->rm_size = pi_size;
1634 pi->rm_type = pi_type;
1635 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
1637 /* Data immediately follow per-packet-info. */
1638 pkt->rm_dataoffset += pi_size;
1640 /* Update RNDIS packet msg length */
1641 pkt->rm_len += pi_size;
1643 return (pi->rm_data);
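/*
 * Message layout while pktinfo is being appended; offsets still count
 * from the start of the message in this phase:
 *
 *	rm_pktinfooffset -+
 *	                  v
 *	[ header ....... ][ pi ][ pi ][ new pi ]<- data copied in later
 *	                  |<---- rm_pktinfolen --->|
 *
 * Every append grows rm_pktinfolen, rm_dataoffset and rm_len by the
 * aligned pktinfo size and returns the new pi's data area for the
 * caller to fill in.
 */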
1647 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
1649 struct hn_txdesc *txd;
1653 txd = txr->hn_agg_txd;
1654 KASSERT(txd != NULL, ("no aggregate txdesc"));
1657 * Since hn_txpkt() will reset this temporary stat, save
1658 * it now, so that oerrors can be updated properly if
1659 * hn_txpkt() ever fails.
1661 pkts = txr->hn_stat_pkts;
1664 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
1665 * failure, save it for later freeing, if hn_txpkt() ever
1669 error = hn_txpkt(ifp, txr, txd);
1670 if (__predict_false(error)) {
1671 /* txd is freed, but m is not. */
1674 txr->hn_flush_failed++;
1675 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
1678 /* Reset all aggregation states. */
1679 txr->hn_agg_txd = NULL;
1680 txr->hn_agg_szleft = 0;
1681 txr->hn_agg_pktleft = 0;
1682 txr->hn_agg_prevpkt = NULL;
1688 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1693 if (txr->hn_agg_txd != NULL) {
1694 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
1695 struct hn_txdesc *agg_txd = txr->hn_agg_txd;
1696 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
1700 * Update the previous RNDIS packet's total length;
1701 * it can be increased due to the mandatory alignment
1702 * padding for this RNDIS packet. And update the
1703 * aggregating txdesc's chimney sending buffer size
1707 * Zero-out the padding, as required by the RNDIS spec.
1710 pkt->rm_len = roundup2(olen, txr->hn_agg_align);
1711 agg_txd->chim_size += pkt->rm_len - olen;
1713 /* Link this txdesc to the parent. */
1714 hn_txdesc_agg(agg_txd, txd);
1716 chim = (uint8_t *)pkt + pkt->rm_len;
1717 /* Save the current packet for later fixup. */
1718 txr->hn_agg_prevpkt = chim;
1720 txr->hn_agg_pktleft--;
1721 txr->hn_agg_szleft -= pktsize;
1722 if (txr->hn_agg_szleft <=
1723 HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1725 * Probably can't aggregate more packets,
1726 * flush this aggregating txdesc proactively.
1728 txr->hn_agg_pktleft = 0;
1733 hn_flush_txagg(ifp, txr);
1735 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
1737 txr->hn_tx_chimney_tried++;
1738 txd->chim_index = hn_chim_alloc(txr->hn_sc);
1739 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
1741 txr->hn_tx_chimney++;
1743 chim = txr->hn_sc->hn_chim +
1744 (txd->chim_index * txr->hn_sc->hn_chim_szmax);
1746 if (txr->hn_agg_pktmax > 1 &&
1747 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1748 txr->hn_agg_txd = txd;
1749 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
1750 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
1751 txr->hn_agg_prevpkt = chim;
1758 * If this function fails, then both txd and m_head0 will be freed.
1761 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1762 struct mbuf **m_head0)
1764 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
1765 int error, nsegs, i;
1766 struct mbuf *m_head = *m_head0;
1767 struct rndis_packet_msg *pkt;
1770 int pkt_hlen, pkt_size;
1772 pkt = txd->rndis_pkt;
1773 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
1774 if (pkt_size < txr->hn_chim_size) {
1775 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
1779 if (txr->hn_agg_txd != NULL)
1780 hn_flush_txagg(ifp, txr);
1783 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1784 pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
1785 pkt->rm_dataoffset = sizeof(*pkt);
1786 pkt->rm_datalen = m_head->m_pkthdr.len;
1787 pkt->rm_oobdataoffset = 0;
1788 pkt->rm_oobdatalen = 0;
1789 pkt->rm_oobdataelements = 0;
1790 pkt->rm_pktinfooffset = sizeof(*pkt);
1791 pkt->rm_pktinfolen = 0;
1792 pkt->rm_vchandle = 0;
1793 pkt->rm_reserved = 0;
1795 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
1797 * Set the hash value for this packet, so that the host could
1798 * dispatch the TX done event for this packet back to this TX
1801 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1802 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
1803 *pi_data = txr->hn_tx_idx;
1806 if (m_head->m_flags & M_VLANTAG) {
1807 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1808 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1809 *pi_data = NDIS_VLAN_INFO_MAKE(
1810 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
1811 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
1812 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
1815 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
1816 #if defined(INET6) || defined(INET)
1817 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1818 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
1820 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
1821 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
1822 m_head->m_pkthdr.tso_segsz);
1825 #if defined(INET6) && defined(INET)
1830 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
1831 m_head->m_pkthdr.tso_segsz);
1834 #endif /* INET6 || INET */
1835 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
1836 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1837 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1838 if (m_head->m_pkthdr.csum_flags &
1839 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
1840 *pi_data = NDIS_TXCSUM_INFO_IPV6;
1842 *pi_data = NDIS_TXCSUM_INFO_IPV4;
1843 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
1844 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
1847 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
1848 *pi_data |= NDIS_TXCSUM_INFO_TCPCS;
1849 else if (m_head->m_pkthdr.csum_flags &
1850 (CSUM_IP_UDP | CSUM_IP6_UDP))
1851 *pi_data |= NDIS_TXCSUM_INFO_UDPCS;
1854 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1855 /* Convert RNDIS packet message offsets */
1856 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
1857 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
1860 * Fast path: Chimney sending.
1863 struct hn_txdesc *tgt_txd = txd;
1865 if (txr->hn_agg_txd != NULL) {
1866 tgt_txd = txr->hn_agg_txd;
1872 KASSERT(pkt == chim,
1873 ("RNDIS pkt not in chimney sending buffer"));
1874 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
1875 ("chimney sending buffer is not used"));
1876 tgt_txd->chim_size += pkt->rm_len;
1878 m_copydata(m_head, 0, m_head->m_pkthdr.len,
1879 ((uint8_t *)chim) + pkt_hlen);
1881 txr->hn_gpa_cnt = 0;
1882 txr->hn_sendpkt = hn_txpkt_chim;
1886 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
1887 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1888 ("chimney buffer is used"));
1889 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
1891 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
1892 if (__predict_false(error)) {
1896 * This mbuf is not linked w/ the txd yet, so free it now.
1901 freed = hn_txdesc_put(txr, txd);
1903 ("fail to free txd upon txdma error"));
1905 txr->hn_txdma_failed++;
1906 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
1911 /* +1 RNDIS packet message */
1912 txr->hn_gpa_cnt = nsegs + 1;
1914 /* send packet with page buffer */
1915 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
1916 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
1917 txr->hn_gpa[0].gpa_len = pkt_hlen;
1920 * Fill the page buffers with mbuf info after the page
1921 * buffer for RNDIS packet message.
1923 for (i = 0; i < nsegs; ++i) {
1924 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
1926 gpa->gpa_page = atop(segs[i].ds_addr);
1927 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
1928 gpa->gpa_len = segs[i].ds_len;
1931 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1933 txr->hn_sendpkt = hn_txpkt_sglist;
1937 /* Set the completion routine */
1938 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
1940 /* Update temporary stats for later use. */
1941 txr->hn_stat_pkts++;
1942 txr->hn_stat_size += m_head->m_pkthdr.len;
1943 if (m_head->m_flags & M_MCAST)
1944 txr->hn_stat_mcasts++;
1951 * If this function fails, then txd will be freed, but the mbuf
1952 * associated w/ the txd will _not_ be freed.
1955 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
1957 int error, send_failed = 0, has_bpf;
1960 has_bpf = bpf_peers_present(ifp->if_bpf);
1963 * Make sure that this txd and any aggregated txds are not
1964 * freed before ETHER_BPF_MTAP.
1966 hn_txdesc_hold(txd);
1968 error = txr->hn_sendpkt(txr, txd);
1971 const struct hn_txdesc *tmp_txd;
1973 ETHER_BPF_MTAP(ifp, txd->m);
1974 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
1975 ETHER_BPF_MTAP(ifp, tmp_txd->m);
1978 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
1979 #ifdef HN_IFSTART_SUPPORT
1980 if (!hn_use_if_start)
1983 if_inc_counter(ifp, IFCOUNTER_OBYTES,
1985 if (txr->hn_stat_mcasts != 0) {
1986 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
1987 txr->hn_stat_mcasts);
1990 txr->hn_pkts += txr->hn_stat_pkts;
1994 hn_txdesc_put(txr, txd);
1996 if (__predict_false(error)) {
2000 * This should "really rarely" happen.
2002 * XXX Too many RX to be acked or too many sideband
2003 * commands to run? Ask netvsc_channel_rollup()
2004 * to kick start later.
2006 txr->hn_has_txeof = 1;
2008 txr->hn_send_failed++;
2011 * Try sending again after setting hn_has_txeof;
2012 * in case that we missed the last
2013 * netvsc_channel_rollup().
2017 if_printf(ifp, "send failed\n");
2020 * Caller will perform further processing on the
2021 * associated mbuf, so don't free it in hn_txdesc_put();
2022 * only unload it from the DMA map in hn_txdesc_put(),
2026 freed = hn_txdesc_put(txr, txd);
2028 ("fail to free txd upon send error"));
2030 txr->hn_send_failed++;
2033 /* Reset temporary stats, after this sending is done. */
2034 txr->hn_stat_size = 0;
2035 txr->hn_stat_pkts = 0;
2036 txr->hn_stat_mcasts = 0;
2042 * Append the specified data to the indicated mbuf chain.
2043 * Extend the mbuf chain if the new data does not fit in
2046 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2047 * There should be an equivalent in the kernel mbuf code,
2048 * but there does not appear to be one yet.
2050 * Differs from m_append() in that additional mbufs are
2051 * allocated with cluster size MJUMPAGESIZE, and filled
2054 * Return 1 if able to complete the job; otherwise 0.
2057 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2060 int remainder, space;
2062 for (m = m0; m->m_next != NULL; m = m->m_next)
2065 space = M_TRAILINGSPACE(m);
2068 * Copy into available space.
2070 if (space > remainder)
2072 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2077 while (remainder > 0) {
2079 * Allocate a new mbuf; could check space
2080 * and allocate a cluster instead.
2082 n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
2085 n->m_len = min(MJUMPAGESIZE, remainder);
2086 bcopy(cp, mtod(n, caddr_t), n->m_len);
2088 remainder -= n->m_len;
2092 if (m0->m_flags & M_PKTHDR)
2093 m0->m_pkthdr.len += len - remainder;
2095 return (remainder == 0);
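/*
 * Usage sketch (see hn_rxpkt() below): larger RX packets are copied out
 * of the host's receive buffer into a jumbo-page cluster chain:
 *
 *	m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
 *	hv_m_append(m_new, dlen, data);
 *
 * A zero return means an mbuf allocation failed mid-copy and the chain
 * holds a truncated packet; the RX path currently does not check for
 * that.
 */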
2098 #if defined(INET) || defined(INET6)
2100 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
2102 #if __FreeBSD_version >= 1100095
2103 if (hn_lro_mbufq_depth) {
2104 tcp_lro_queue_mbuf(lc, m);
2108 return tcp_lro_rx(lc, m, 0);
2113 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2114 const struct hn_rxinfo *info)
2116 struct ifnet *ifp = rxr->hn_ifp;
2118 int size, do_lro = 0, do_csum = 1;
2119 int hash_type = M_HASHTYPE_OPAQUE;
2121 if (dlen <= MHLEN) {
2122 m_new = m_gethdr(M_NOWAIT, MT_DATA);
2123 if (m_new == NULL) {
2124 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2127 memcpy(mtod(m_new, void *), data, dlen);
2128 m_new->m_pkthdr.len = m_new->m_len = dlen;
2129 rxr->hn_small_pkts++;
2132 * Get an mbuf with a cluster. For packets 2K or less,
2133 * get a standard 2K cluster. For anything larger, get a
2134 * 4K cluster. Any buffers larger than 4K can cause problems
2135 * if looped around to the Hyper-V TX channel, so avoid them.
2138 if (dlen > MCLBYTES) {
2140 size = MJUMPAGESIZE;
2143 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2144 if (m_new == NULL) {
2145 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2149 hv_m_append(m_new, dlen, data);
2151 m_new->m_pkthdr.rcvif = ifp;
2153 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2156 /* receive side checksum offload */
2157 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2158 /* IP csum offload */
2159 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2160 m_new->m_pkthdr.csum_flags |=
2161 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2165 /* TCP/UDP csum offload */
2166 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2167 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2168 m_new->m_pkthdr.csum_flags |=
2169 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2170 m_new->m_pkthdr.csum_data = 0xffff;
2171 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2179 * As of this writing (Oct 28th, 2016), the host will turn
2180 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2181 * the do_lro setting here is actually _not_ accurate. We
2182 * depend on the RSS hash type check to reset do_lro.
2184 if ((info->csum_info &
2185 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2186 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2189 const struct ether_header *eh;
2194 if (m_new->m_len < hoff)
2196 eh = mtod(m_new, struct ether_header *);
2197 etype = ntohs(eh->ether_type);
2198 if (etype == ETHERTYPE_VLAN) {
2199 const struct ether_vlan_header *evl;
2201 hoff = sizeof(*evl);
2202 if (m_new->m_len < hoff)
2204 evl = mtod(m_new, struct ether_vlan_header *);
2205 etype = ntohs(evl->evl_proto);
2208 if (etype == ETHERTYPE_IP) {
2211 pr = hn_check_iplen(m_new, hoff);
2212 if (pr == IPPROTO_TCP) {
2214 (rxr->hn_trust_hcsum &
2215 HN_TRUST_HCSUM_TCP)) {
2216 rxr->hn_csum_trusted++;
2217 m_new->m_pkthdr.csum_flags |=
2218 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2219 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2220 m_new->m_pkthdr.csum_data = 0xffff;
2223 } else if (pr == IPPROTO_UDP) {
2225 (rxr->hn_trust_hcsum &
2226 HN_TRUST_HCSUM_UDP)) {
2227 rxr->hn_csum_trusted++;
2228 m_new->m_pkthdr.csum_flags |=
2229 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2230 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2231 m_new->m_pkthdr.csum_data = 0xffff;
2233 } else if (pr != IPPROTO_DONE && do_csum &&
2234 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2235 rxr->hn_csum_trusted++;
2236 m_new->m_pkthdr.csum_flags |=
2237 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2242 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2243 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2244 NDIS_VLAN_INFO_ID(info->vlan_info),
2245 NDIS_VLAN_INFO_PRI(info->vlan_info),
2246 NDIS_VLAN_INFO_CFI(info->vlan_info));
2247 m_new->m_flags |= M_VLANTAG;
2250 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2252 m_new->m_pkthdr.flowid = info->hash_value;
2253 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2254 NDIS_HASH_FUNCTION_TOEPLITZ) {
2255 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2259 * do_lro is reset if the hash types are not TCP
2260 * related. See the comment in the above csum_flags
2264 case NDIS_HASH_IPV4:
2265 hash_type = M_HASHTYPE_RSS_IPV4;
2269 case NDIS_HASH_TCP_IPV4:
2270 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2273 case NDIS_HASH_IPV6:
2274 hash_type = M_HASHTYPE_RSS_IPV6;
2278 case NDIS_HASH_IPV6_EX:
2279 hash_type = M_HASHTYPE_RSS_IPV6_EX;
2283 case NDIS_HASH_TCP_IPV6:
2284 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2287 case NDIS_HASH_TCP_IPV6_EX:
2288 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
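		/* No RSS hash info from the host; use the RX ring index as the flowid. */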
2293 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2295 M_HASHTYPE_SET(m_new, hash_type);
2298 * Note: Moved RX completion back to hv_nv_on_receive() so all
2299 * messages (not just data messages) will trigger a response.
2305 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2306 #if defined(INET) || defined(INET6)
2307 struct lro_ctrl *lro = &rxr->hn_lro;
2310 rxr->hn_lro_tried++;
2311 if (hn_lro_rx(lro, m_new) == 0) {
2319 /* We're not holding the lock here, so don't release it */
2320 (*ifp->if_input)(ifp, m_new);
2326 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2328 struct hn_softc *sc = ifp->if_softc;
2329 struct ifreq *ifr = (struct ifreq *)data;
2330 int mask, error = 0;
2334 if (ifr->ifr_mtu > HN_MTU_MAX) {
2341 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2346 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2347 /* Can't change MTU */
2353 if (ifp->if_mtu == ifr->ifr_mtu) {
2359 * Suspend this interface before the synthetic parts
2365 * Detach the synthetic parts, i.e. NVS and RNDIS.
2367 hn_synth_detach(sc);
2370 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2371 * with the new MTU setting.
2373 error = hn_synth_attach(sc, ifr->ifr_mtu);
2380 * Commit the requested MTU, after the synthetic parts
2381 * have been successfully attached.
2383 ifp->if_mtu = ifr->ifr_mtu;
2386 * Make sure that various parameters based on MTU are
2387 * still valid, after the MTU change.
2389 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2390 hn_set_chim_size(sc, sc->hn_chim_szmax);
2391 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2392 #if __FreeBSD_version >= 1100099
2393 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2394 HN_LRO_LENLIM_MIN(ifp))
2395 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2399 * All done! Resume the interface now.
2409 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2414 if (ifp->if_flags & IFF_UP) {
2415 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2417 * Caller might hold a mutex, e.g.
2418 * bpf; use busy-wait for the RNDIS
2422 hn_rxfilter_config(sc);
2428 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2431 sc->hn_if_flags = ifp->if_flags;
2438 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2440 if (mask & IFCAP_TXCSUM) {
2441 ifp->if_capenable ^= IFCAP_TXCSUM;
2442 if (ifp->if_capenable & IFCAP_TXCSUM)
2443 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2445 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2447 if (mask & IFCAP_TXCSUM_IPV6) {
2448 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2449 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2450 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2452 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2455 /* TODO: flip RNDIS offload parameters for RXCSUM. */
2456 if (mask & IFCAP_RXCSUM)
2457 ifp->if_capenable ^= IFCAP_RXCSUM;
2459 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
2460 if (mask & IFCAP_RXCSUM_IPV6)
2461 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2464 if (mask & IFCAP_LRO)
2465 ifp->if_capenable ^= IFCAP_LRO;
2467 if (mask & IFCAP_TSO4) {
2468 ifp->if_capenable ^= IFCAP_TSO4;
2469 if (ifp->if_capenable & IFCAP_TSO4)
2470 ifp->if_hwassist |= CSUM_IP_TSO;
2472 ifp->if_hwassist &= ~CSUM_IP_TSO;
2474 if (mask & IFCAP_TSO6) {
2475 ifp->if_capenable ^= IFCAP_TSO6;
2476 if (ifp->if_capenable & IFCAP_TSO6)
2477 ifp->if_hwassist |= CSUM_IP6_TSO;
2479 ifp->if_hwassist &= ~CSUM_IP6_TSO;
2489 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2493 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2495 * Multicast uses a mutex; use busy-wait for
2499 hn_rxfilter_config(sc);
2508 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2512 error = ether_ioctl(ifp, cmd, data);
2519 hn_stop(struct hn_softc *sc)
2521 struct ifnet *ifp = sc->hn_ifp;
2526 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2527 ("synthetic parts were not attached"));
2529 /* Disable polling. */
2532 /* Clear RUNNING bit _before_ hn_suspend_data() */
2533 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2534 hn_suspend_data(sc);
2536 /* Clear OACTIVE bit. */
2537 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2538 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2539 sc->hn_tx_ring[i].hn_oactive = 0;
2543 hn_init_locked(struct hn_softc *sc)
2545 struct ifnet *ifp = sc->hn_ifp;
2550 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2553 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2556 /* Configure RX filter */
2557 hn_rxfilter_config(sc);
2559 /* Clear OACTIVE bit. */
2560 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2561 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2562 sc->hn_tx_ring[i].hn_oactive = 0;
2564 /* Clear TX 'suspended' bit. */
2565 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2567 /* Everything is ready; unleash! */
2568 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2570 /* Re-enable polling if requested. */
2571 if (sc->hn_pollhz > 0)
2572 hn_polling(sc, sc->hn_pollhz);
2578 struct hn_softc *sc = xsc;
2585 #if __FreeBSD_version >= 1100099
2588 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2590 struct hn_softc *sc = arg1;
2591 unsigned int lenlim;
2594 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2595 error = sysctl_handle_int(oidp, &lenlim, 0, req);
2596 if (error || req->newptr == NULL)
2600 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2601 lenlim > TCP_LRO_LENGTH_MAX) {
2605 hn_set_lro_lenlim(sc, lenlim);
2612 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2614 struct hn_softc *sc = arg1;
2615 int ackcnt, error, i;
2618 * lro_ackcnt_lim is the append count limit;
2619 * +1 turns it into the aggregation limit.
2621 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2622 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2623 if (error || req->newptr == NULL)
2626 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2630 * Convert aggregation limit back to append
2635 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2636 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2644 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2646 struct hn_softc *sc = arg1;
2651 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2654 error = sysctl_handle_int(oidp, &on, 0, req);
2655 if (error || req->newptr == NULL)
2659 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2660 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2663 rxr->hn_trust_hcsum |= hcsum;
2665 rxr->hn_trust_hcsum &= ~hcsum;
2672 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2674 struct hn_softc *sc = arg1;
2675 int chim_size, error;
2677 chim_size = sc->hn_tx_ring[0].hn_chim_size;
2678 error = sysctl_handle_int(oidp, &chim_size, 0, req);
2679 if (error || req->newptr == NULL)
2682 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2686 hn_set_chim_size(sc, chim_size);
2691 #if __FreeBSD_version < 1100095
2693 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2695 struct hn_softc *sc = arg1;
2696 int ofs = arg2, i, error;
2697 struct hn_rx_ring *rxr;
2701 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2702 rxr = &sc->hn_rx_ring[i];
2703 stat += *((int *)((uint8_t *)rxr + ofs));
2706 error = sysctl_handle_64(oidp, &stat, 0, req);
2707 if (error || req->newptr == NULL)
2710 /* Zero out this stat. */
2711 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2712 rxr = &sc->hn_rx_ring[i];
2713 *((int *)((uint8_t *)rxr + ofs)) = 0;
2719 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2721 struct hn_softc *sc = arg1;
2722 int ofs = arg2, i, error;
2723 struct hn_rx_ring *rxr;
2727 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2728 rxr = &sc->hn_rx_ring[i];
2729 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2732 error = sysctl_handle_64(oidp, &stat, 0, req);
2733 if (error || req->newptr == NULL)
2736 /* Zero out this stat. */
2737 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2738 rxr = &sc->hn_rx_ring[i];
2739 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2747 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2749 struct hn_softc *sc = arg1;
2750 int ofs = arg2, i, error;
2751 struct hn_rx_ring *rxr;
2755 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2756 rxr = &sc->hn_rx_ring[i];
2757 stat += *((u_long *)((uint8_t *)rxr + ofs));
2760 error = sysctl_handle_long(oidp, &stat, 0, req);
2761 if (error || req->newptr == NULL)
2764 /* Zero out this stat. */
2765 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2766 rxr = &sc->hn_rx_ring[i];
2767 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
2773 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2775 struct hn_softc *sc = arg1;
2776 int ofs = arg2, i, error;
2777 struct hn_tx_ring *txr;
2781 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2782 txr = &sc->hn_tx_ring[i];
2783 stat += *((u_long *)((uint8_t *)txr + ofs));
2786 error = sysctl_handle_long(oidp, &stat, 0, req);
2787 if (error || req->newptr == NULL)
2790 /* Zero out this stat. */
2791 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2792 txr = &sc->hn_tx_ring[i];
2793 *((u_long *)((uint8_t *)txr + ofs)) = 0;
2799 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2801 struct hn_softc *sc = arg1;
2802 int ofs = arg2, i, error, conf;
2803 struct hn_tx_ring *txr;
2805 txr = &sc->hn_tx_ring[0];
2806 conf = *((int *)((uint8_t *)txr + ofs));
2808 error = sysctl_handle_int(oidp, &conf, 0, req);
2809 if (error || req->newptr == NULL)
2813 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2814 txr = &sc->hn_tx_ring[i];
2815 *((int *)((uint8_t *)txr + ofs)) = conf;
2823 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2825 struct hn_softc *sc = arg1;
2828 size = sc->hn_agg_size;
2829 error = sysctl_handle_int(oidp, &size, 0, req);
2830 if (error || req->newptr == NULL)
2834 sc->hn_agg_size = size;
2842 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
2844 struct hn_softc *sc = arg1;
2847 pkts = sc->hn_agg_pkts;
2848 error = sysctl_handle_int(oidp, &pkts, 0, req);
2849 if (error || req->newptr == NULL)
2853 sc->hn_agg_pkts = pkts;
2861 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
2863 struct hn_softc *sc = arg1;
2866 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
2867 return (sysctl_handle_int(oidp, &pkts, 0, req));
2871 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
2873 struct hn_softc *sc = arg1;
2876 align = sc->hn_tx_ring[0].hn_agg_align;
2877 return (sysctl_handle_int(oidp, &align, 0, req));
2881 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
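	/* A poll frequency of 0 disables polling on the channel. */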
2884 vmbus_chan_poll_disable(chan);
2886 vmbus_chan_poll_enable(chan, pollhz);
2890 hn_polling(struct hn_softc *sc, u_int pollhz)
2892 int nsubch = sc->hn_rx_ring_inuse - 1;
2897 struct vmbus_channel **subch;
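		/* Configure polling on the sub-channels first, then on the primary channel. */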
2900 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
2901 for (i = 0; i < nsubch; ++i)
2902 hn_chan_polling(subch[i], pollhz);
2903 vmbus_subchan_rel(subch, nsubch);
2905 hn_chan_polling(sc->hn_prichan, pollhz);
2909 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
2911 struct hn_softc *sc = arg1;
2914 pollhz = sc->hn_pollhz;
2915 error = sysctl_handle_int(oidp, &pollhz, 0, req);
2916 if (error || req->newptr == NULL)
2920 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
2924 if (sc->hn_pollhz != pollhz) {
2925 sc->hn_pollhz = pollhz;
2926 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
2927 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
2928 hn_polling(sc, sc->hn_pollhz);
2936 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2938 struct hn_softc *sc = arg1;
2941 snprintf(verstr, sizeof(verstr), "%u.%u",
2942 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2943 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2944 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2948 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2950 struct hn_softc *sc = arg1;
2957 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2958 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2962 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2964 struct hn_softc *sc = arg1;
2965 char assist_str[128];
2969 hwassist = sc->hn_ifp->if_hwassist;
2971 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2972 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2976 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2978 struct hn_softc *sc = arg1;
2979 char filter_str[128];
2983 filter = sc->hn_rx_filter;
2985 snprintf(filter_str, sizeof(filter_str), "%b", filter,
2987 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
2991 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2993 struct hn_softc *sc = arg1;
2998 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2999 if (error || req->newptr == NULL)
3002 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3005 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3007 if (sc->hn_rx_ring_inuse > 1) {
3008 error = hn_rss_reconfig(sc);
3010 /* Not RSS capable, at least for now; just save the RSS key. */
3019 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
3021 struct hn_softc *sc = arg1;
3026 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3027 if (error || req->newptr == NULL)
3031 * Don't allow the RSS indirect table to be changed if this
3032 * interface is not currently RSS capable.
3034 if (sc->hn_rx_ring_inuse == 1) {
3039 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3042 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3044 hn_rss_ind_fixup(sc);
3045 error = hn_rss_reconfig(sc);
3052 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
3054 struct hn_softc *sc = arg1;
3059 hash = sc->hn_rss_hash;
3061 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
3062 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
3066 hn_check_iplen(const struct mbuf *m, int hoff)
3068 const struct ip *ip;
3069 int len, iphlen, iplen;
3070 const struct tcphdr *th;
3071 int thoff; /* TCP data offset */
3073 len = hoff + sizeof(struct ip);
3075 /* The packet must be at least the size of an IP header. */
3076 if (m->m_pkthdr.len < len)
3077 return IPPROTO_DONE;
3079 /* The fixed IP header must reside completely in the first mbuf. */
3081 return IPPROTO_DONE;
3083 ip = mtodo(m, hoff);
3085 /* Bound check the packet's stated IP header length. */
3086 iphlen = ip->ip_hl << 2;
3087 if (iphlen < sizeof(struct ip)) /* minimum header length */
3088 return IPPROTO_DONE;
3090 /* The full IP header must reside completely in the one mbuf. */
3091 if (m->m_len < hoff + iphlen)
3092 return IPPROTO_DONE;
3094 iplen = ntohs(ip->ip_len);
3097 * Check that the amount of data in the buffers is at least
3098 * as much as the IP header would have us expect.
3100 if (m->m_pkthdr.len < hoff + iplen)
3101 return IPPROTO_DONE;
3104 * Ignore IP fragments.
3106 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3107 return IPPROTO_DONE;
3110 * The TCP/IP or UDP/IP header must be entirely contained within
3111 * the first fragment of a packet.
3115 if (iplen < iphlen + sizeof(struct tcphdr))
3116 return IPPROTO_DONE;
3117 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3118 return IPPROTO_DONE;
3119 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3120 thoff = th->th_off << 2;
3121 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3122 return IPPROTO_DONE;
3123 if (m->m_len < hoff + iphlen + thoff)
3124 return IPPROTO_DONE;
3127 if (iplen < iphlen + sizeof(struct udphdr))
3128 return IPPROTO_DONE;
3129 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3130 return IPPROTO_DONE;
3134 return IPPROTO_DONE;
3141 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3143 struct sysctl_oid_list *child;
3144 struct sysctl_ctx_list *ctx;
3145 device_t dev = sc->hn_dev;
3146 #if defined(INET) || defined(INET6)
3147 #if __FreeBSD_version >= 1100095
3154 * Create RXBUF for reception.
3157 * - It is shared by all channels.
3158 * - A large enough buffer is allocated; certain versions of NVS
3159 * may further limit the usable space.
3161 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3162 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3163 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3164 if (sc->hn_rxbuf == NULL) {
3165 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3169 sc->hn_rx_ring_cnt = ring_cnt;
3170 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3172 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3173 M_DEVBUF, M_WAITOK | M_ZERO);
3175 #if defined(INET) || defined(INET6)
3176 #if __FreeBSD_version >= 1100095
3177 lroent_cnt = hn_lro_entry_count;
3178 if (lroent_cnt < TCP_LRO_ENTRIES)
3179 lroent_cnt = TCP_LRO_ENTRIES;
3181 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3183 #endif /* INET || INET6 */
3185 ctx = device_get_sysctl_ctx(dev);
3186 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3188 /* Create dev.hn.UNIT.rx sysctl tree */
3189 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3190 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3192 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3193 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
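		/* Allocate the bufring for this channel; it backs both the TX and RX directions. */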
3195 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3196 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3197 &rxr->hn_br_dma, BUS_DMA_WAITOK);
3198 if (rxr->hn_br == NULL) {
3199 device_printf(dev, "allocate bufring failed\n");
3203 if (hn_trust_hosttcp)
3204 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3205 if (hn_trust_hostudp)
3206 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3207 if (hn_trust_hostip)
3208 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3209 rxr->hn_ifp = sc->hn_ifp;
3210 if (i < sc->hn_tx_ring_cnt)
3211 rxr->hn_txr = &sc->hn_tx_ring[i];
3212 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3213 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3215 rxr->hn_rxbuf = sc->hn_rxbuf;
3220 #if defined(INET) || defined(INET6)
3221 #if __FreeBSD_version >= 1100095
3222 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3223 hn_lro_mbufq_depth);
3225 tcp_lro_init(&rxr->hn_lro);
3226 rxr->hn_lro.ifp = sc->hn_ifp;
3228 #if __FreeBSD_version >= 1100099
3229 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3230 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3232 #endif /* INET || INET6 */
3234 if (sc->hn_rx_sysctl_tree != NULL) {
3238 * Create per RX ring sysctl tree:
3239 * dev.hn.UNIT.rx.RINGID
3241 snprintf(name, sizeof(name), "%d", i);
3242 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3243 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3244 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3246 if (rxr->hn_rx_sysctl_tree != NULL) {
3247 SYSCTL_ADD_ULONG(ctx,
3248 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3249 OID_AUTO, "packets", CTLFLAG_RW,
3250 &rxr->hn_pkts, "# of packets received");
3251 SYSCTL_ADD_ULONG(ctx,
3252 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3253 OID_AUTO, "rss_pkts", CTLFLAG_RW,
3255 "# of packets w/ RSS info received");
3257 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3258 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3259 &rxr->hn_pktbuf_len, 0,
3260 "Temporary channel packet buffer length");
3265 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3266 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3267 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3268 #if __FreeBSD_version < 1100095
3269 hn_rx_stat_int_sysctl,
3271 hn_rx_stat_u64_sysctl,
3273 "LU", "LRO queued");
3274 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3275 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3276 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3277 #if __FreeBSD_version < 1100095
3278 hn_rx_stat_int_sysctl,
3280 hn_rx_stat_u64_sysctl,
3282 "LU", "LRO flushed");
3283 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3284 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3285 __offsetof(struct hn_rx_ring, hn_lro_tried),
3286 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3287 #if __FreeBSD_version >= 1100099
3288 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3289 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3290 hn_lro_lenlim_sysctl, "IU",
3291 "Max # of data bytes to be aggregated by LRO");
3292 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3293 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3294 hn_lro_ackcnt_sysctl, "I",
3295 "Max # of ACKs to be aggregated by LRO");
3297 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3298 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3299 hn_trust_hcsum_sysctl, "I",
3300 "Trust tcp segement verification on host side, "
3301 "when csum info is missing");
3302 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3303 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3304 hn_trust_hcsum_sysctl, "I",
3305 "Trust udp datagram verification on host side, "
3306 "when csum info is missing");
3307 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3308 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3309 hn_trust_hcsum_sysctl, "I",
3310 "Trust ip packet verification on host side, "
3311 "when csum info is missing");
3312 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3313 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3314 __offsetof(struct hn_rx_ring, hn_csum_ip),
3315 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3316 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3317 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3318 __offsetof(struct hn_rx_ring, hn_csum_tcp),
3319 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3320 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3321 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3322 __offsetof(struct hn_rx_ring, hn_csum_udp),
3323 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3324 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3325 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3326 __offsetof(struct hn_rx_ring, hn_csum_trusted),
3327 hn_rx_stat_ulong_sysctl, "LU",
3328 "# of packets that we trust host's csum verification");
3329 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3330 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3331 __offsetof(struct hn_rx_ring, hn_small_pkts),
3332 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3333 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3334 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3335 __offsetof(struct hn_rx_ring, hn_ack_failed),
3336 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3337 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3338 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3339 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3340 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3346 hn_destroy_rx_data(struct hn_softc *sc)
3350 if (sc->hn_rxbuf != NULL) {
3351 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3352 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3354 device_printf(sc->hn_dev, "RXBUF is referenced\n");
3355 sc->hn_rxbuf = NULL;
3358 if (sc->hn_rx_ring_cnt == 0)
3361 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3362 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3364 if (rxr->hn_br == NULL)
3366 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3367 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3369 device_printf(sc->hn_dev,
3370 "%dth channel bufring is referenced", i);
3374 #if defined(INET) || defined(INET6)
3375 tcp_lro_free(&rxr->hn_lro);
3377 free(rxr->hn_pktbuf, M_DEVBUF);
3379 free(sc->hn_rx_ring, M_DEVBUF);
3380 sc->hn_rx_ring = NULL;
3382 sc->hn_rx_ring_cnt = 0;
3383 sc->hn_rx_ring_inuse = 0;
3387 hn_tx_ring_create(struct hn_softc *sc, int id)
3389 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3390 device_t dev = sc->hn_dev;
3391 bus_dma_tag_t parent_dtag;
3395 txr->hn_tx_idx = id;
3397 #ifndef HN_USE_TXDESC_BUFRING
3398 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3400 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3402 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3403 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3404 M_DEVBUF, M_WAITOK | M_ZERO);
3405 #ifndef HN_USE_TXDESC_BUFRING
3406 SLIST_INIT(&txr->hn_txlist);
3408 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3409 M_WAITOK, &txr->hn_tx_lock);
3412 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3413 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3414 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3416 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3419 #ifdef HN_IFSTART_SUPPORT
3420 if (hn_use_if_start) {
3421 txr->hn_txeof = hn_start_txeof;
3422 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3423 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3429 txr->hn_txeof = hn_xmit_txeof;
3430 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3431 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3433 br_depth = hn_get_txswq_depth(txr);
3434 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3435 M_WAITOK, &txr->hn_tx_lock);
3438 txr->hn_direct_tx_size = hn_direct_tx_size;
3441 * Always schedule transmission instead of trying to do direct
3442 * transmission. This one gives the best performance so far.
3444 txr->hn_sched_tx = 1;
3446 parent_dtag = bus_get_dma_tag(dev);
3448 /* DMA tag for RNDIS packet messages. */
3449 error = bus_dma_tag_create(parent_dtag, /* parent */
3450 HN_RNDIS_PKT_ALIGN, /* alignment */
3451 HN_RNDIS_PKT_BOUNDARY, /* boundary */
3452 BUS_SPACE_MAXADDR, /* lowaddr */
3453 BUS_SPACE_MAXADDR, /* highaddr */
3454 NULL, NULL, /* filter, filterarg */
3455 HN_RNDIS_PKT_LEN, /* maxsize */
3457 HN_RNDIS_PKT_LEN, /* maxsegsize */
3459 NULL, /* lockfunc */
3460 NULL, /* lockfuncarg */
3461 &txr->hn_tx_rndis_dtag);
3463 device_printf(dev, "failed to create rndis dmatag\n");
3467 /* DMA tag for data. */
3468 error = bus_dma_tag_create(parent_dtag, /* parent */
3470 HN_TX_DATA_BOUNDARY, /* boundary */
3471 BUS_SPACE_MAXADDR, /* lowaddr */
3472 BUS_SPACE_MAXADDR, /* highaddr */
3473 NULL, NULL, /* filter, filterarg */
3474 HN_TX_DATA_MAXSIZE, /* maxsize */
3475 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
3476 HN_TX_DATA_SEGSIZE, /* maxsegsize */
3478 NULL, /* lockfunc */
3479 NULL, /* lockfuncarg */
3480 &txr->hn_tx_data_dtag);
3482 device_printf(dev, "failed to create data dmatag\n");
3486 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3487 struct hn_txdesc *txd = &txr->hn_txdesc[i];
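		/* No chimney sending buffer is associated with this txd yet. */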
3490 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3491 STAILQ_INIT(&txd->agg_list);
3494 * Allocate and load RNDIS packet message.
3496 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3497 (void **)&txd->rndis_pkt,
3498 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3499 &txd->rndis_pkt_dmap);
3502 "failed to allocate rndis_packet_msg, %d\n", i);
3506 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3507 txd->rndis_pkt_dmap,
3508 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3509 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3513 "failed to load rndis_packet_msg, %d\n", i);
3514 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3515 txd->rndis_pkt, txd->rndis_pkt_dmap);
3519 /* DMA map for TX data. */
3520 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3524 "failed to allocate tx data dmamap\n");
3525 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3526 txd->rndis_pkt_dmap);
3527 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3528 txd->rndis_pkt, txd->rndis_pkt_dmap);
3532 /* All set, put it to list */
3533 txd->flags |= HN_TXD_FLAG_ONLIST;
3534 #ifndef HN_USE_TXDESC_BUFRING
3535 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3537 buf_ring_enqueue(txr->hn_txdesc_br, txd);
3540 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3542 if (sc->hn_tx_sysctl_tree != NULL) {
3543 struct sysctl_oid_list *child;
3544 struct sysctl_ctx_list *ctx;
3548 * Create per TX ring sysctl tree:
3549 * dev.hn.UNIT.tx.RINGID
3551 ctx = device_get_sysctl_ctx(dev);
3552 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3554 snprintf(name, sizeof(name), "%d", id);
3555 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3556 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3558 if (txr->hn_tx_sysctl_tree != NULL) {
3559 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3562 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3563 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3564 "# of available TX descs");
3566 #ifdef HN_IFSTART_SUPPORT
3567 if (!hn_use_if_start)
3570 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3571 CTLFLAG_RD, &txr->hn_oactive, 0,
3574 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3575 CTLFLAG_RW, &txr->hn_pkts,
3576 "# of packets transmitted");
3577 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3578 CTLFLAG_RW, &txr->hn_sends, "# of sends");
3586 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3588 struct hn_tx_ring *txr = txd->txr;
3590 KASSERT(txd->m == NULL, ("still has mbuf installed"));
3591 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3593 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3594 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3595 txd->rndis_pkt_dmap);
3596 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3600 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3603 KASSERT(txd->refs == 0 || txd->refs == 1,
3604 ("invalid txd refs %d", txd->refs));
3606 /* Aggregated txds will be freed by their aggregating txd. */
3607 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3610 freed = hn_txdesc_put(txr, txd);
3611 KASSERT(freed, ("can't free txdesc"));
3616 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3620 if (txr->hn_txdesc == NULL)
3625 * Because the freeing of aggregated txds will be deferred
3626 * to the aggregating txd, two passes are used here:
3627 * - The first pass GCes any pending txds. This GC is necessary,
3628 * since if the channels are revoked, the hypervisor will not
3629 * deliver send-done for all pending txds.
3630 * - The second pass frees the busdma resources, i.e. after all txds
3633 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3634 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3635 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3636 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3638 if (txr->hn_tx_data_dtag != NULL)
3639 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3640 if (txr->hn_tx_rndis_dtag != NULL)
3641 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3643 #ifdef HN_USE_TXDESC_BUFRING
3644 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3647 free(txr->hn_txdesc, M_DEVBUF);
3648 txr->hn_txdesc = NULL;
3650 if (txr->hn_mbuf_br != NULL)
3651 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3653 #ifndef HN_USE_TXDESC_BUFRING
3654 mtx_destroy(&txr->hn_txlist_spin);
3656 mtx_destroy(&txr->hn_tx_lock);
3660 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3662 struct sysctl_oid_list *child;
3663 struct sysctl_ctx_list *ctx;
3667 * Create TXBUF for chimney sending.
3669 * NOTE: It is shared by all channels.
3671 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3672 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3673 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3674 if (sc->hn_chim == NULL) {
3675 device_printf(sc->hn_dev, "allocate txbuf failed\n");
3679 sc->hn_tx_ring_cnt = ring_cnt;
3680 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3682 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3683 M_DEVBUF, M_WAITOK | M_ZERO);
3685 ctx = device_get_sysctl_ctx(sc->hn_dev);
3686 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3688 /* Create dev.hn.UNIT.tx sysctl tree */
3689 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3690 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3692 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3695 error = hn_tx_ring_create(sc, i);
3700 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3701 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3702 __offsetof(struct hn_tx_ring, hn_no_txdescs),
3703 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3704 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3705 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3706 __offsetof(struct hn_tx_ring, hn_send_failed),
3707 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
3708 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3709 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3710 __offsetof(struct hn_tx_ring, hn_txdma_failed),
3711 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
3712 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3713 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3714 __offsetof(struct hn_tx_ring, hn_flush_failed),
3715 hn_tx_stat_ulong_sysctl, "LU",
3716 "# of packet transmission aggregation flush failure");
3717 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3718 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3719 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3720 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3721 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3722 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3723 __offsetof(struct hn_tx_ring, hn_tx_chimney),
3724 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3725 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3726 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3727 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3728 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3729 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3730 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3731 "# of total TX descs");
3732 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3733 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3734 "Chimney send packet size upper boundary");
3735 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3736 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3737 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3738 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3739 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3740 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3741 hn_tx_conf_int_sysctl, "I",
3742 "Size of the packet for direct transmission");
3743 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3744 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3745 __offsetof(struct hn_tx_ring, hn_sched_tx),
3746 hn_tx_conf_int_sysctl, "I",
3747 "Always schedule transmission "
3748 "instead of doing direct transmission");
3749 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3750 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3751 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3752 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3753 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3754 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3755 "Applied packet transmission aggregation size");
3756 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3757 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3758 hn_txagg_pktmax_sysctl, "I",
3759 "Applied packet transmission aggregation packets");
3760 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3761 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3762 hn_txagg_align_sysctl, "I",
3763 "Applied packet transmission aggregation alignment");
3769 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3773 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3774 sc->hn_tx_ring[i].hn_chim_size = chim_size;
3778 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3780 struct ifnet *ifp = sc->hn_ifp;
3783 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3786 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3787 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3788 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3790 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3791 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3792 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3794 if (tso_maxlen < tso_minlen)
3795 tso_maxlen = tso_minlen;
3796 else if (tso_maxlen > IP_MAXPACKET)
3797 tso_maxlen = IP_MAXPACKET;
3798 if (tso_maxlen > sc->hn_ndis_tso_szmax)
3799 tso_maxlen = sc->hn_ndis_tso_szmax;
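	/* The advertised TSO limit excludes the Ethernet and VLAN headers. */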
3800 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3802 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
3806 hn_fixup_tx_data(struct hn_softc *sc)
3808 uint64_t csum_assist;
3811 hn_set_chim_size(sc, sc->hn_chim_szmax);
3812 if (hn_tx_chimney_size > 0 &&
3813 hn_tx_chimney_size < sc->hn_chim_szmax)
3814 hn_set_chim_size(sc, hn_tx_chimney_size);
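	/* Translate the device capabilities into CSUM_* flags for TX checksum offload. */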
3817 if (sc->hn_caps & HN_CAP_IPCS)
3818 csum_assist |= CSUM_IP;
3819 if (sc->hn_caps & HN_CAP_TCP4CS)
3820 csum_assist |= CSUM_IP_TCP;
3821 if (sc->hn_caps & HN_CAP_UDP4CS)
3822 csum_assist |= CSUM_IP_UDP;
3823 if (sc->hn_caps & HN_CAP_TCP6CS)
3824 csum_assist |= CSUM_IP6_TCP;
3825 if (sc->hn_caps & HN_CAP_UDP6CS)
3826 csum_assist |= CSUM_IP6_UDP;
3827 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3828 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3830 if (sc->hn_caps & HN_CAP_HASHVAL) {
3832 * Support HASHVAL pktinfo on TX path.
3835 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3836 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3837 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3842 hn_destroy_tx_data(struct hn_softc *sc)
3846 if (sc->hn_chim != NULL) {
3847 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
3848 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3850 device_printf(sc->hn_dev,
3851 "chimney sending buffer is referenced");
3856 if (sc->hn_tx_ring_cnt == 0)
3859 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3860 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3862 free(sc->hn_tx_ring, M_DEVBUF);
3863 sc->hn_tx_ring = NULL;
3865 sc->hn_tx_ring_cnt = 0;
3866 sc->hn_tx_ring_inuse = 0;
3869 #ifdef HN_IFSTART_SUPPORT
3872 hn_start_taskfunc(void *xtxr, int pending __unused)
3874 struct hn_tx_ring *txr = xtxr;
3876 mtx_lock(&txr->hn_tx_lock);
3877 hn_start_locked(txr, 0);
3878 mtx_unlock(&txr->hn_tx_lock);
3882 hn_start_locked(struct hn_tx_ring *txr, int len)
3884 struct hn_softc *sc = txr->hn_sc;
3885 struct ifnet *ifp = sc->hn_ifp;
3888 KASSERT(hn_use_if_start,
3889 ("hn_start_locked is called, when if_start is disabled"));
3890 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3891 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3892 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3894 if (__predict_false(txr->hn_suspended))
3897 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
3901 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
3902 struct hn_txdesc *txd;
3903 struct mbuf *m_head;
3906 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
3910 if (len > 0 && m_head->m_pkthdr.len > len) {
3912 * This sending could be time-consuming; let callers
3913 * dispatch this packet sending (and the sending of any
3914 * follow-up packets) to the tx taskqueue.
3916 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3921 #if defined(INET6) || defined(INET)
3922 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3923 m_head = hn_tso_fixup(m_head);
3924 if (__predict_false(m_head == NULL)) {
3925 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3931 txd = hn_txdesc_get(txr);
3933 txr->hn_no_txdescs++;
3934 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3935 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3939 error = hn_encap(ifp, txr, txd, &m_head);
3941 /* Both txd and m_head are freed */
3942 KASSERT(txr->hn_agg_txd == NULL,
3943 ("encap failed w/ pending aggregating txdesc"));
3947 if (txr->hn_agg_pktleft == 0) {
3948 if (txr->hn_agg_txd != NULL) {
3949 KASSERT(m_head == NULL,
3950 ("pending mbuf for aggregating txdesc"));
3951 error = hn_flush_txagg(ifp, txr);
3952 if (__predict_false(error)) {
3953 atomic_set_int(&ifp->if_drv_flags,
3958 KASSERT(m_head != NULL, ("mbuf was freed"));
3959 error = hn_txpkt(ifp, txr, txd);
3960 if (__predict_false(error)) {
3961 /* txd is freed, but m_head is not */
3962 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3963 atomic_set_int(&ifp->if_drv_flags,
3971 KASSERT(txr->hn_agg_txd != NULL,
3972 ("no aggregating txdesc"));
3973 KASSERT(m_head == NULL,
3974 ("pending mbuf for aggregating txdesc"));
3979 /* Flush pending aggregated transmission. */
3980 if (txr->hn_agg_txd != NULL)
3981 hn_flush_txagg(ifp, txr);
3986 hn_start(struct ifnet *ifp)
3988 struct hn_softc *sc = ifp->if_softc;
3989 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
3991 if (txr->hn_sched_tx)
3994 if (mtx_trylock(&txr->hn_tx_lock)) {
3997 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3998 mtx_unlock(&txr->hn_tx_lock);
4003 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4007 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
4009 struct hn_tx_ring *txr = xtxr;
4011 mtx_lock(&txr->hn_tx_lock);
4012 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
4013 hn_start_locked(txr, 0);
4014 mtx_unlock(&txr->hn_tx_lock);
4018 hn_start_txeof(struct hn_tx_ring *txr)
4020 struct hn_softc *sc = txr->hn_sc;
4021 struct ifnet *ifp = sc->hn_ifp;
4023 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4025 if (txr->hn_sched_tx)
4028 if (mtx_trylock(&txr->hn_tx_lock)) {
4031 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4032 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4033 mtx_unlock(&txr->hn_tx_lock);
4035 taskqueue_enqueue(txr->hn_tx_taskq,
4041 * Release OACTIVE early, in the hope that others
4042 * could catch up. The task will clear the flag again
4043 * while holding hn_tx_lock to avoid possible
4046 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4047 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4051 #endif /* HN_IFSTART_SUPPORT */
4054 hn_xmit(struct hn_tx_ring *txr, int len)
4056 struct hn_softc *sc = txr->hn_sc;
4057 struct ifnet *ifp = sc->hn_ifp;
4058 struct mbuf *m_head;
4061 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4062 #ifdef HN_IFSTART_SUPPORT
4063 KASSERT(hn_use_if_start == 0,
4064 ("hn_xmit is called, when if_start is enabled"));
4066 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4068 if (__predict_false(txr->hn_suspended))
4071 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4074 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4075 struct hn_txdesc *txd;
4078 if (len > 0 && m_head->m_pkthdr.len > len) {
4080 * This sending could be time-consuming; let callers
4081 * dispatch this packet sending (and the sending of any
4082 * follow-up packets) to the tx taskqueue.
4084 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4089 txd = hn_txdesc_get(txr);
4091 txr->hn_no_txdescs++;
4092 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4093 txr->hn_oactive = 1;
4097 error = hn_encap(ifp, txr, txd, &m_head);
4099 /* Both txd and m_head are freed; discard */
4100 KASSERT(txr->hn_agg_txd == NULL,
4101 ("encap failed w/ pending aggregating txdesc"));
4102 drbr_advance(ifp, txr->hn_mbuf_br);
4106 if (txr->hn_agg_pktleft == 0) {
4107 if (txr->hn_agg_txd != NULL) {
4108 KASSERT(m_head == NULL,
4109 ("pending mbuf for aggregating txdesc"));
4110 error = hn_flush_txagg(ifp, txr);
4111 if (__predict_false(error)) {
4112 txr->hn_oactive = 1;
4116 KASSERT(m_head != NULL, ("mbuf was freed"));
4117 error = hn_txpkt(ifp, txr, txd);
4118 if (__predict_false(error)) {
4119 /* txd is freed, but m_head is not */
4120 drbr_putback(ifp, txr->hn_mbuf_br,
4122 txr->hn_oactive = 1;
4129 KASSERT(txr->hn_agg_txd != NULL,
4130 ("no aggregating txdesc"));
4131 KASSERT(m_head == NULL,
4132 ("pending mbuf for aggregating txdesc"));
4137 drbr_advance(ifp, txr->hn_mbuf_br);
4140 /* Flush pending aggregated transmission. */
4141 if (txr->hn_agg_txd != NULL)
4142 hn_flush_txagg(ifp, txr);
4147 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4149 struct hn_softc *sc = ifp->if_softc;
4150 struct hn_tx_ring *txr;
4153 #if defined(INET6) || defined(INET)
4155 * Perform TSO packet header fixup now, since the TSO
4156 * packet header should be cache-hot.
4158 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4159 m = hn_tso_fixup(m);
4160 if (__predict_false(m == NULL)) {
4161 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4168 * Select the TX ring based on flowid
4170 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
4171 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4172 txr = &sc->hn_tx_ring[idx];
4174 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4176 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4180 if (txr->hn_oactive)
4183 if (txr->hn_sched_tx)
4186 if (mtx_trylock(&txr->hn_tx_lock)) {
4189 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4190 mtx_unlock(&txr->hn_tx_lock);
4195 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4200 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4204 mtx_lock(&txr->hn_tx_lock);
4205 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4207 mtx_unlock(&txr->hn_tx_lock);
4211 hn_xmit_qflush(struct ifnet *ifp)
4213 struct hn_softc *sc = ifp->if_softc;
4216 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4217 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4222 hn_xmit_txeof(struct hn_tx_ring *txr)
4225 if (txr->hn_sched_tx)
4228 if (mtx_trylock(&txr->hn_tx_lock)) {
4231 txr->hn_oactive = 0;
4232 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4233 mtx_unlock(&txr->hn_tx_lock);
4235 taskqueue_enqueue(txr->hn_tx_taskq,
4241 * Release oactive early, in the hope that others
4242 * could catch up. The task will clear oactive again
4243 * while holding hn_tx_lock to avoid possible
4246 txr->hn_oactive = 0;
4247 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4252 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4254 struct hn_tx_ring *txr = xtxr;
4256 mtx_lock(&txr->hn_tx_lock);
4258 mtx_unlock(&txr->hn_tx_lock);
4262 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4264 struct hn_tx_ring *txr = xtxr;
4266 mtx_lock(&txr->hn_tx_lock);
4267 txr->hn_oactive = 0;
4269 mtx_unlock(&txr->hn_tx_lock);
4273 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4275 struct vmbus_chan_br cbr;
4276 struct hn_rx_ring *rxr;
4277 struct hn_tx_ring *txr = NULL;
4280 idx = vmbus_chan_subidx(chan);
4283 * Link this channel to RX/TX ring.
4285 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4286 ("invalid channel index %d, should > 0 && < %d",
4287 idx, sc->hn_rx_ring_inuse));
4288 rxr = &sc->hn_rx_ring[idx];
4289 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4290 ("RX ring %d already attached", idx));
4291 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4292 rxr->hn_chan = chan;
4295 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4296 idx, vmbus_chan_id(chan));
4299 if (idx < sc->hn_tx_ring_inuse) {
4300 txr = &sc->hn_tx_ring[idx];
4301 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4302 ("TX ring %d already attached", idx));
4303 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4305 txr->hn_chan = chan;
4307 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4308 idx, vmbus_chan_id(chan));
4312 /* Bind this channel to a proper CPU. */
4313 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
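	/* Open the channel with the pre-allocated TX/RX bufring attached. */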
4318 cbr.cbr = rxr->hn_br;
4319 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4320 cbr.cbr_txsz = HN_TXBR_SIZE;
4321 cbr.cbr_rxsz = HN_RXBR_SIZE;
4322 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4324 if (error == EISCONN) {
4325 if_printf(sc->hn_ifp, "bufring is connected after "
4326 "chan%u open failure\n", vmbus_chan_id(chan));
4327 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4329 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4330 vmbus_chan_id(chan), error);
4337 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4339 struct hn_rx_ring *rxr;
4342 idx = vmbus_chan_subidx(chan);
4345 * Link this channel to RX/TX ring.
4347 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4348 ("invalid channel index %d, should > 0 && < %d",
4349 idx, sc->hn_rx_ring_inuse));
4350 rxr = &sc->hn_rx_ring[idx];
4351 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4352 ("RX ring %d is not attached", idx));
4353 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4355 if (idx < sc->hn_tx_ring_inuse) {
4356 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4358 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4359 ("TX ring %d is not attached attached", idx));
4360 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4364 * Close this channel.
4367 * Channel closing does _not_ destroy the target channel.
4369 error = vmbus_chan_close_direct(chan);
4370 if (error == EISCONN) {
4371 if_printf(sc->hn_ifp, "chan%u bufring is connected "
4372 "after being closed\n", vmbus_chan_id(chan));
4373 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4375 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4376 vmbus_chan_id(chan), error);
4381 hn_attach_subchans(struct hn_softc *sc)
4383 struct vmbus_channel **subchans;
4384 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4387 KASSERT(subchan_cnt > 0, ("no sub-channels"));
4389 /* Attach the sub-channels. */
4390 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4391 for (i = 0; i < subchan_cnt; ++i) {
4394 error1 = hn_chan_attach(sc, subchans[i]);
4397 /* Move on; all channels will be detached later. */
4400 vmbus_subchan_rel(subchans, subchan_cnt);
4403 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4406 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4414 hn_detach_allchans(struct hn_softc *sc)
4416 struct vmbus_channel **subchans;
4417 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4420 if (subchan_cnt == 0)
4423 /* Detach the sub-channels. */
4424 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4425 for (i = 0; i < subchan_cnt; ++i)
4426 hn_chan_detach(sc, subchans[i]);
4427 vmbus_subchan_rel(subchans, subchan_cnt);
4431 * Detach the primary channel, _after_ all sub-channels
4434 hn_chan_detach(sc, sc->hn_prichan);
4436 /* Wait for sub-channels to be destroyed, if any. */
4437 vmbus_subchan_drain(sc->hn_prichan);
4440 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4441 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4442 HN_RX_FLAG_ATTACHED) == 0,
4443 ("%dth RX ring is still attached", i));
4445 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4446 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4447 HN_TX_FLAG_ATTACHED) == 0,
4448 ("%dth TX ring is still attached", i));
4454 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4456 struct vmbus_channel **subchans;
4457 int nchan, rxr_cnt, error;
4459 nchan = *nsubch + 1;
4462 * Multiple RX/TX rings are not requested.
4469 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4472 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4474 /* No RSS; this is benign. */
4479 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4483 if (nchan > rxr_cnt)
4486 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4492 * Allocate sub-channels from NVS.
4494 *nsubch = nchan - 1;
4495 error = hn_nvs_alloc_subchans(sc, nsubch);
4496 if (error || *nsubch == 0) {
4497 /* Failed to allocate sub-channels. */
4503 * Wait for all sub-channels to become ready before moving on.
4505 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4506 vmbus_subchan_rel(subchans, *nsubch);
4511 hn_synth_attachable(const struct hn_softc *sc)
4515 if (sc->hn_flags & HN_FLAG_ERRORS)
4518 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4519 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4521 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
4528 hn_synth_attach(struct hn_softc *sc, int mtu)
4530 #define ATTACHED_NVS 0x0002
4531 #define ATTACHED_RNDIS 0x0004
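	/* Bits recording which synthetic parts have been attached, for error unwinding. */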
4533 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4534 int error, nsubch, nchan, i;
4535 uint32_t old_caps, attached = 0;
4537 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4538 ("synthetic parts were attached"));
4540 if (!hn_synth_attachable(sc))
4543 /* Save capabilities for later verification. */
4544 old_caps = sc->hn_caps;
4547 /* Clear RSS stuffs. */
4548 sc->hn_rss_ind_size = 0;
4549 sc->hn_rss_hash = 0;
4552 * Attach the primary channel _before_ attaching NVS and RNDIS.
4554 error = hn_chan_attach(sc, sc->hn_prichan);
4561 error = hn_nvs_attach(sc, mtu);
4564 attached |= ATTACHED_NVS;
4567 * Attach RNDIS _after_ NVS is attached.
4569 error = hn_rndis_attach(sc, mtu);
4572 attached |= ATTACHED_RNDIS;
4575 * Make sure capabilities are not changed.
4577 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4578 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4579 old_caps, sc->hn_caps);
4585 * Allocate sub-channels for multi-TX/RX rings.
4588 * The # of RX rings that can be used is equivalent to the # of
4589 * channels to be requested.
4591 nsubch = sc->hn_rx_ring_cnt - 1;
4592 error = hn_synth_alloc_subchans(sc, &nsubch);
4595 /* NOTE: _Full_ synthetic parts detach is required now. */
4596 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4599 * Set the # of TX/RX rings that could be used according to
4600 * the # of channels that NVS offered.
4603 hn_set_ring_inuse(sc, nchan);
4605 /* Only the primary channel can be used; done */
4610 * Attach the sub-channels.
4612 * NOTE: hn_set_ring_inuse() _must_ have been called.
4614 error = hn_attach_subchans(sc);
4619 * Configure RSS key and indirect table _after_ all sub-channels
4622 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4624 * RSS key is not set yet; set it to the default RSS key.
4627 if_printf(sc->hn_ifp, "setup default RSS key\n");
4628 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4629 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4632 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4634 * RSS indirect table is not set yet; set it up in round-
4638 if_printf(sc->hn_ifp, "setup default RSS indirect "
4641 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
4642 rss->rss_ind[i] = i % nchan;
4643 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4646 * The # of usable channels may have changed, so we have to
4647 * make sure that all entries in the RSS indirect table
4650 * NOTE: hn_set_ring_inuse() _must_ have been called.
4652 hn_rss_ind_fixup(sc);
4655 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4660 * Fixup transmission aggregation setup.
4666 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
4667 hn_synth_detach(sc);
4669 if (attached & ATTACHED_RNDIS)
4670 hn_rndis_detach(sc);
4671 if (attached & ATTACHED_NVS)
4673 hn_chan_detach(sc, sc->hn_prichan);
4674 /* Restore old capabilities. */
4675 sc->hn_caps = old_caps;
4679 #undef ATTACHED_RNDIS
4685 * The interface must have been suspended through hn_suspend(), before
4686 * this function gets called.
4689 hn_synth_detach(struct hn_softc *sc)
4692 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4693 ("synthetic parts were not attached"));
4695 /* Detach the RNDIS first. */
4696 hn_rndis_detach(sc);
4701 /* Detach all of the channels. */
4702 hn_detach_allchans(sc);
4704 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
static void
hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
{
	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
	    ("invalid ring count %d", ring_cnt));

	if (sc->hn_tx_ring_cnt > ring_cnt)
		sc->hn_tx_ring_inuse = ring_cnt;
	else
		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
	sc->hn_rx_ring_inuse = ring_cnt;

	if (bootverbose) {
		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
	}
}
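/*
 * Example (illustrative numbers): if NVS offers 8 usable channels
 * (ring_cnt == 8) but only 4 TX rings were created, the result is
 * hn_tx_ring_inuse == 4 while hn_rx_ring_inuse == 8; RX always uses
 * one ring per channel, whereas TX is capped by what was allocated.
 */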
static void
hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
{

	/*
	 * NOTE:
	 * The TX bufring will not be drained by the hypervisor,
	 * if the primary channel is revoked.
	 */
	while (!vmbus_chan_rx_empty(chan) ||
	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
	     !vmbus_chan_tx_empty(chan)))
		pause("waitch", 1);
	vmbus_chan_intr_drain(chan);
}
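/*
 * NOTE:
 * If the primary channel has been revoked (e.g. on device removal),
 * the host never consumes the TX bufring, so waiting for
 * vmbus_chan_tx_empty() would spin forever; that is why the TX-empty
 * test above is skipped once revocation is detected.
 */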
static void
hn_suspend_data(struct hn_softc *sc)
{
	struct vmbus_channel **subch = NULL;
	struct hn_tx_ring *txr;
	int i, nsubch;

	HN_LOCK_ASSERT(sc);

	/*
	 * Suspend TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 1;
		mtx_unlock(&txr->hn_tx_lock);
		/* No one is able to send more packets now. */

		/*
		 * Wait for all pending sends to finish.
		 *
		 * NOTE:
		 * We will _not_ receive all pending send-done, if the
		 * primary channel is revoked.
		 */
		while (hn_tx_ring_pending(txr) &&
		    !vmbus_chan_is_revoked(sc->hn_prichan))
			pause("hnwtx", 1 /* 1 tick */);
	}

	/*
	 * Disable RX by clearing RX filter.
	 */
	hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);

	/*
	 * Give RNDIS enough time to flush all pending data packets.
	 */
	pause("waitrx", (200 * hz) / 1000);

	/*
	 * Drain RX/TX bufrings and interrupts.
	 */
	nsubch = sc->hn_rx_ring_inuse - 1;
	if (nsubch > 0)
		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);

	if (subch != NULL) {
		for (i = 0; i < nsubch; ++i)
			hn_chan_drain(sc, subch[i]);
	}
	hn_chan_drain(sc, sc->hn_prichan);

	if (subch != NULL)
		vmbus_subchan_rel(subch, nsubch);

	/*
	 * Drain any pending TX tasks.
	 *
	 * NOTE:
	 * The above hn_chan_drain() can dispatch TX tasks, so the TX
	 * tasks will have to be drained _after_ the above hn_chan_drain()
	 * calls.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		txr = &sc->hn_tx_ring[i];

		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}
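/*
 * NOTE:
 * The suspend order above matters: TX is quiesced first so no new work
 * reaches the channels, the RX filter is cleared so the host stops
 * delivering data, the bufrings are then drained, and only afterwards
 * are the TX taskqueues drained, since draining a channel may itself
 * schedule TX tasks.
 */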
static void
hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
{

	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
}
static void
hn_suspend_mgmt(struct hn_softc *sc)
{
	struct task task;

	HN_LOCK_ASSERT(sc);

	/*
	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
	 * through hn_mgmt_taskq.
	 */
	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
	vmbus_chan_run_task(sc->hn_prichan, &task);

	/*
	 * Make sure that all pending management tasks are completed.
	 */
	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
	taskqueue_drain_all(sc->hn_mgmt_taskq0);
}
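/*
 * NOTE:
 * hn_suspend_mgmt() clears hn_mgmt_taskq from a task executed on the
 * primary channel, which serializes the store against the channel
 * callback: once vmbus_chan_run_task() returns, the callback can no
 * longer enqueue new management tasks, so the subsequent drains only
 * need to wait out tasks that were already queued.
 */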
static void
hn_suspend(struct hn_softc *sc)
{

	/* Disable polling. */
	hn_polling(sc, 0);

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
		hn_suspend_data(sc);
	hn_suspend_mgmt(sc);
}
static void
hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
{
	int i;

	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
	    ("invalid TX ring count %d", tx_ring_cnt));

	for (i = 0; i < tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 0;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
static void
hn_resume_data(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/*
	 * Re-enable RX.
	 */
	hn_rxfilter_config(sc);

	/*
	 * Make sure to clear suspend status on "all" TX rings,
	 * since hn_tx_ring_inuse can be changed after
	 * hn_suspend_data().
	 */
	hn_resume_tx(sc, sc->hn_tx_ring_cnt);

#ifdef HN_IFSTART_SUPPORT
	if (!hn_use_if_start)
#endif
	{
		/*
		 * Flush unused drbrs, since hn_tx_ring_inuse may be
		 * reduced.
		 */
		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
	}

	/*
	 * Kick start TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		/*
		 * Use txeof task, so that any pending oactive can be
		 * cleared properly.
		 */
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}
static void
hn_resume_mgmt(struct hn_softc *sc)
{

	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;

	/*
	 * Kick off network change detection, if it was pending.
	 * If no network change was pending, start link status
	 * checks, which are more lightweight than network change
	 * detection.
	 */
	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		hn_change_network(sc);
	else
		hn_update_link_status(sc);
}
static void
hn_resume(struct hn_softc *sc)
{

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
		hn_resume_data(sc);
	hn_resume_mgmt(sc);

	/*
	 * Re-enable polling if this interface is running and
	 * the polling is requested.
	 */
	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
		hn_polling(sc, sc->hn_pollhz);
}
static void
hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
{
	const struct rndis_status_msg *msg;
	int ofs;

	if (dlen < sizeof(*msg)) {
		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
		return;
	}
	msg = data;

	switch (msg->rm_status) {
	case RNDIS_STATUS_MEDIA_CONNECT:
	case RNDIS_STATUS_MEDIA_DISCONNECT:
		hn_update_link_status(sc);
		break;

	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
		/* Not really useful; ignore. */
		break;

	case RNDIS_STATUS_NETWORK_CHANGE:
		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
		if (dlen < ofs + msg->rm_stbuflen ||
		    msg->rm_stbuflen < sizeof(uint32_t)) {
			if_printf(sc->hn_ifp, "network changed\n");
		} else {
			uint32_t change;

			memcpy(&change, ((const uint8_t *)msg) + ofs,
			    sizeof(change));
			if_printf(sc->hn_ifp, "network changed, change %u\n",
			    change);
		}
		hn_change_network(sc);
		break;

	default:
		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
		    msg->rm_status);
		break;
	}
}
static int
hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
{
	const struct rndis_pktinfo *pi = info_data;
	uint32_t mask = 0;

	while (info_dlen != 0) {
		const void *data;
		uint32_t dlen;

		if (__predict_false(info_dlen < sizeof(*pi)))
			return (EINVAL);
		if (__predict_false(info_dlen < pi->rm_size))
			return (EINVAL);
		info_dlen -= pi->rm_size;

		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
			return (EINVAL);
		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
			return (EINVAL);
		dlen = pi->rm_size - pi->rm_pktinfooffset;
		data = pi->rm_data;

		switch (pi->rm_type) {
		case NDIS_PKTINFO_TYPE_VLAN:
			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
				return (EINVAL);
			info->vlan_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_VLAN;
			break;

		case NDIS_PKTINFO_TYPE_CSUM:
			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
				return (EINVAL);
			info->csum_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_CSUM;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
				return (EINVAL);
			info->hash_value = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHVAL;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHINF:
			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
				return (EINVAL);
			info->hash_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHINF;
			break;

		default:
			goto next;
		}

		if (mask == HN_RXINFO_ALL) {
			/* All found; done */
			break;
		}
next:
		pi = (const struct rndis_pktinfo *)
		    ((const uint8_t *)pi + pi->rm_size);
	}

	/*
	 * Final fixup.
	 * - If there is no hash value, invalidate the hash info.
	 */
	if ((mask & HN_RXINFO_HASHVAL) == 0)
		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
	return (0);
}
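/*
 * Example (illustrative numbers): a VLAN pktinfo with rm_size == 16,
 * rm_pktinfooffset == 12 and rm_type == NDIS_PKTINFO_TYPE_VLAN carries
 * a 4-byte payload (dlen == 16 - 12); once it is consumed, the walker
 * advances by rm_size bytes to the next pktinfo in the chain.
 */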
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off < check_off) {
		if (__predict_true(off + len <= check_off))
			return (false);
	} else if (off > check_off) {
		if (__predict_true(check_off + check_len <= off))
			return (false);
	}
	return (true);
}
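/*
 * Example (illustrative numbers): a region at offset 10 with length 5
 * covers bytes 10-14, so a check region starting at offset 14 overlaps
 * it (10 + 5 > 14), while one starting at offset 15 does not; equal
 * offsets always count as overlapping.
 */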
static void
hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_packet_msg *pkt;
	struct hn_rxinfo info;
	int data_off, pktinfo_off, data_len, pktinfo_len;

	/*
	 * Check length.
	 */
	if (__predict_false(dlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
		return;
	}
	pkt = data;

	if (__predict_false(dlen < pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
		return;
	}
	if (__predict_false(pkt->rm_len <
	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
		    "msglen %u, data %u, oob %u, pktinfo %u\n",
		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
		    pkt->rm_pktinfolen);
		return;
	}
	if (__predict_false(pkt->rm_datalen == 0)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
		return;
	}

	/*
	 * Check offsets.
	 */
#define IS_OFFSET_INVALID(ofs)			\
	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

	/* XXX Hyper-V does not meet data offset alignment requirement */
	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data offset %u\n", pkt->rm_dataoffset);
		return;
	}
	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "oob offset %u\n", pkt->rm_oobdataoffset);
		return;
	}
	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
		return;
	}

#undef IS_OFFSET_INVALID

	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
	data_len = pkt->rm_datalen;
	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
	pktinfo_len = pkt->rm_pktinfolen;

	/*
	 * Check OOB coverage.
	 */
	if (__predict_false(pkt->rm_oobdatalen != 0)) {
		int oob_off, oob_len;

		if_printf(rxr->hn_ifp, "got oobdata\n");
		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
		oob_len = pkt->rm_oobdatalen;

		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overflow, msglen %u, oob abs %d len %d\n",
			    pkt->rm_len, oob_off, oob_len);
			return;
		}

		/*
		 * Check against data.
		 */
		if (hn_rndis_check_overlap(oob_off, oob_len,
		    data_off, data_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps data, oob abs %d len %d, "
			    "data abs %d len %d\n",
			    oob_off, oob_len, data_off, data_len);
			return;
		}

		/*
		 * Check against pktinfo.
		 */
		if (pktinfo_len != 0 &&
		    hn_rndis_check_overlap(oob_off, oob_len,
		    pktinfo_off, pktinfo_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps pktinfo, oob abs %d len %d, "
			    "pktinfo abs %d len %d\n",
			    oob_off, oob_len, pktinfo_off, pktinfo_len);
			return;
		}
	}

	/*
	 * Check per-packet-info coverage and find useful per-packet-info.
	 */
	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
	if (__predict_true(pktinfo_len != 0)) {
		bool overlap;
		int error;

		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overflow, msglen %u, "
			    "pktinfo abs %d len %d\n",
			    pkt->rm_len, pktinfo_off, pktinfo_len);
			return;
		}

		/*
		 * Check packet info coverage.
		 */
		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
		    data_off, data_len);
		if (__predict_false(overlap)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overlaps data, pktinfo abs %d len %d, "
			    "data abs %d len %d\n",
			    pktinfo_off, pktinfo_len, data_off, data_len);
			return;
		}

		/*
		 * Find useful per-packet-info.
		 */
		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
		    pktinfo_len, &info);
		if (__predict_false(error)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
			    "pktinfo\n");
			return;
		}
	}

	if (__predict_false(data_off + data_len > pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data overflow, msglen %u, data abs %d len %d\n",
		    pkt->rm_len, data_off, data_len);
		return;
	}

	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
}
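/*
 * NOTE:
 * A RNDIS packet message can carry three regions after its fixed
 * header -- data, OOB data and per-packet-info.  Each region is
 * located by an offset that RNDIS_PACKET_MSG_OFFSET_ABS() converts
 * into an absolute position within the message; the checks above
 * reject any region that spills past rm_len or overlaps another
 * region before a single byte of it is trusted.
 */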
static __inline void
hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_msghdr *hdr;

	if (__predict_false(dlen < sizeof(*hdr))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
		return;
	}
	hdr = data;

	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
		/* Hot data path. */
		hn_rndis_rx_data(rxr, data, dlen);
		/* Done! */
		return;
	}

	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
	else
		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
}
static void
hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
{
	const struct hn_nvs_hdr *hdr;

	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
		if_printf(sc->hn_ifp, "invalid nvs notify\n");
		return;
	}
	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);

	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
		/* Useless; ignore */
		return;
	}
	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
}
static void
hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt)
{
	struct hn_nvs_sendctx *sndc;

	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
	    VMBUS_CHANPKT_DATALEN(pkt));
	/*
	 * NOTE:
	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
	 * its callback.
	 */
}
static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

	/* Make sure that this is a RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}

	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}

	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}
		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
	}

	/*
	 * Ack the consumed RXBUF associated w/ this channel packet,
	 * so that this RXBUF can be recycled by the hypervisor.
	 */
	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}
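/*
 * Example (illustrative count): a single RXBUF channel packet with
 * cp_rxbuf_cnt == 3 delivers three RNDIS packet messages -- one
 * Ethernet frame each -- at the (rb_ofs, rb_len) slices of the shared
 * RXBUF; once all of them are processed, one ack recycles the whole
 * buffer back to the hypervisor.
 */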
static void
hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    uint64_t tid)
{
	struct hn_nvs_rndis_ack ack;
	int retries, error;

	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
	ack.nvs_status = HN_NVS_STATUS_OK;

	retries = 0;
again:
	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
	if (__predict_false(error == EAGAIN)) {
		/*
		 * NOTE:
		 * This should _not_ happen in real world, since the
		 * consumption of the TX bufring from the TX path is
		 * controlled.
		 */
		if (rxr->hn_ack_failed == 0)
			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
		rxr->hn_ack_failed++;
		retries++;
		if (retries < 10) {
			DELAY(100);
			goto again;
		}
		/* RXBUF leaks! */
		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
	}
}
static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
	struct hn_rx_ring *rxr = xrxr;
	struct hn_softc *sc = rxr->hn_ifp->if_softc;

	for (;;) {
		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
		int error, pktlen;

		pktlen = rxr->hn_pktbuf_len;
		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
		if (__predict_false(error == ENOBUFS)) {
			void *nbuf;
			int nlen;

			/*
			 * Expand channel packet buffer.
			 *
			 * XXX
			 * Use M_WAITOK here, since allocation failure
			 * is not an option here.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);

			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;

		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}
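/*
 * Example (illustrative sizes): if hn_pktbuf is 4KB and
 * vmbus_chan_recv_pkt() reports a 9KB channel packet via ENOBUFS,
 * the doubling loop grows the buffer 4KB -> 8KB -> 16KB before the
 * receive is retried, so the buffer only ever grows by powers of two.
 */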
static void
hn_tx_taskq_create(void *arg __unused)
{
	int i;

	/*
	 * Fix the # of TX taskqueues.
	 */
	if (hn_tx_taskq_cnt <= 0)
		hn_tx_taskq_cnt = 1;
	else if (hn_tx_taskq_cnt > mp_ncpus)
		hn_tx_taskq_cnt = mp_ncpus;

	/*
	 * Fix the TX taskqueue mode.
	 */
	switch (hn_tx_taskq_mode) {
	case HN_TX_TASKQ_M_INDEP:
	case HN_TX_TASKQ_M_GLOBAL:
	case HN_TX_TASKQ_M_EVTTQ:
		break;
	default:
		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
		break;
	}

	if (vm_guest != VM_GUEST_HV)
		return;

	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
		return;

	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
	    M_DEVBUF, M_WAITOK);
	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
		    "hn tx%d", i);
	}
}
SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_create, NULL);
static void
hn_tx_taskq_destroy(void *arg __unused)
{

	if (hn_tx_taskque != NULL) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(hn_tx_taskque[i]);
		free(hn_tx_taskque, M_DEVBUF);
	}
}
SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_destroy, NULL);