/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"
#include "opt_inet.h"
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>
#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>
#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"
#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
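/*
 * Note (added for clarity, not in the original source): HN_RNDIS_PKT_LEN
 * is the worst-case RNDIS header room reserved in front of each packet --
 * the fixed rndis_packet_msg plus the four pktinfos that hn_encap() may
 * append (hash value, VLAN, LSO2 and TX checksum).
 */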
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1
#define HN_LOCK_INIT(sc)		\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
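/*
 * Illustrative arithmetic (not from the original source): assuming a
 * hypothetical 102-byte HN_RNDIS_PKT_LEN and a 32-byte alignment, a
 * 1500-byte frame yields HN_PKTSIZE() == roundup2(1602, 32) == 1632.
 */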
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;		/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004
struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);
#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static void			hn_stop(struct hn_softc *);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_polling(struct hn_softc *, u_int);
static void			hn_chan_polling(struct vmbus_channel *, u_int);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);
static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
static int			hn_rxfilter_config(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");
/* Trust TCP segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust TCP segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust UDP datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust IP packet verification on host side, "
    "when csum info is missing (global setting)");
/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif
/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif
/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
static u_int			hn_cpu_index;	/* next CPU for channel */
static struct taskqueue		**hn_tx_taskque;/* shared TX taskqueues */

static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
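/*
 * Observation (added note, not in the original source): this appears to
 * be the standard 40-byte default Toeplitz hash key suggested by
 * Microsoft's RSS documentation.
 */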
static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif
static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}
static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}
static void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
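/*
 * Worked example (illustrative, not in the original source): with
 * LONG_BIT == 64, chimney index 70 lives in bitmap word 70 / 64 == 1 as
 * bit 70 % 64 == 6, so hn_chim_free() clears it with mask 1UL << 6.
 * hn_chim_alloc() finds free slots by scanning ~bmap[i] with ffsl() and
 * claims the bit with atomic_testandset_long(), which is why no lock is
 * needed around the chimney bitmap.
 */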
#if defined(INET6) || defined(INET)
/*
 * NOTE: If this function failed, the m_head would be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);

#undef PULLUP_HDR
}
#endif	/* INET6 || INET */
static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}
static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	if (ifp->if_flags & IFF_PROMISC) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}
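/*
 * Example (illustrative): if only 4 channels are in use, an indirect
 * table entry of 6 is rewritten to 3 (nchan - 1), so every table slot
 * keeps pointing at a usable RX ring.
 */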
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}
/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
	0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};
static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
	    &g_net_vsc_device_type) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}
static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);
	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}
	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);
	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions, which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if an error happens later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is the same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif
	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;
	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}
	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fixup TX stuffs after synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);
	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disable polling");
	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
	/*
	 * Setup the ifnet for this interface.
	 */

#ifdef __LP64__
	ifp->if_baudrate = IF_Gbps(10);
#else
	/* if_baudrate is 32bits on 32bit system. */
	ifp->if_baudrate = IF_Gbps(1);
#endif
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}
	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
#ifdef foo
	/* We can't distinguish IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}
	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default, they still can
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}
	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	return (0);
failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}
static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp;

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		/*
		 * In case that the vmbus missed the orphan handler
		 * installation.
		 */
		vmbus_xact_ctx_orphan(sc->hn_xact);
	}

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * tasks have to be suspended manually here.
			 */
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);
		}
		HN_UNLOCK(sc);
		ether_ifdetach(ifp);
	}

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(sc->hn_tx_taskqs[i]);
		free(sc->hn_tx_taskqs, M_DEVBUF);
	}
	taskqueue_free(sc->hn_mgmt_taskq0);

	if (sc->hn_xact != NULL) {
		/*
		 * Uninstall the orphan handler _before_ the xact is
		 * destructed.
		 */
		vmbus_chan_unset_orphan(sc->hn_prichan);
		vmbus_xact_ctx_destroy(sc->hn_xact);
	}

	if_free(ifp);

	HN_LOCK_DESTROY(sc);
	return (0);
}
static int
hn_shutdown(device_t dev)
{

	return (0);
}
static void
hn_link_status(struct hn_softc *sc)
{
	uint32_t link_status;
	int error;

	error = hn_rndis_get_linkstatus(sc, &link_status);
	if (error) {
		/* XXX what to do? */
		return;
	}

	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
	else
		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp,
	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
	    LINK_STATE_UP : LINK_STATE_DOWN);
}
static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		return;
	hn_link_status(sc);
}
static void
hn_netchg_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Prevent any link status checks from running. */
	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;

	/*
	 * Fake up a [link down --> link up] state change; 5 seconds
	 * delay is used, which closely simulates miibus reaction
	 * upon link down event.
	 */
	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
	    &sc->hn_netchg_status, 5 * hz);
}
static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Re-allow link status checks. */
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
	hn_link_status(sc);
}
static void
hn_update_link_status(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}
static __inline int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m = *m_head;
	int error;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		struct mbuf *m_new;

		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		if (m_new == NULL)
			return ENOBUFS;
		*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	}
	if (!error) {
		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
		    BUS_DMASYNC_PREWRITE);
		txd->flags |= HN_TXD_FLAG_DMAMAP;
	}
	return error;
}
static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return 0;

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("recursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
	txr->hn_txdesc_avail++;
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else	/* HN_USE_TXDESC_BUFRING */
#ifdef HN_DEBUG
	atomic_add_int(&txr->hn_txdesc_avail, 1);
#endif
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif	/* !HN_USE_TXDESC_BUFRING */

	return 1;
}
static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	txd = SLIST_FIRST(&txr->hn_txlist);
	if (txd != NULL) {
		KASSERT(txr->hn_txdesc_avail > 0,
		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
		txr->hn_txdesc_avail--;
		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
	}
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

	if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
#ifdef HN_DEBUG
		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
#endif	/* HN_USE_TXDESC_BUFRING */
		KASSERT(txd->m == NULL && txd->refs == 0 &&
		    STAILQ_EMPTY(&txd->agg_list) &&
		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
		    txd->chim_size == 0 &&
		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
		txd->flags &= ~HN_TXD_FLAG_ONLIST;
		txd->refs = 1;
	}
	return txd;
}
static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

	/* 0->1 transition will never work */
	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	atomic_add_int(&txd->refs, 1);
}
static __inline void
hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
{

	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("recursive aggregation on aggregating txdesc"));

	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("already aggregated"));
	KASSERT(STAILQ_EMPTY(&txd->agg_list),
	    ("recursive aggregation on to-be-aggregated txdesc"));

	txd->flags |= HN_TXD_FLAG_ONAGG;
	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
}
static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
	bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
		pending = true;
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	if (!buf_ring_full(txr->hn_txdesc_br))
		pending = true;
#endif
	return (pending);
}
static __inline void
hn_txeof(struct hn_tx_ring *txr)
{
	txr->hn_has_txeof = 0;
	txr->hn_txeof(txr);
}
static void
hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
	struct hn_txdesc *txd = sndc->hn_cbarg;
	struct hn_tx_ring *txr;

	txr = txd->txr;
	KASSERT(txr->hn_chan == chan,
	    ("channel mismatch, on chan%u, should be chan%u",
	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));

	txr->hn_has_txeof = 1;
	hn_txdesc_put(txr, txd);

	++txr->hn_txdone_cnt;
	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
		txr->hn_txdone_cnt = 0;
		if (txr->hn_oactive)
			hn_txeof(txr);
	}
}
static void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
	struct lro_ctrl *lro = &rxr->hn_lro;
	struct lro_entry *queued;

	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
		SLIST_REMOVE_HEAD(&lro->lro_active, next);
		tcp_lro_flush(lro, queued);
	}
#endif

	/*
	 * NOTE:
	 * 'txr' could be NULL, if multiple channels and
	 * ifnet.if_start method are enabled.
	 */
	if (txr == NULL || !txr->hn_has_txeof)
		return;

	txr->hn_txdone_cnt = 0;
	hn_txeof(txr);
}
static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
	    ("invalid RNDIS packet msg offset %u", ofs));
	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}
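/*
 * Note (added for clarity, not in the original source): while the message
 * is being built, offsets such as rm_dataoffset count from the start of
 * rndis_packet_msg; the on-wire format wants them relative to the end of
 * the type/length header (i.e. the position of the rm_dataoffset field),
 * which is what subtracting __offsetof(..., rm_dataoffset) accomplishes.
 */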
static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
{
	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
	struct rndis_pktinfo *pi;

	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

	/*
	 * Per-packet-info does not move; it only grows.
	 *
	 * NOTE:
	 * rm_pktinfooffset in this phase counts from the beginning
	 * of rndis_packet_msg.
	 */
	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
	    pkt->rm_pktinfolen);
	pkt->rm_pktinfolen += pi_size;

	pi->rm_size = pi_size;
	pi->rm_type = pi_type;
	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

	/* Data immediately follow per-packet-info. */
	pkt->rm_dataoffset += pi_size;

	/* Update RNDIS packet msg length */
	pkt->rm_len += pi_size;

	return (pi->rm_data);
}
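/*
 * Illustrative layout (added note, not in the original source): after two
 * appends the message looks like
 *
 *	rndis_packet_msg | pktinfo0 | pktinfo1 | packet data
 *
 * Each append grows rm_pktinfolen and pushes rm_dataoffset and rm_len by
 * pi_size, which is why the data always stays behind the pktinfos.
 */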
static __inline int
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;
	struct mbuf *m;
	int error, pkts;

	txd = txr->hn_agg_txd;
	KASSERT(txd != NULL, ("no aggregate txdesc"));

	/*
	 * Since hn_txpkt() will reset this temporary stat, save
	 * it now, so that oerrors can be updated properly, if
	 * hn_txpkt() ever fails.
	 */
	pkts = txr->hn_stat_pkts;

	/*
	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
	 * failure, save it for later freeing, if hn_txpkt() ever
	 * fails.
	 */
	m = txd->m;
	error = hn_txpkt(ifp, txr, txd);
	if (__predict_false(error)) {
		/* txd is freed, but m is not. */
		m_freem(m);

		txr->hn_flush_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
	}

	/* Reset all aggregation states. */
	txr->hn_agg_txd = NULL;
	txr->hn_agg_szleft = 0;
	txr->hn_agg_pktleft = 0;
	txr->hn_agg_prevpkt = NULL;

	return (error);
}
static void *
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    int pktsize)
{
	void *chim;

	if (txr->hn_agg_txd != NULL) {
		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
			int olen;

			/*
			 * Update the previous RNDIS packet's total length,
			 * it can be increased due to the mandatory alignment
			 * padding for this RNDIS packet.  And update the
			 * aggregating txdesc's chimney sending buffer size
			 * accordingly.
			 *
			 * NOTE:
			 * Zero-out the padding, as required by the RNDIS spec.
			 */
			olen = pkt->rm_len;
			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
			agg_txd->chim_size += pkt->rm_len - olen;

			/* Link this txdesc to the parent. */
			hn_txdesc_agg(agg_txd, txd);

			chim = (uint8_t *)pkt + pkt->rm_len;
			/* Save the current packet for later fixup. */
			txr->hn_agg_prevpkt = chim;

			txr->hn_agg_pktleft--;
			txr->hn_agg_szleft -= pktsize;
			if (txr->hn_agg_szleft <=
			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
				/*
				 * Probably can't aggregate more packets,
				 * flush this aggregating txdesc proactively.
				 */
				txr->hn_agg_pktleft = 0;
			}
			/* Done! */
			return (chim);
		}
		hn_flush_txagg(ifp, txr);
	}
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	txr->hn_tx_chimney_tried++;
	txd->chim_index = hn_chim_alloc(txr->hn_sc);
	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
		return (NULL);
	txr->hn_tx_chimney++;

	chim = txr->hn_sc->hn_chim +
	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);

	if (txr->hn_agg_pktmax > 1 &&
	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
		txr->hn_agg_txd = txd;
		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
		txr->hn_agg_prevpkt = chim;
	}
	return (chim);
}
/*
 * NOTE:
 * If this function fails, then both txd and m_head0 will be freed.
 */
static int
hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head0)
{
	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
	int error, nsegs, i;
	struct mbuf *m_head = *m_head0;
	struct rndis_packet_msg *pkt;
	uint32_t *pi_data;
	void *chim = NULL;
	int pkt_hlen, pkt_size;

	pkt = txd->rndis_pkt;
	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
	if (pkt_size < txr->hn_chim_size) {
		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
		if (chim != NULL)
			pkt = chim;
	} else {
		if (txr->hn_agg_txd != NULL)
			hn_flush_txagg(ifp, txr);
	}
	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
	pkt->rm_dataoffset = sizeof(*pkt);
	pkt->rm_datalen = m_head->m_pkthdr.len;
	pkt->rm_oobdataoffset = 0;
	pkt->rm_oobdatalen = 0;
	pkt->rm_oobdataelements = 0;
	pkt->rm_pktinfooffset = sizeof(*pkt);
	pkt->rm_pktinfolen = 0;
	pkt->rm_vchandle = 0;
	pkt->rm_reserved = 0;

	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
		/*
		 * Set the hash value for this packet, so that the host could
		 * dispatch the TX done event for this packet back to this TX
		 * ring's channel.
		 */
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
		*pi_data = txr->hn_tx_idx;
	}
	if (m_head->m_flags & M_VLANTAG) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
		*pi_data = NDIS_VLAN_INFO_MAKE(
		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
	}
	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
#ifdef INET
		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET6
		{
			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#endif	/* INET6 || INET */
	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
		if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
			*pi_data = NDIS_TXCSUM_INFO_IPV6;
		} else {
			*pi_data = NDIS_TXCSUM_INFO_IPV4;
			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
		}

		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
		else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP6_UDP))
			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
	}
	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
	/* Convert RNDIS packet message offsets */
	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);

	/*
	 * Fast path: Chimney sending.
	 */
	if (chim != NULL) {
		struct hn_txdesc *tgt_txd = txd;

		if (txr->hn_agg_txd != NULL) {
			tgt_txd = txr->hn_agg_txd;
		}

		KASSERT(pkt == chim,
		    ("RNDIS pkt not in chimney sending buffer"));
		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
		    ("chimney sending buffer is not used"));
		tgt_txd->chim_size += pkt->rm_len;

		m_copydata(m_head, 0, m_head->m_pkthdr.len,
		    ((uint8_t *)chim) + pkt_hlen);

		txr->hn_gpa_cnt = 0;
		txr->hn_sendpkt = hn_txpkt_chim;
		goto done;
	}
	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
	    ("chimney buffer is used"));
	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));

	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
	if (__predict_false(error)) {
		int freed;

		/*
		 * This mbuf is not linked w/ the txd yet, so free it now.
		 */
		m_freem(m_head);
		*m_head0 = NULL;

		freed = hn_txdesc_put(txr, txd);
		KASSERT(freed != 0,
		    ("fail to free txd upon txdma error"));

		txr->hn_txdma_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
		return error;
	}
	*m_head0 = m_head;
	/* +1 RNDIS packet message */
	txr->hn_gpa_cnt = nsegs + 1;

	/* send packet with page buffer */
	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
	txr->hn_gpa[0].gpa_len = pkt_hlen;

	/*
	 * Fill the page buffers with mbuf info after the page
	 * buffer for RNDIS packet message.
	 */
	for (i = 0; i < nsegs; ++i) {
		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];

		gpa->gpa_page = atop(segs[i].ds_addr);
		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
		gpa->gpa_len = segs[i].ds_len;
	}

	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
	txd->chim_size = 0;
	txr->hn_sendpkt = hn_txpkt_sglist;
done:
	txd->m = m_head;

	/* Set the completion routine */
	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);

	/* Update temporary stats for later use. */
	txr->hn_stat_pkts++;
	txr->hn_stat_size += m_head->m_pkthdr.len;
	if (m_head->m_flags & M_MCAST)
		txr->hn_stat_mcasts++;

	return 0;
}
/*
 * NOTE:
 * If this function fails, then txd will be freed, but the mbuf
 * associated w/ the txd will _not_ be freed.
 */
static int
hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	int error, send_failed = 0, has_bpf;

again:
	has_bpf = bpf_peers_present(ifp->if_bpf);
	if (has_bpf) {
		/*
		 * Make sure that this txd and any aggregated txds are not
		 * freed before ETHER_BPF_MTAP.
		 */
		hn_txdesc_hold(txd);
	}
	error = txr->hn_sendpkt(txr, txd);
	if (!error) {
		if (has_bpf) {
			const struct hn_txdesc *tmp_txd;

			ETHER_BPF_MTAP(ifp, txd->m);
			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
				ETHER_BPF_MTAP(ifp, tmp_txd->m);
		}

		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
#ifdef HN_IFSTART_SUPPORT
		if (!hn_use_if_start)
#endif
		{
			if_inc_counter(ifp, IFCOUNTER_OBYTES,
			    txr->hn_stat_size);
			if (txr->hn_stat_mcasts != 0) {
				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
				    txr->hn_stat_mcasts);
			}
		}
		txr->hn_pkts += txr->hn_stat_pkts;
	}
	if (has_bpf)
		hn_txdesc_put(txr, txd);

	if (__predict_false(error)) {
		int freed;

		/*
		 * This should "really rarely" happen.
		 *
		 * XXX Too many RX to be acked or too many sideband
		 * commands to run?  Ask netvsc_channel_rollup()
		 * to kick start later.
		 */
		txr->hn_has_txeof = 1;
		if (!send_failed) {
			txr->hn_send_failed++;
			send_failed = 1;
			/*
			 * Try sending again after setting hn_has_txeof,
			 * in case we missed the last
			 * netvsc_channel_rollup().
			 */
			goto again;
		}
		if_printf(ifp, "send failed\n");

		/*
		 * Caller will perform further processing on the
		 * associated mbuf, so don't free it in hn_txdesc_put();
		 * only unload it from the DMA map in hn_txdesc_put(),
		 * if it was loaded.
		 */
		txd->m = NULL;
		freed = hn_txdesc_put(txr, txd);
		KASSERT(freed != 0,
		    ("fail to free txd upon send error"));

		txr->hn_send_failed++;
	}

	/* Reset temporary stats, after this sending is done. */
	txr->hn_stat_size = 0;
	txr->hn_stat_pkts = 0;
	txr->hn_stat_mcasts = 0;

	return (error);
}
/*
 * Append the specified data to the indicated mbuf chain.
 * Extend the mbuf chain if the new data does not fit in
 * existing space.
 *
 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
 * There should be an equivalent in the kernel mbuf code,
 * but there does not appear to be one yet.
 *
 * Differs from m_append() in that additional mbufs are
 * allocated with cluster size MJUMPAGESIZE, and filled
 * accordingly.
 *
 * Return 1 if able to complete the job; otherwise 0.
 */
static int
hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
{
	struct mbuf *m, *n;
	int remainder, space;

	for (m = m0; m->m_next != NULL; m = m->m_next)
		;
	remainder = len;
	space = M_TRAILINGSPACE(m);
	if (space > 0) {
		/*
		 * Copy into available space.
		 */
		if (space > remainder)
			space = remainder;
		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
		m->m_len += space;
		cp += space;
		remainder -= space;
	}
	while (remainder > 0) {
		/*
		 * Allocate a new mbuf; could check space
		 * and allocate a cluster instead.
		 */
		n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
		if (n == NULL)
			break;
		n->m_len = min(MJUMPAGESIZE, remainder);
		bcopy(cp, mtod(n, caddr_t), n->m_len);
		cp += n->m_len;
		remainder -= n->m_len;
		m->m_next = n;
		m = n;
	}
	if (m0->m_flags & M_PKTHDR)
		m0->m_pkthdr.len += len - remainder;

	return (remainder == 0);
}
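/*
 * Usage sketch (illustrative, not from the original source): copying a
 * large receive buffer into a fresh packet-header mbuf, extending the
 * chain with MJUMPAGESIZE clusters as needed.
 *
 *	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
 *	if (m != NULL && !hv_m_append(m, dlen, data))
 *		m_freem(m);	(ran out of clusters midway)
 */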
#if defined(INET) || defined(INET6)
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
	if (hn_lro_mbufq_depth) {
		tcp_lro_queue_mbuf(lc, m);
		return 0;
	}
#endif
	return tcp_lro_rx(lc, m, 0);
}
#endif
2113 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2114 const struct hn_rxinfo *info)
2116 struct ifnet *ifp = rxr->hn_ifp;
2118 int size, do_lro = 0, do_csum = 1;
2119 int hash_type = M_HASHTYPE_OPAQUE;
2121 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
2125 * Bail out if packet contains more data than configured MTU.
2127 if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
2129 } else if (dlen <= MHLEN) {
2130 m_new = m_gethdr(M_NOWAIT, MT_DATA);
2131 if (m_new == NULL) {
2132 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2135 memcpy(mtod(m_new, void *), data, dlen);
2136 m_new->m_pkthdr.len = m_new->m_len = dlen;
2137 rxr->hn_small_pkts++;
2140 * Get an mbuf with a cluster. For packets 2K or less,
2141 * get a standard 2K cluster. For anything larger, get a
2142 * 4K cluster. Any buffers larger than 4K can cause problems
2143 * if looped around to the Hyper-V TX channel, so avoid them.
2146 if (dlen > MCLBYTES) {
2148 size = MJUMPAGESIZE;
2151 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2152 if (m_new == NULL) {
2153 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2157 hv_m_append(m_new, dlen, data);
2159 m_new->m_pkthdr.rcvif = ifp;
2161 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2164 /* receive side checksum offload */
2165 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2166 /* IP csum offload */
2167 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2168 m_new->m_pkthdr.csum_flags |=
2169 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2173 /* TCP/UDP csum offload */
2174 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2175 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2176 m_new->m_pkthdr.csum_flags |=
2177 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2178 m_new->m_pkthdr.csum_data = 0xffff;
2179 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2187 * As of this writing (Oct 28th, 2016), the host side turns
2188 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2189 * the do_lro setting here is actually _not_ accurate. We
2190 * depend on the RSS hash type check to reset do_lro.
2192 if ((info->csum_info &
2193 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2194 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2197 const struct ether_header *eh;
2202 if (m_new->m_len < hoff)
2204 eh = mtod(m_new, struct ether_header *);
2205 etype = ntohs(eh->ether_type);
2206 if (etype == ETHERTYPE_VLAN) {
2207 const struct ether_vlan_header *evl;
2209 hoff = sizeof(*evl);
2210 if (m_new->m_len < hoff)
2212 evl = mtod(m_new, struct ether_vlan_header *);
2213 etype = ntohs(evl->evl_proto);
2216 if (etype == ETHERTYPE_IP) {
2219 pr = hn_check_iplen(m_new, hoff);
2220 if (pr == IPPROTO_TCP) {
2222 (rxr->hn_trust_hcsum &
2223 HN_TRUST_HCSUM_TCP)) {
2224 rxr->hn_csum_trusted++;
2225 m_new->m_pkthdr.csum_flags |=
2226 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2227 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2228 m_new->m_pkthdr.csum_data = 0xffff;
2231 } else if (pr == IPPROTO_UDP) {
2233 (rxr->hn_trust_hcsum &
2234 HN_TRUST_HCSUM_UDP)) {
2235 rxr->hn_csum_trusted++;
2236 m_new->m_pkthdr.csum_flags |=
2237 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2238 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2239 m_new->m_pkthdr.csum_data = 0xffff;
2241 } else if (pr != IPPROTO_DONE && do_csum &&
2242 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2243 rxr->hn_csum_trusted++;
2244 m_new->m_pkthdr.csum_flags |=
2245 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2250 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2251 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2252 NDIS_VLAN_INFO_ID(info->vlan_info),
2253 NDIS_VLAN_INFO_PRI(info->vlan_info),
2254 NDIS_VLAN_INFO_CFI(info->vlan_info));
2255 m_new->m_flags |= M_VLANTAG;
2258 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2260 m_new->m_pkthdr.flowid = info->hash_value;
2261 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2262 NDIS_HASH_FUNCTION_TOEPLITZ) {
2263 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2267 * do_lro is reset if the hash types are not TCP
2268 * related.  See the comment for the above csum_flags
2272 case NDIS_HASH_IPV4:
2273 hash_type = M_HASHTYPE_RSS_IPV4;
2277 case NDIS_HASH_TCP_IPV4:
2278 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2281 case NDIS_HASH_IPV6:
2282 hash_type = M_HASHTYPE_RSS_IPV6;
2286 case NDIS_HASH_IPV6_EX:
2287 hash_type = M_HASHTYPE_RSS_IPV6_EX;
2291 case NDIS_HASH_TCP_IPV6:
2292 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2295 case NDIS_HASH_TCP_IPV6_EX:
2296 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2301 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2303 M_HASHTYPE_SET(m_new, hash_type);
2306 * Note: Moved RX completion back to hv_nv_on_receive() so all
2307 * messages (not just data messages) will trigger a response.
2313 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2314 #if defined(INET) || defined(INET6)
2315 struct lro_ctrl *lro = &rxr->hn_lro;
2318 rxr->hn_lro_tried++;
2319 if (hn_lro_rx(lro, m_new) == 0) {
2327 /* We're not holding the lock here, so don't release it */
2328 (*ifp->if_input)(ifp, m_new);
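/*
 * Note (general mbuf checksum convention, stated here for context):
 * once CSUM_DATA_VALID | CSUM_PSEUDO_HDR are set with csum_data ==
 * 0xffff, the protocol input paths accept the packet without
 * recomputing the checksum in software, along the lines of:
 *
 *	if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 *		th->th_sum = m->m_pkthdr.csum_data ^ 0xffff;
 *
 * This is why the checksum flags above are set whenever the host (or
 * the trust-hcsum knobs) vouches for a packet.
 */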
2334 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2336 struct hn_softc *sc = ifp->if_softc;
2337 struct ifreq *ifr = (struct ifreq *)data;
2338 int mask, error = 0;
2342 if (ifr->ifr_mtu > HN_MTU_MAX) {
2349 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2354 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2355 /* Can't change MTU */
2361 if (ifp->if_mtu == ifr->ifr_mtu) {
2367 * Suspend this interface before the synthetic parts are detached.
2373 * Detach the synthetic parts, i.e. NVS and RNDIS.
2375 hn_synth_detach(sc);
2378 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2379 * with the new MTU setting.
2381 error = hn_synth_attach(sc, ifr->ifr_mtu);
2388 * Commit the requested MTU, after the synthetic parts
2389 * have been successfully attached.
2391 ifp->if_mtu = ifr->ifr_mtu;
2394 * Make sure that various parameters based on MTU are
2395 * still valid, after the MTU change.
2397 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2398 hn_set_chim_size(sc, sc->hn_chim_szmax);
2399 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2400 #if __FreeBSD_version >= 1100099
2401 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2402 HN_LRO_LENLIM_MIN(ifp))
2403 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2407 * All done! Resume the interface now.
2417 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2422 if (ifp->if_flags & IFF_UP) {
2423 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2425 * Caller might hold a mutex, e.g.
2426 * bpf; use busy-wait for the RNDIS reply.
2430 hn_rxfilter_config(sc);
2436 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2439 sc->hn_if_flags = ifp->if_flags;
2446 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2448 if (mask & IFCAP_TXCSUM) {
2449 ifp->if_capenable ^= IFCAP_TXCSUM;
2450 if (ifp->if_capenable & IFCAP_TXCSUM)
2451 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2453 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2455 if (mask & IFCAP_TXCSUM_IPV6) {
2456 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2457 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2458 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2460 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2463 /* TODO: flip RNDIS offload parameters for RXCSUM. */
2464 if (mask & IFCAP_RXCSUM)
2465 ifp->if_capenable ^= IFCAP_RXCSUM;
2467 /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2468 if (mask & IFCAP_RXCSUM_IPV6)
2469 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2472 if (mask & IFCAP_LRO)
2473 ifp->if_capenable ^= IFCAP_LRO;
2475 if (mask & IFCAP_TSO4) {
2476 ifp->if_capenable ^= IFCAP_TSO4;
2477 if (ifp->if_capenable & IFCAP_TSO4)
2478 ifp->if_hwassist |= CSUM_IP_TSO;
2480 ifp->if_hwassist &= ~CSUM_IP_TSO;
2482 if (mask & IFCAP_TSO6) {
2483 ifp->if_capenable ^= IFCAP_TSO6;
2484 if (ifp->if_capenable & IFCAP_TSO6)
2485 ifp->if_hwassist |= CSUM_IP6_TSO;
2487 ifp->if_hwassist &= ~CSUM_IP6_TSO;
2497 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2501 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2503 * Multicast uses mutex; use busy-wait for the RNDIS reply.
2507 hn_rxfilter_config(sc);
2516 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2520 error = ether_ioctl(ifp, cmd, data);
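/*
 * Usage note (illustrative, assuming the interface is hn0): an MTU
 * change such as
 *
 *	ifconfig hn0 mtu 4096
 *
 * arrives here as SIOCSIFMTU and walks the suspend -> synthetic
 * detach -> synthetic reattach -> resume sequence above, since the
 * new MTU is handed to hn_synth_attach() at reattach time.
 */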
2527 hn_stop(struct hn_softc *sc)
2529 struct ifnet *ifp = sc->hn_ifp;
2534 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2535 ("synthetic parts were not attached"));
2537 /* Disable polling. */
2540 /* Clear RUNNING bit _before_ hn_suspend_data() */
2541 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2542 hn_suspend_data(sc);
2544 /* Clear OACTIVE bit. */
2545 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2546 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2547 sc->hn_tx_ring[i].hn_oactive = 0;
2551 hn_init_locked(struct hn_softc *sc)
2553 struct ifnet *ifp = sc->hn_ifp;
2558 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2561 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2564 /* Configure RX filter */
2565 hn_rxfilter_config(sc);
2567 /* Clear OACTIVE bit. */
2568 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2569 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2570 sc->hn_tx_ring[i].hn_oactive = 0;
2572 /* Clear TX 'suspended' bit. */
2573 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2575 /* Everything is ready; unleash! */
2576 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2578 /* Re-enable polling if requested. */
2579 if (sc->hn_pollhz > 0)
2580 hn_polling(sc, sc->hn_pollhz);
2586 struct hn_softc *sc = xsc;
2593 #if __FreeBSD_version >= 1100099
2596 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2598 struct hn_softc *sc = arg1;
2599 unsigned int lenlim;
2602 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2603 error = sysctl_handle_int(oidp, &lenlim, 0, req);
2604 if (error || req->newptr == NULL)
2608 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2609 lenlim > TCP_LRO_LENGTH_MAX) {
2613 hn_set_lro_lenlim(sc, lenlim);
2620 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2622 struct hn_softc *sc = arg1;
2623 int ackcnt, error, i;
2626 * lro_ackcnt_lim is the append count limit;
2627 * +1 turns it into the aggregation limit.
2629 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2630 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2631 if (error || req->newptr == NULL)
2634 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2638 * Convert the aggregation limit back to the append count limit.
2643 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2644 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
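/*
 * Usage sketch (assuming unit 0): the handler above exposes the
 * aggregation limit, so e.g.
 *
 *	sysctl dev.hn.0.lro_ackcnt_lim=2
 *
 * stores 1 into every ring's lro_ackcnt_lim (the append count limit),
 * i.e. at most 2 pure ACKs are aggregated per LRO entry.
 */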
2652 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2654 struct hn_softc *sc = arg1;
2659 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2662 error = sysctl_handle_int(oidp, &on, 0, req);
2663 if (error || req->newptr == NULL)
2667 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2668 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2671 rxr->hn_trust_hcsum |= hcsum;
2673 rxr->hn_trust_hcsum &= ~hcsum;
2680 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2682 struct hn_softc *sc = arg1;
2683 int chim_size, error;
2685 chim_size = sc->hn_tx_ring[0].hn_chim_size;
2686 error = sysctl_handle_int(oidp, &chim_size, 0, req);
2687 if (error || req->newptr == NULL)
2690 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2694 hn_set_chim_size(sc, chim_size);
2699 #if __FreeBSD_version < 1100095
2701 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2703 struct hn_softc *sc = arg1;
2704 int ofs = arg2, i, error;
2705 struct hn_rx_ring *rxr;
2709 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2710 rxr = &sc->hn_rx_ring[i];
2711 stat += *((int *)((uint8_t *)rxr + ofs));
2714 error = sysctl_handle_64(oidp, &stat, 0, req);
2715 if (error || req->newptr == NULL)
2718 /* Zero out this stat. */
2719 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2720 rxr = &sc->hn_rx_ring[i];
2721 *((int *)((uint8_t *)rxr + ofs)) = 0;
2727 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2729 struct hn_softc *sc = arg1;
2730 int ofs = arg2, i, error;
2731 struct hn_rx_ring *rxr;
2735 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2736 rxr = &sc->hn_rx_ring[i];
2737 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2740 error = sysctl_handle_64(oidp, &stat, 0, req);
2741 if (error || req->newptr == NULL)
2744 /* Zero out this stat. */
2745 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2746 rxr = &sc->hn_rx_ring[i];
2747 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2755 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2757 struct hn_softc *sc = arg1;
2758 int ofs = arg2, i, error;
2759 struct hn_rx_ring *rxr;
2763 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2764 rxr = &sc->hn_rx_ring[i];
2765 stat += *((u_long *)((uint8_t *)rxr + ofs));
2768 error = sysctl_handle_long(oidp, &stat, 0, req);
2769 if (error || req->newptr == NULL)
2772 /* Zero out this stat. */
2773 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2774 rxr = &sc->hn_rx_ring[i];
2775 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
2781 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2783 struct hn_softc *sc = arg1;
2784 int ofs = arg2, i, error;
2785 struct hn_tx_ring *txr;
2789 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2790 txr = &sc->hn_tx_ring[i];
2791 stat += *((u_long *)((uint8_t *)txr + ofs));
2794 error = sysctl_handle_long(oidp, &stat, 0, req);
2795 if (error || req->newptr == NULL)
2798 /* Zero out this stat. */
2799 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2800 txr = &sc->hn_tx_ring[i];
2801 *((u_long *)((uint8_t *)txr + ofs)) = 0;
2807 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2809 struct hn_softc *sc = arg1;
2810 int ofs = arg2, i, error, conf;
2811 struct hn_tx_ring *txr;
2813 txr = &sc->hn_tx_ring[0];
2814 conf = *((int *)((uint8_t *)txr + ofs));
2816 error = sysctl_handle_int(oidp, &conf, 0, req);
2817 if (error || req->newptr == NULL)
2821 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2822 txr = &sc->hn_tx_ring[i];
2823 *((int *)((uint8_t *)txr + ofs)) = conf;
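/*
 * Illustrative sketch: the stat/conf handlers above reuse one function
 * for many fields by passing a byte offset into the per-ring structure
 * as arg2.  A hypothetical new counter (hn_some_stat is a placeholder)
 * would be wired up as:
 *
 *	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "some_stat",
 *	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 *	    __offsetof(struct hn_tx_ring, hn_some_stat),
 *	    hn_tx_stat_ulong_sysctl, "LU", "description");
 *
 * Reads sum the field across rings; writes zero it on every ring.
 */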
2831 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2833 struct hn_softc *sc = arg1;
2836 size = sc->hn_agg_size;
2837 error = sysctl_handle_int(oidp, &size, 0, req);
2838 if (error || req->newptr == NULL)
2842 sc->hn_agg_size = size;
2850 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
2852 struct hn_softc *sc = arg1;
2855 pkts = sc->hn_agg_pkts;
2856 error = sysctl_handle_int(oidp, &pkts, 0, req);
2857 if (error || req->newptr == NULL)
2861 sc->hn_agg_pkts = pkts;
2869 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
2871 struct hn_softc *sc = arg1;
2874 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
2875 return (sysctl_handle_int(oidp, &pkts, 0, req));
2879 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
2881 struct hn_softc *sc = arg1;
2884 align = sc->hn_tx_ring[0].hn_agg_align;
2885 return (sysctl_handle_int(oidp, &align, 0, req));
2889 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
2892 vmbus_chan_poll_disable(chan);
2894 vmbus_chan_poll_enable(chan, pollhz);
2898 hn_polling(struct hn_softc *sc, u_int pollhz)
2900 int nsubch = sc->hn_rx_ring_inuse - 1;
2905 struct vmbus_channel **subch;
2908 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
2909 for (i = 0; i < nsubch; ++i)
2910 hn_chan_polling(subch[i], pollhz);
2911 vmbus_subchan_rel(subch, nsubch);
2913 hn_chan_polling(sc->hn_prichan, pollhz);
2917 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
2919 struct hn_softc *sc = arg1;
2922 pollhz = sc->hn_pollhz;
2923 error = sysctl_handle_int(oidp, &pollhz, 0, req);
2924 if (error || req->newptr == NULL)
2928 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
2932 if (sc->hn_pollhz != pollhz) {
2933 sc->hn_pollhz = pollhz;
2934 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
2935 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
2936 hn_polling(sc, sc->hn_pollhz);
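/*
 * Usage sketch (assuming unit 0, and assuming the handler above is
 * registered as dev.hn.0.polling; the registration is not shown here):
 *
 *	sysctl dev.hn.0.polling=10000	(poll the channels at 10000hz)
 *	sysctl dev.hn.0.polling=0	(return to interrupt mode)
 *
 * Out-of-range non-zero values are rejected above; zero disables
 * polling through vmbus_chan_poll_disable().
 */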
2944 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2946 struct hn_softc *sc = arg1;
2949 snprintf(verstr, sizeof(verstr), "%u.%u",
2950 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2951 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2952 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2956 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2958 struct hn_softc *sc = arg1;
2965 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2966 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2970 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2972 struct hn_softc *sc = arg1;
2973 char assist_str[128];
2977 hwassist = sc->hn_ifp->if_hwassist;
2979 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2980 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2984 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2986 struct hn_softc *sc = arg1;
2987 char filter_str[128];
2991 filter = sc->hn_rx_filter;
2993 snprintf(filter_str, sizeof(filter_str), "%b", filter,
2995 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
2999 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
3001 struct hn_softc *sc = arg1;
3006 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3007 if (error || req->newptr == NULL)
3010 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3013 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3015 if (sc->hn_rx_ring_inuse > 1) {
3016 error = hn_rss_reconfig(sc);
3018 /* Not RSS capable, at least for now; just save the RSS key. */
3027 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
3029 struct hn_softc *sc = arg1;
3034 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3035 if (error || req->newptr == NULL)
3039 * Don't allow RSS indirect table changes if this interface is
3040 * not currently RSS capable.
3042 if (sc->hn_rx_ring_inuse == 1) {
3047 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3050 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3052 hn_rss_ind_fixup(sc);
3053 error = hn_rss_reconfig(sc);
3060 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
3062 struct hn_softc *sc = arg1;
3067 hash = sc->hn_rss_hash;
3069 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
3070 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
3074 hn_check_iplen(const struct mbuf *m, int hoff)
3076 const struct ip *ip;
3077 int len, iphlen, iplen;
3078 const struct tcphdr *th;
3079 int thoff; /* TCP data offset */
3081 len = hoff + sizeof(struct ip);
3083 /* The packet must be at least the size of an IP header. */
3084 if (m->m_pkthdr.len < len)
3085 return IPPROTO_DONE;
3087 /* The fixed IP header must reside completely in the first mbuf. */
3089 return IPPROTO_DONE;
3091 ip = mtodo(m, hoff);
3093 /* Bound check the packet's stated IP header length. */
3094 iphlen = ip->ip_hl << 2;
3095 if (iphlen < sizeof(struct ip)) /* minimum header length */
3096 return IPPROTO_DONE;
3098 /* The full IP header must reside completely in the one mbuf. */
3099 if (m->m_len < hoff + iphlen)
3100 return IPPROTO_DONE;
3102 iplen = ntohs(ip->ip_len);
3105 * Check that the amount of data in the buffers is at
3106 * least as much as the IP header would have us expect.
3108 if (m->m_pkthdr.len < hoff + iplen)
3109 return IPPROTO_DONE;
3112 * Ignore IP fragments.
3114 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3115 return IPPROTO_DONE;
3118 * The TCP/IP or UDP/IP header must be entirely contained within
3119 * the first fragment of a packet.
3123 if (iplen < iphlen + sizeof(struct tcphdr))
3124 return IPPROTO_DONE;
3125 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3126 return IPPROTO_DONE;
3127 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3128 thoff = th->th_off << 2;
3129 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3130 return IPPROTO_DONE;
3131 if (m->m_len < hoff + iphlen + thoff)
3132 return IPPROTO_DONE;
3135 if (iplen < iphlen + sizeof(struct udphdr))
3136 return IPPROTO_DONE;
3137 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3138 return IPPROTO_DONE;
3142 return IPPROTO_DONE;
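/*
 * Worked example (illustrative): for an untagged Ethernet frame, hoff
 * is sizeof(struct ether_header), i.e. 14.  A minimal TCP segment must
 * then have at least 14 + 20 (IP) + 20 (TCP) = 54 bytes resident in
 * the first mbuf before IPPROTO_TCP is returned; any shorter or
 * fragmented packet yields IPPROTO_DONE, and hn_rxpkt() will not trust
 * the host-side checksum for it.
 */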
3149 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3151 struct sysctl_oid_list *child;
3152 struct sysctl_ctx_list *ctx;
3153 device_t dev = sc->hn_dev;
3154 #if defined(INET) || defined(INET6)
3155 #if __FreeBSD_version >= 1100095
3162 * Create RXBUF for reception.
3165 * - It is shared by all channels.
3166 * - A large enough buffer is allocated; certain versions of NVS
3167 * may further limit the usable space.
3169 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3170 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3171 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3172 if (sc->hn_rxbuf == NULL) {
3173 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3177 sc->hn_rx_ring_cnt = ring_cnt;
3178 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3180 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3181 M_DEVBUF, M_WAITOK | M_ZERO);
3183 #if defined(INET) || defined(INET6)
3184 #if __FreeBSD_version >= 1100095
3185 lroent_cnt = hn_lro_entry_count;
3186 if (lroent_cnt < TCP_LRO_ENTRIES)
3187 lroent_cnt = TCP_LRO_ENTRIES;
3189 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3191 #endif /* INET || INET6 */
3193 ctx = device_get_sysctl_ctx(dev);
3194 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3196 /* Create dev.hn.UNIT.rx sysctl tree */
3197 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3198 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3200 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3201 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3203 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3204 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3205 &rxr->hn_br_dma, BUS_DMA_WAITOK);
3206 if (rxr->hn_br == NULL) {
3207 device_printf(dev, "allocate bufring failed\n");
3211 if (hn_trust_hosttcp)
3212 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3213 if (hn_trust_hostudp)
3214 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3215 if (hn_trust_hostip)
3216 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3217 rxr->hn_ifp = sc->hn_ifp;
3218 if (i < sc->hn_tx_ring_cnt)
3219 rxr->hn_txr = &sc->hn_tx_ring[i];
3220 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3221 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3223 rxr->hn_rxbuf = sc->hn_rxbuf;
3228 #if defined(INET) || defined(INET6)
3229 #if __FreeBSD_version >= 1100095
3230 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3231 hn_lro_mbufq_depth);
3233 tcp_lro_init(&rxr->hn_lro);
3234 rxr->hn_lro.ifp = sc->hn_ifp;
3236 #if __FreeBSD_version >= 1100099
3237 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3238 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3240 #endif /* INET || INET6 */
3242 if (sc->hn_rx_sysctl_tree != NULL) {
3246 * Create per RX ring sysctl tree:
3247 * dev.hn.UNIT.rx.RINGID
3249 snprintf(name, sizeof(name), "%d", i);
3250 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3251 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3252 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3254 if (rxr->hn_rx_sysctl_tree != NULL) {
3255 SYSCTL_ADD_ULONG(ctx,
3256 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3257 OID_AUTO, "packets", CTLFLAG_RW,
3258 &rxr->hn_pkts, "# of packets received");
3259 SYSCTL_ADD_ULONG(ctx,
3260 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3261 OID_AUTO, "rss_pkts", CTLFLAG_RW,
3263 "# of packets w/ RSS info received");
3265 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3266 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3267 &rxr->hn_pktbuf_len, 0,
3268 "Temporary channel packet buffer length");
3273 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3274 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3275 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3276 #if __FreeBSD_version < 1100095
3277 hn_rx_stat_int_sysctl,
3279 hn_rx_stat_u64_sysctl,
3281 "LU", "LRO queued");
3282 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3283 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3284 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3285 #if __FreeBSD_version < 1100095
3286 hn_rx_stat_int_sysctl,
3288 hn_rx_stat_u64_sysctl,
3290 "LU", "LRO flushed");
3291 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3292 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3293 __offsetof(struct hn_rx_ring, hn_lro_tried),
3294 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3295 #if __FreeBSD_version >= 1100099
3296 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3297 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3298 hn_lro_lenlim_sysctl, "IU",
3299 "Max # of data bytes to be aggregated by LRO");
3300 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3301 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3302 hn_lro_ackcnt_sysctl, "I",
3303 "Max # of ACKs to be aggregated by LRO");
3305 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3306 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3307 hn_trust_hcsum_sysctl, "I",
3308 "Trust tcp segement verification on host side, "
3309 "when csum info is missing");
3310 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3311 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3312 hn_trust_hcsum_sysctl, "I",
3313 "Trust udp datagram verification on host side, "
3314 "when csum info is missing");
3315 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3316 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3317 hn_trust_hcsum_sysctl, "I",
3318 "Trust ip packet verification on host side, "
3319 "when csum info is missing");
3320 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3321 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3322 __offsetof(struct hn_rx_ring, hn_csum_ip),
3323 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3324 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3325 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3326 __offsetof(struct hn_rx_ring, hn_csum_tcp),
3327 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3328 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3329 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3330 __offsetof(struct hn_rx_ring, hn_csum_udp),
3331 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3332 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3333 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3334 __offsetof(struct hn_rx_ring, hn_csum_trusted),
3335 hn_rx_stat_ulong_sysctl, "LU",
3336 "# of packets that we trust host's csum verification");
3337 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3338 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3339 __offsetof(struct hn_rx_ring, hn_small_pkts),
3340 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3341 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3342 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3343 __offsetof(struct hn_rx_ring, hn_ack_failed),
3344 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3345 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3346 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3347 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3348 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
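/*
 * Resulting sysctl layout (illustrative, assuming unit 0):
 *
 *	dev.hn.0.rx_ring_cnt		# of created RX rings
 *	dev.hn.0.trust_hosttcp		host TCP csum trust knob
 *	dev.hn.0.rx.0.packets		ring 0 packet counter
 *
 * e.g. `sysctl dev.hn.0.rx.0.packets' reads (and, being CTLFLAG_RW,
 * allows resetting) the per-ring counter.
 */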
3354 hn_destroy_rx_data(struct hn_softc *sc)
3358 if (sc->hn_rxbuf != NULL) {
3359 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3360 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3362 device_printf(sc->hn_dev, "RXBUF is referenced\n");
3363 sc->hn_rxbuf = NULL;
3366 if (sc->hn_rx_ring_cnt == 0)
3369 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3370 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3372 if (rxr->hn_br == NULL)
3374 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3375 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3377 device_printf(sc->hn_dev,
3378 "%dth channel bufring is referenced", i);
3382 #if defined(INET) || defined(INET6)
3383 tcp_lro_free(&rxr->hn_lro);
3385 free(rxr->hn_pktbuf, M_DEVBUF);
3387 free(sc->hn_rx_ring, M_DEVBUF);
3388 sc->hn_rx_ring = NULL;
3390 sc->hn_rx_ring_cnt = 0;
3391 sc->hn_rx_ring_inuse = 0;
3395 hn_tx_ring_create(struct hn_softc *sc, int id)
3397 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3398 device_t dev = sc->hn_dev;
3399 bus_dma_tag_t parent_dtag;
3403 txr->hn_tx_idx = id;
3405 #ifndef HN_USE_TXDESC_BUFRING
3406 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3408 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3410 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3411 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3412 M_DEVBUF, M_WAITOK | M_ZERO);
3413 #ifndef HN_USE_TXDESC_BUFRING
3414 SLIST_INIT(&txr->hn_txlist);
3416 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3417 M_WAITOK, &txr->hn_tx_lock);
3420 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3421 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3422 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3424 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3427 #ifdef HN_IFSTART_SUPPORT
3428 if (hn_use_if_start) {
3429 txr->hn_txeof = hn_start_txeof;
3430 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3431 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3437 txr->hn_txeof = hn_xmit_txeof;
3438 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3439 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3441 br_depth = hn_get_txswq_depth(txr);
3442 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3443 M_WAITOK, &txr->hn_tx_lock);
3446 txr->hn_direct_tx_size = hn_direct_tx_size;
3449 * Always schedule transmission instead of trying to do direct
3450 * transmission. This one gives the best performance so far.
3452 txr->hn_sched_tx = 1;
3454 parent_dtag = bus_get_dma_tag(dev);
3456 /* DMA tag for RNDIS packet messages. */
3457 error = bus_dma_tag_create(parent_dtag, /* parent */
3458 HN_RNDIS_PKT_ALIGN, /* alignment */
3459 HN_RNDIS_PKT_BOUNDARY, /* boundary */
3460 BUS_SPACE_MAXADDR, /* lowaddr */
3461 BUS_SPACE_MAXADDR, /* highaddr */
3462 NULL, NULL, /* filter, filterarg */
3463 HN_RNDIS_PKT_LEN, /* maxsize */
3465 HN_RNDIS_PKT_LEN, /* maxsegsize */
3467 NULL, /* lockfunc */
3468 NULL, /* lockfuncarg */
3469 &txr->hn_tx_rndis_dtag);
3471 device_printf(dev, "failed to create rndis dmatag\n");
3475 /* DMA tag for data. */
3476 error = bus_dma_tag_create(parent_dtag, /* parent */
3478 HN_TX_DATA_BOUNDARY, /* boundary */
3479 BUS_SPACE_MAXADDR, /* lowaddr */
3480 BUS_SPACE_MAXADDR, /* highaddr */
3481 NULL, NULL, /* filter, filterarg */
3482 HN_TX_DATA_MAXSIZE, /* maxsize */
3483 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
3484 HN_TX_DATA_SEGSIZE, /* maxsegsize */
3486 NULL, /* lockfunc */
3487 NULL, /* lockfuncarg */
3488 &txr->hn_tx_data_dtag);
3490 device_printf(dev, "failed to create data dmatag\n");
3494 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3495 struct hn_txdesc *txd = &txr->hn_txdesc[i];
3498 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3499 STAILQ_INIT(&txd->agg_list);
3502 * Allocate and load RNDIS packet message.
3504 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3505 (void **)&txd->rndis_pkt,
3506 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3507 &txd->rndis_pkt_dmap);
3510 "failed to allocate rndis_packet_msg, %d\n", i);
3514 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3515 txd->rndis_pkt_dmap,
3516 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3517 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3521 "failed to load rndis_packet_msg, %d\n", i);
3522 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3523 txd->rndis_pkt, txd->rndis_pkt_dmap);
3527 /* DMA map for TX data. */
3528 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3532 "failed to allocate tx data dmamap\n");
3533 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3534 txd->rndis_pkt_dmap);
3535 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3536 txd->rndis_pkt, txd->rndis_pkt_dmap);
3540 /* All set; put it on the list. */
3541 txd->flags |= HN_TXD_FLAG_ONLIST;
3542 #ifndef HN_USE_TXDESC_BUFRING
3543 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3545 buf_ring_enqueue(txr->hn_txdesc_br, txd);
3548 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3550 if (sc->hn_tx_sysctl_tree != NULL) {
3551 struct sysctl_oid_list *child;
3552 struct sysctl_ctx_list *ctx;
3556 * Create per TX ring sysctl tree:
3557 * dev.hn.UNIT.tx.RINGID
3559 ctx = device_get_sysctl_ctx(dev);
3560 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3562 snprintf(name, sizeof(name), "%d", id);
3563 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3564 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3566 if (txr->hn_tx_sysctl_tree != NULL) {
3567 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3570 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3571 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3572 "# of available TX descs");
3574 #ifdef HN_IFSTART_SUPPORT
3575 if (!hn_use_if_start)
3578 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3579 CTLFLAG_RD, &txr->hn_oactive, 0,
3582 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3583 CTLFLAG_RW, &txr->hn_pkts,
3584 "# of packets transmitted");
3585 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3586 CTLFLAG_RW, &txr->hn_sends, "# of sends");
3594 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3596 struct hn_tx_ring *txr = txd->txr;
3598 KASSERT(txd->m == NULL, ("still has mbuf installed"));
3599 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3601 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3602 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3603 txd->rndis_pkt_dmap);
3604 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3608 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3611 KASSERT(txd->refs == 0 || txd->refs == 1,
3612 ("invalid txd refs %d", txd->refs));
3614 /* Aggregated txds will be freed by their aggregating txd. */
3615 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3618 freed = hn_txdesc_put(txr, txd);
3619 KASSERT(freed, ("can't free txdesc"));
3624 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3628 if (txr->hn_txdesc == NULL)
3633 * Because the freeing of aggregated txds will be deferred
3634 * to the aggregating txd, two passes are used here:
3635 * - The first pass GCes any pending txds. This GC is necessary,
3636 * since if the channels are revoked, the hypervisor will not
3637 * deliver send-done for all pending txds.
3638 * - The second pass frees the busdma resources, i.e. after all txds are freed.
3641 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3642 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3643 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3644 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3646 if (txr->hn_tx_data_dtag != NULL)
3647 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3648 if (txr->hn_tx_rndis_dtag != NULL)
3649 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3651 #ifdef HN_USE_TXDESC_BUFRING
3652 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3655 free(txr->hn_txdesc, M_DEVBUF);
3656 txr->hn_txdesc = NULL;
3658 if (txr->hn_mbuf_br != NULL)
3659 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3661 #ifndef HN_USE_TXDESC_BUFRING
3662 mtx_destroy(&txr->hn_txlist_spin);
3664 mtx_destroy(&txr->hn_tx_lock);
3668 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3670 struct sysctl_oid_list *child;
3671 struct sysctl_ctx_list *ctx;
3675 * Create TXBUF for chimney sending.
3677 * NOTE: It is shared by all channels.
3679 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3680 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3681 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3682 if (sc->hn_chim == NULL) {
3683 device_printf(sc->hn_dev, "allocate txbuf failed\n");
3687 sc->hn_tx_ring_cnt = ring_cnt;
3688 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3690 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3691 M_DEVBUF, M_WAITOK | M_ZERO);
3693 ctx = device_get_sysctl_ctx(sc->hn_dev);
3694 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3696 /* Create dev.hn.UNIT.tx sysctl tree */
3697 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3698 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3700 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3703 error = hn_tx_ring_create(sc, i);
3708 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3709 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3710 __offsetof(struct hn_tx_ring, hn_no_txdescs),
3711 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3712 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3713 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3714 __offsetof(struct hn_tx_ring, hn_send_failed),
3715 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
3716 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3717 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3718 __offsetof(struct hn_tx_ring, hn_txdma_failed),
3719 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
3720 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3721 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3722 __offsetof(struct hn_tx_ring, hn_flush_failed),
3723 hn_tx_stat_ulong_sysctl, "LU",
3724 "# of packet transmission aggregation flush failure");
3725 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3726 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3727 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3728 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3729 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3730 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3731 __offsetof(struct hn_tx_ring, hn_tx_chimney),
3732 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3733 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3734 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3735 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3736 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3737 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3738 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3739 "# of total TX descs");
3740 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3741 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3742 "Chimney send packet size upper boundary");
3743 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3744 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3745 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3746 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3747 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3748 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3749 hn_tx_conf_int_sysctl, "I",
3750 "Size of the packet for direct transmission");
3751 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3752 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3753 __offsetof(struct hn_tx_ring, hn_sched_tx),
3754 hn_tx_conf_int_sysctl, "I",
3755 "Always schedule transmission "
3756 "instead of doing direct transmission");
3757 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3758 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3759 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3760 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3761 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3762 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3763 "Applied packet transmission aggregation size");
3764 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3765 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3766 hn_txagg_pktmax_sysctl, "I",
3767 "Applied packet transmission aggregation packets");
3768 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3769 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3770 hn_txagg_align_sysctl, "I",
3771 "Applied packet transmission aggregation alignment");
3777 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3781 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3782 sc->hn_tx_ring[i].hn_chim_size = chim_size;
3786 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3788 struct ifnet *ifp = sc->hn_ifp;
3791 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3794 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3795 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3796 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3798 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3799 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3800 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3802 if (tso_maxlen < tso_minlen)
3803 tso_maxlen = tso_minlen;
3804 else if (tso_maxlen > IP_MAXPACKET)
3805 tso_maxlen = IP_MAXPACKET;
3806 if (tso_maxlen > sc->hn_ndis_tso_szmax)
3807 tso_maxlen = sc->hn_ndis_tso_szmax;
3808 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3810 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
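/*
 * Worked example (illustrative numbers): with hn_ndis_tso_sgmin == 2
 * and an MTU of 1500, tso_minlen is 3000.  A requested tso_maxlen of
 * IP_MAXPACKET (65535) is clamped to hn_ndis_tso_szmax, and the
 * Ethernet header overhead (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN =
 * 14 + 4 = 18 bytes) is subtracted before the value is published as
 * if_hw_tsomax.
 */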
3814 hn_fixup_tx_data(struct hn_softc *sc)
3816 uint64_t csum_assist;
3819 hn_set_chim_size(sc, sc->hn_chim_szmax);
3820 if (hn_tx_chimney_size > 0 &&
3821 hn_tx_chimney_size < sc->hn_chim_szmax)
3822 hn_set_chim_size(sc, hn_tx_chimney_size);
3825 if (sc->hn_caps & HN_CAP_IPCS)
3826 csum_assist |= CSUM_IP;
3827 if (sc->hn_caps & HN_CAP_TCP4CS)
3828 csum_assist |= CSUM_IP_TCP;
3829 if (sc->hn_caps & HN_CAP_UDP4CS)
3830 csum_assist |= CSUM_IP_UDP;
3831 if (sc->hn_caps & HN_CAP_TCP6CS)
3832 csum_assist |= CSUM_IP6_TCP;
3833 if (sc->hn_caps & HN_CAP_UDP6CS)
3834 csum_assist |= CSUM_IP6_UDP;
3835 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3836 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3838 if (sc->hn_caps & HN_CAP_HASHVAL) {
3840 * Support HASHVAL pktinfo on TX path.
3843 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3844 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3845 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3850 hn_destroy_tx_data(struct hn_softc *sc)
3854 if (sc->hn_chim != NULL) {
3855 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
3856 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3858 device_printf(sc->hn_dev,
3859 "chimney sending buffer is referenced");
3864 if (sc->hn_tx_ring_cnt == 0)
3867 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3868 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3870 free(sc->hn_tx_ring, M_DEVBUF);
3871 sc->hn_tx_ring = NULL;
3873 sc->hn_tx_ring_cnt = 0;
3874 sc->hn_tx_ring_inuse = 0;
3877 #ifdef HN_IFSTART_SUPPORT
3880 hn_start_taskfunc(void *xtxr, int pending __unused)
3882 struct hn_tx_ring *txr = xtxr;
3884 mtx_lock(&txr->hn_tx_lock);
3885 hn_start_locked(txr, 0);
3886 mtx_unlock(&txr->hn_tx_lock);
3890 hn_start_locked(struct hn_tx_ring *txr, int len)
3892 struct hn_softc *sc = txr->hn_sc;
3893 struct ifnet *ifp = sc->hn_ifp;
3896 KASSERT(hn_use_if_start,
3897 ("hn_start_locked is called, when if_start is disabled"));
3898 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3899 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3900 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3902 if (__predict_false(txr->hn_suspended))
3905 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
3909 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
3910 struct hn_txdesc *txd;
3911 struct mbuf *m_head;
3914 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
3918 if (len > 0 && m_head->m_pkthdr.len > len) {
3920 * This send could be time-consuming; let callers
3921 * dispatch this packet (and any following
3922 * packets) to the tx taskqueue.
3924 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3929 #if defined(INET6) || defined(INET)
3930 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3931 m_head = hn_tso_fixup(m_head);
3932 if (__predict_false(m_head == NULL)) {
3933 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3939 txd = hn_txdesc_get(txr);
3941 txr->hn_no_txdescs++;
3942 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3943 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3947 error = hn_encap(ifp, txr, txd, &m_head);
3949 /* Both txd and m_head are freed */
3950 KASSERT(txr->hn_agg_txd == NULL,
3951 ("encap failed w/ pending aggregating txdesc"));
3955 if (txr->hn_agg_pktleft == 0) {
3956 if (txr->hn_agg_txd != NULL) {
3957 KASSERT(m_head == NULL,
3958 ("pending mbuf for aggregating txdesc"));
3959 error = hn_flush_txagg(ifp, txr);
3960 if (__predict_false(error)) {
3961 atomic_set_int(&ifp->if_drv_flags,
3966 KASSERT(m_head != NULL, ("mbuf was freed"));
3967 error = hn_txpkt(ifp, txr, txd);
3968 if (__predict_false(error)) {
3969 /* txd is freed, but m_head is not */
3970 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3971 atomic_set_int(&ifp->if_drv_flags,
3979 KASSERT(txr->hn_agg_txd != NULL,
3980 ("no aggregating txdesc"));
3981 KASSERT(m_head == NULL,
3982 ("pending mbuf for aggregating txdesc"));
3987 /* Flush pending aggregated transmission. */
3988 if (txr->hn_agg_txd != NULL)
3989 hn_flush_txagg(ifp, txr);
3994 hn_start(struct ifnet *ifp)
3996 struct hn_softc *sc = ifp->if_softc;
3997 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
3999 if (txr->hn_sched_tx)
4002 if (mtx_trylock(&txr->hn_tx_lock)) {
4005 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4006 mtx_unlock(&txr->hn_tx_lock);
4011 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4015 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
4017 struct hn_tx_ring *txr = xtxr;
4019 mtx_lock(&txr->hn_tx_lock);
4020 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
4021 hn_start_locked(txr, 0);
4022 mtx_unlock(&txr->hn_tx_lock);
4026 hn_start_txeof(struct hn_tx_ring *txr)
4028 struct hn_softc *sc = txr->hn_sc;
4029 struct ifnet *ifp = sc->hn_ifp;
4031 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4033 if (txr->hn_sched_tx)
4036 if (mtx_trylock(&txr->hn_tx_lock)) {
4039 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4040 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4041 mtx_unlock(&txr->hn_tx_lock);
4043 taskqueue_enqueue(txr->hn_tx_taskq,
4049 * Release OACTIVE earlier, in the hope that
4050 * others can catch up.  The task will clear the
4051 * flag again with the hn_tx_lock held to avoid possible races.
4054 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4055 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4059 #endif /* HN_IFSTART_SUPPORT */
4062 hn_xmit(struct hn_tx_ring *txr, int len)
4064 struct hn_softc *sc = txr->hn_sc;
4065 struct ifnet *ifp = sc->hn_ifp;
4066 struct mbuf *m_head;
4069 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4070 #ifdef HN_IFSTART_SUPPORT
4071 KASSERT(hn_use_if_start == 0,
4072 ("hn_xmit is called, when if_start is enabled"));
4074 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4076 if (__predict_false(txr->hn_suspended))
4079 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4082 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4083 struct hn_txdesc *txd;
4086 if (len > 0 && m_head->m_pkthdr.len > len) {
4088 * This send could be time-consuming; let callers
4089 * dispatch this packet (and any following
4090 * packets) to the tx taskqueue.
4092 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4097 txd = hn_txdesc_get(txr);
4099 txr->hn_no_txdescs++;
4100 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4101 txr->hn_oactive = 1;
4105 error = hn_encap(ifp, txr, txd, &m_head);
4107 /* Both txd and m_head are freed; discard */
4108 KASSERT(txr->hn_agg_txd == NULL,
4109 ("encap failed w/ pending aggregating txdesc"));
4110 drbr_advance(ifp, txr->hn_mbuf_br);
4114 if (txr->hn_agg_pktleft == 0) {
4115 if (txr->hn_agg_txd != NULL) {
4116 KASSERT(m_head == NULL,
4117 ("pending mbuf for aggregating txdesc"));
4118 error = hn_flush_txagg(ifp, txr);
4119 if (__predict_false(error)) {
4120 txr->hn_oactive = 1;
4124 KASSERT(m_head != NULL, ("mbuf was freed"));
4125 error = hn_txpkt(ifp, txr, txd);
4126 if (__predict_false(error)) {
4127 /* txd is freed, but m_head is not */
4128 drbr_putback(ifp, txr->hn_mbuf_br,
4130 txr->hn_oactive = 1;
4137 KASSERT(txr->hn_agg_txd != NULL,
4138 ("no aggregating txdesc"));
4139 KASSERT(m_head == NULL,
4140 ("pending mbuf for aggregating txdesc"));
4145 drbr_advance(ifp, txr->hn_mbuf_br);
4148 /* Flush pending aggregated transmission. */
4149 if (txr->hn_agg_txd != NULL)
4150 hn_flush_txagg(ifp, txr);
4155 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4157 struct hn_softc *sc = ifp->if_softc;
4158 struct hn_tx_ring *txr;
4161 #if defined(INET6) || defined(INET)
4163 * Perform TSO packet header fixup now, since the TSO
4164 * packet header should be cache-hot.
4166 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4167 m = hn_tso_fixup(m);
4168 if (__predict_false(m == NULL)) {
4169 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4176 * Select the TX ring based on flowid
4178 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
4179 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4180 txr = &sc->hn_tx_ring[idx];
4182 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4184 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4188 if (txr->hn_oactive)
4191 if (txr->hn_sched_tx)
4194 if (mtx_trylock(&txr->hn_tx_lock)) {
4197 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4198 mtx_unlock(&txr->hn_tx_lock);
4203 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
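/*
 * Illustrative example: with 4 TX rings in use, a flow hashed to
 * m_pkthdr.flowid == 4661 is pinned to ring 4661 % 4 == 1, so all
 * packets of a flow share one ring and keep their ordering; mbufs
 * without a hash type fall back to ring 0.
 */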
4208 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4212 mtx_lock(&txr->hn_tx_lock);
4213 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4215 mtx_unlock(&txr->hn_tx_lock);
4219 hn_xmit_qflush(struct ifnet *ifp)
4221 struct hn_softc *sc = ifp->if_softc;
4224 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4225 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4230 hn_xmit_txeof(struct hn_tx_ring *txr)
4233 if (txr->hn_sched_tx)
4236 if (mtx_trylock(&txr->hn_tx_lock)) {
4239 txr->hn_oactive = 0;
4240 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4241 mtx_unlock(&txr->hn_tx_lock);
4243 taskqueue_enqueue(txr->hn_tx_taskq,
4249 * Release oactive earlier, in the hope that
4250 * others can catch up.  The task will clear
4251 * oactive again with the hn_tx_lock held to avoid possible races.
4254 txr->hn_oactive = 0;
4255 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4260 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4262 struct hn_tx_ring *txr = xtxr;
4264 mtx_lock(&txr->hn_tx_lock);
4266 mtx_unlock(&txr->hn_tx_lock);
4270 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4272 struct hn_tx_ring *txr = xtxr;
4274 mtx_lock(&txr->hn_tx_lock);
4275 txr->hn_oactive = 0;
4277 mtx_unlock(&txr->hn_tx_lock);
4281 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4283 struct vmbus_chan_br cbr;
4284 struct hn_rx_ring *rxr;
4285 struct hn_tx_ring *txr = NULL;
4288 idx = vmbus_chan_subidx(chan);
4291 * Link this channel to RX/TX ring.
4293 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4294 ("invalid channel index %d, should > 0 && < %d",
4295 idx, sc->hn_rx_ring_inuse));
4296 rxr = &sc->hn_rx_ring[idx];
4297 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4298 ("RX ring %d already attached", idx));
4299 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4302 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4303 idx, vmbus_chan_id(chan));
4306 if (idx < sc->hn_tx_ring_inuse) {
4307 txr = &sc->hn_tx_ring[idx];
4308 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4309 ("TX ring %d already attached", idx));
4310 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4312 txr->hn_chan = chan;
4314 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4315 idx, vmbus_chan_id(chan));
4319 /* Bind this channel to a proper CPU. */
4320 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4325 cbr.cbr = rxr->hn_br;
4326 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4327 cbr.cbr_txsz = HN_TXBR_SIZE;
4328 cbr.cbr_rxsz = HN_RXBR_SIZE;
4329 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4331 if (error == EISCONN) {
4332 if_printf(sc->hn_ifp, "bufring is connected after "
4333 "chan%u open failure\n", vmbus_chan_id(chan));
4334 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4336 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4337 vmbus_chan_id(chan), error);
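/*
 * Layout note (for context): each channel's bufring is one DMA
 * allocation of HN_TXBR_SIZE + HN_RXBR_SIZE bytes (see
 * hn_create_rx_data()); the vmbus_chan_br descriptor above merely
 * tells the VMBus layer how that block is split between the TX and
 * RX halves.
 */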
4344 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4346 struct hn_rx_ring *rxr;
4349 idx = vmbus_chan_subidx(chan);
4352 * Unlink this channel from the RX/TX ring.
4354 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4355 ("invalid channel index %d, should > 0 && < %d",
4356 idx, sc->hn_rx_ring_inuse));
4357 rxr = &sc->hn_rx_ring[idx];
4358 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4359 ("RX ring %d is not attached", idx));
4360 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4362 if (idx < sc->hn_tx_ring_inuse) {
4363 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4365 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4366 ("TX ring %d is not attached attached", idx));
4367 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4371 * Close this channel.
4374 * Channel closing does _not_ destroy the target channel.
4376 error = vmbus_chan_close_direct(chan);
4377 if (error == EISCONN) {
4378 if_printf(sc->hn_ifp, "chan%u bufring is connected "
4379 "after being closed\n", vmbus_chan_id(chan));
4380 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4382 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4383 vmbus_chan_id(chan), error);
4388 hn_attach_subchans(struct hn_softc *sc)
4390 struct vmbus_channel **subchans;
4391 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4394 KASSERT(subchan_cnt > 0, ("no sub-channels"));
4396 /* Attach the sub-channels. */
4397 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4398 for (i = 0; i < subchan_cnt; ++i) {
4401 error1 = hn_chan_attach(sc, subchans[i]);
4404 /* Move on; all channels will be detached later. */
4407 vmbus_subchan_rel(subchans, subchan_cnt);
4410 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4413 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4421 hn_detach_allchans(struct hn_softc *sc)
4423 struct vmbus_channel **subchans;
4424 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4427 if (subchan_cnt == 0)
4430 /* Detach the sub-channels. */
4431 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4432 for (i = 0; i < subchan_cnt; ++i)
4433 hn_chan_detach(sc, subchans[i]);
4434 vmbus_subchan_rel(subchans, subchan_cnt);
4438 * Detach the primary channel, _after_ all sub-channels are detached.
4441 hn_chan_detach(sc, sc->hn_prichan);
4443 /* Wait for sub-channels to be destroyed, if any. */
4444 vmbus_subchan_drain(sc->hn_prichan);
4447 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4448 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4449 HN_RX_FLAG_ATTACHED) == 0,
4450 ("%dth RX ring is still attached", i));
4452 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4453 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4454 HN_TX_FLAG_ATTACHED) == 0,
4455 ("%dth TX ring is still attached", i));
4461 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4463 struct vmbus_channel **subchans;
4464 int nchan, rxr_cnt, error;
4466 nchan = *nsubch + 1;
4469 * Multiple RX/TX rings are not requested.
4476 * Query RSS capabilities, e.g. # of RX rings and # of indirect table entries.
4479 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4481 /* No RSS; this is benign. */
4486 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4490 if (nchan > rxr_cnt)
4493 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4499 * Allocate sub-channels from NVS.
4501 *nsubch = nchan - 1;
4502 error = hn_nvs_alloc_subchans(sc, nsubch);
4503 if (error || *nsubch == 0) {
4504 /* Failed to allocate sub-channels. */
4510 * Wait for all sub-channels to become ready before moving on.
4512 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4513 vmbus_subchan_rel(subchans, *nsubch);
4518 hn_synth_attachable(const struct hn_softc *sc)
4522 if (sc->hn_flags & HN_FLAG_ERRORS)
4525 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4526 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4528 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
4535 hn_synth_attach(struct hn_softc *sc, int mtu)
4537 #define ATTACHED_NVS 0x0002
4538 #define ATTACHED_RNDIS 0x0004
4540 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4541 int error, nsubch, nchan, i;
4542 uint32_t old_caps, attached = 0;
4544 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4545 ("synthetic parts were attached"));
4547 if (!hn_synth_attachable(sc))
4550 /* Save capabilities for later verification. */
4551 old_caps = sc->hn_caps;
4554 /* Clear RSS state. */
4555 sc->hn_rss_ind_size = 0;
4556 sc->hn_rss_hash = 0;
4559 * Attach the primary channel _before_ attaching NVS and RNDIS.
4561 error = hn_chan_attach(sc, sc->hn_prichan);
4568 error = hn_nvs_attach(sc, mtu);
4571 attached |= ATTACHED_NVS;
4574 * Attach RNDIS _after_ NVS is attached.
4576 error = hn_rndis_attach(sc, mtu);
4579 attached |= ATTACHED_RNDIS;
4582 * Make sure capabilities are not changed.
4584 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4585 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4586 old_caps, sc->hn_caps);
4592 * Allocate sub-channels for multi-TX/RX rings.
4595 * The # of RX rings that can be used is equivalent to the # of
4596 * channels to be requested.
4598 nsubch = sc->hn_rx_ring_cnt - 1;
4599 error = hn_synth_alloc_subchans(sc, &nsubch);
4602 /* NOTE: A _full_ detach of the synthetic parts is required from now on. */
4603 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4606 * Set the # of TX/RX rings that could be used according to
4607 * the # of channels that NVS offered.
4610 hn_set_ring_inuse(sc, nchan);
4612 /* Only the primary channel can be used; done */
4617 * Attach the sub-channels.
4619 * NOTE: hn_set_ring_inuse() _must_ have been called.
4621 error = hn_attach_subchans(sc);
4626 * Configure RSS key and indirect table _after_ all sub-channels
4629 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4631 * RSS key is not set yet; set it to the default RSS key.
4634 if_printf(sc->hn_ifp, "setup default RSS key\n");
4635 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4636 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4639 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4641 * RSS indirect table is not set yet; set it up in round-robin fashion.
4645 if_printf(sc->hn_ifp, "setup default RSS indirect "
4648 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
4649 rss->rss_ind[i] = i % nchan;
4650 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4653 * # of usable channels may be changed, so we have to
4654 * make sure that all entries in the RSS indirect table are valid.
4657 * NOTE: hn_set_ring_inuse() _must_ have been called.
4659 hn_rss_ind_fixup(sc);
4662 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4667 * Fixup transmission aggregation setup.
4673 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
4674 hn_synth_detach(sc);
4676 if (attached & ATTACHED_RNDIS)
4677 hn_rndis_detach(sc);
4678 if (attached & ATTACHED_NVS)
4680 hn_chan_detach(sc, sc->hn_prichan);
4681 /* Restore old capabilities. */
4682 sc->hn_caps = old_caps;
4686 #undef ATTACHED_RNDIS
4692 * The interface must have been suspended through hn_suspend(),
4693 * before this function gets called.
4696 hn_synth_detach(struct hn_softc *sc)
4699 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4700 ("synthetic parts were not attached"));
4702 /* Detach the RNDIS first. */
4703 hn_rndis_detach(sc);
4708 /* Detach all of the channels. */
4709 hn_detach_allchans(sc);
4711 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
static void
hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
{

	KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
	    ("invalid ring count %d", ring_cnt));

	if (sc->hn_tx_ring_cnt > ring_cnt)
		sc->hn_tx_ring_inuse = ring_cnt;
	else
		sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
	sc->hn_rx_ring_inuse = ring_cnt;

	if (bootverbose) {
		if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
		    sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
	}
}
static void
hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
{

	/*
	 * NOTE:
	 * The TX bufring will not be drained by the hypervisor,
	 * if the primary channel is revoked.
	 */
	while (!vmbus_chan_rx_empty(chan) ||
	    (!vmbus_chan_is_revoked(sc->hn_prichan) &&
	     !vmbus_chan_tx_empty(chan)))
		pause("waitch", 1);
	vmbus_chan_intr_drain(chan);
}
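/*
 * Suspend the data path in three steps: mark all in-use TX rings
 * suspended so that no new sends can be posted, clear the RX filter
 * so the host stops delivering packets, then drain the bufrings of
 * the sub-channels and the primary channel.
 */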
static void
hn_suspend_data(struct hn_softc *sc)
{
	struct vmbus_channel **subch = NULL;
	struct hn_tx_ring *txr;
	int i, nsubch;

	HN_LOCK_ASSERT(sc);

	/*
	 * Suspend TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 1;
		mtx_unlock(&txr->hn_tx_lock);
		/* No one is able to send more packets now. */

		/*
		 * Wait for all pending sends to finish.
		 *
		 * NOTE:
		 * We will _not_ receive all pending send-done, if the
		 * primary channel is revoked.
		 */
		while (hn_tx_ring_pending(txr) &&
		    !vmbus_chan_is_revoked(sc->hn_prichan))
			pause("hnwtx", 1 /* 1 tick */);
	}

	/*
	 * Disable RX by clearing RX filter.
	 */
	hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);

	/*
	 * Give RNDIS enough time to flush all pending data packets.
	 */
	pause("waitrx", (200 * hz) / 1000);

	/*
	 * Drain RX/TX bufrings and interrupts.
	 */
	nsubch = sc->hn_rx_ring_inuse - 1;
	if (nsubch > 0)
		subch = vmbus_subchan_get(sc->hn_prichan, nsubch);

	if (subch != NULL) {
		for (i = 0; i < nsubch; ++i)
			hn_chan_drain(sc, subch[i]);
	}
	hn_chan_drain(sc, sc->hn_prichan);

	if (subch != NULL)
		vmbus_subchan_rel(subch, nsubch);

	/*
	 * Drain any pending TX tasks.
	 *
	 * NOTE:
	 * The above hn_chan_drain() can dispatch TX tasks, so the TX
	 * tasks will have to be drained _after_ the above hn_chan_drain()
	 * calls.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		txr = &sc->hn_tx_ring[i];

		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
		taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}
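/*
 * hn_mgmt_taskq is the taskqueue through which management tasks,
 * e.g. link status and network change handling, are scheduled.
 * Clearing it from a task that runs on the primary channel's
 * taskqueue (see vmbus_chan_run_task() below) serializes against
 * the channel callback path, so no new management tasks can be
 * scheduled once that task has run.
 */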
static void
hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
{

	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
}

static void
hn_suspend_mgmt(struct hn_softc *sc)
{
	struct task task;

	HN_LOCK_ASSERT(sc);

	/*
	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
	 * through hn_mgmt_taskq.
	 */
	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
	vmbus_chan_run_task(sc->hn_prichan, &task);

	/*
	 * Make sure that all pending management tasks are completed.
	 */
	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
	taskqueue_drain_all(sc->hn_mgmt_taskq0);
}
static void
hn_suspend(struct hn_softc *sc)
{

	/* Disable polling. */
	hn_polling(sc, 0);

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
		hn_suspend_data(sc);
	hn_suspend_mgmt(sc);
}
static void
hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
{
	int i;

	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
	    ("invalid TX ring count %d", tx_ring_cnt));

	for (i = 0; i < tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 0;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
static void
hn_resume_data(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* Re-enable RX. */
	hn_rxfilter_config(sc);

	/*
	 * Make sure to clear suspend status on "all" TX rings,
	 * since hn_tx_ring_inuse can be changed after
	 * hn_suspend_data().
	 */
	hn_resume_tx(sc, sc->hn_tx_ring_cnt);

#ifdef HN_IFSTART_SUPPORT
	if (!hn_use_if_start)
#endif
	{
		/*
		 * Flush unused drbrs, since hn_tx_ring_inuse may be
		 * reduced.
		 */
		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
	}

	/* Kick start TX. */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		/*
		 * Use txeof task, so that any pending oactive can be
		 * cleared properly.
		 */
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}
static void
hn_resume_mgmt(struct hn_softc *sc)
{

	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;

	/*
	 * Kick off network change detection, if it was pending.
	 * If no network change was pending, start link status
	 * checks, which is more lightweight than network change
	 * detection.
	 */
	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		hn_change_network(sc);
	else
		hn_update_link_status(sc);
}
static void
hn_resume(struct hn_softc *sc)
{

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
		hn_resume_data(sc);
	hn_resume_mgmt(sc);

	/*
	 * Re-enable polling if this interface is running and
	 * the polling is requested.
	 */
	if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
		hn_polling(sc, sc->hn_pollhz);
}
static void
hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
{
	const struct rndis_status_msg *msg;
	int ofs;

	if (dlen < sizeof(*msg)) {
		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
		return;
	}
	msg = data;

	switch (msg->rm_status) {
	case RNDIS_STATUS_MEDIA_CONNECT:
	case RNDIS_STATUS_MEDIA_DISCONNECT:
		hn_update_link_status(sc);
		break;

	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
		/* Not really useful; ignore. */
		break;

	case RNDIS_STATUS_NETWORK_CHANGE:
		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
		if (dlen < ofs + msg->rm_stbuflen ||
		    msg->rm_stbuflen < sizeof(uint32_t)) {
			if_printf(sc->hn_ifp, "network changed\n");
		} else {
			uint32_t change;

			memcpy(&change, ((const uint8_t *)msg) + ofs,
			    sizeof(change));
			if_printf(sc->hn_ifp, "network changed, change %u\n",
			    change);
		}
		hn_change_network(sc);
		break;

	default:
		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
		    msg->rm_status);
		break;
	}
}
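/*
 * Roughly, each RNDIS per-packet-info element is laid out as:
 *
 *	+---------+---------+------------------+---------+
 *	| rm_size | rm_type | rm_pktinfooffset | rm_data |
 *	+---------+---------+------------------+---------+
 *	|<------------------- rm_size ------------------>|
 *
 * The payload starts rm_pktinfooffset bytes into the element, and
 * the next element follows rm_size bytes after the current one.
 */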
static int
hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
{
	const struct rndis_pktinfo *pi = info_data;
	uint32_t mask = 0;

	while (info_dlen != 0) {
		const void *data;
		uint32_t dlen;

		if (__predict_false(info_dlen < sizeof(*pi)))
			return (EINVAL);
		if (__predict_false(info_dlen < pi->rm_size))
			return (EINVAL);
		info_dlen -= pi->rm_size;

		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
			return (EINVAL);
		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
			return (EINVAL);
		dlen = pi->rm_size - pi->rm_pktinfooffset;
		data = pi->rm_data;

		switch (pi->rm_type) {
		case NDIS_PKTINFO_TYPE_VLAN:
			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
				return (EINVAL);
			info->vlan_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_VLAN;
			break;
		case NDIS_PKTINFO_TYPE_CSUM:
			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
				return (EINVAL);
			info->csum_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_CSUM;
			break;
		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
				return (EINVAL);
			info->hash_value = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHVAL;
			break;
		case HN_NDIS_PKTINFO_TYPE_HASHINF:
			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
				return (EINVAL);
			info->hash_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHINF;
			break;
		default:
			goto next;
		}

		if (mask == HN_RXINFO_ALL) {
			/* All found; done */
			break;
		}
next:
		pi = (const struct rndis_pktinfo *)
		    ((const uint8_t *)pi + pi->rm_size);
	}

	/*
	 * Final fixup.
	 * - If there is no hash value, invalidate the hash info.
	 */
	if ((mask & HN_RXINFO_HASHVAL) == 0)
		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
	return (0);
}
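/*
 * Returns false only if the ranges [off, off + len) and
 * [check_off, check_off + check_len) are provably disjoint; e.g.
 * off 0 len 16 against check_off 16 len 32 does not overlap, since
 * 0 + 16 <= 16.
 */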
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off < check_off) {
		if (__predict_true(off + len <= check_off))
			return (false);
	} else if (off > check_off) {
		if (__predict_true(check_off + check_len <= off))
			return (false);
	}
	return (true);
}
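/*
 * An RNDIS packet message carries up to three regions after its
 * header: the packet data, optional OOB data, and optional
 * per-packet-info, each described by an (offset, length) pair.
 * Every offset and length below is validated against rm_len and
 * against the other regions before anything is dereferenced.
 */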
static void
hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_packet_msg *pkt;
	struct hn_rxinfo info;
	int data_off, pktinfo_off, data_len, pktinfo_len;

	/*
	 * Check length.
	 */
	if (__predict_false(dlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
		return;
	}
	pkt = data;

	if (__predict_false(dlen < pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
		return;
	}
	if (__predict_false(pkt->rm_len <
	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
		    "msglen %u, data %u, oob %u, pktinfo %u\n",
		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
		    pkt->rm_pktinfolen);
		return;
	}
	if (__predict_false(pkt->rm_datalen == 0)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
		return;
	}

	/*
	 * Check offsets.
	 */
#define IS_OFFSET_INVALID(ofs)			\
	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

	/* XXX Hyper-V does not meet data offset alignment requirement */
	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data offset %u\n", pkt->rm_dataoffset);
		return;
	}
	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "oob offset %u\n", pkt->rm_oobdataoffset);
		return;
	}
	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
		return;
	}

#undef IS_OFFSET_INVALID

	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
	data_len = pkt->rm_datalen;
	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
	pktinfo_len = pkt->rm_pktinfolen;

	/*
	 * Check OOB coverage.
	 */
	if (__predict_false(pkt->rm_oobdatalen != 0)) {
		int oob_off, oob_len;

		if_printf(rxr->hn_ifp, "got oobdata\n");
		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
		oob_len = pkt->rm_oobdatalen;

		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overflow, msglen %u, oob abs %d len %d\n",
			    pkt->rm_len, oob_off, oob_len);
			return;
		}

		/*
		 * Check against data.
		 */
		if (hn_rndis_check_overlap(oob_off, oob_len,
		    data_off, data_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps data, oob abs %d len %d, "
			    "data abs %d len %d\n",
			    oob_off, oob_len, data_off, data_len);
			return;
		}

		/*
		 * Check against pktinfo.
		 */
		if (pktinfo_len != 0 &&
		    hn_rndis_check_overlap(oob_off, oob_len,
		    pktinfo_off, pktinfo_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps pktinfo, oob abs %d len %d, "
			    "pktinfo abs %d len %d\n",
			    oob_off, oob_len, pktinfo_off, pktinfo_len);
			return;
		}
	}

	/*
	 * Check per-packet-info coverage and find useful per-packet-info.
	 */
	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
	if (__predict_true(pktinfo_len != 0)) {
		bool overlap;
		int error;

		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overflow, msglen %u, "
			    "pktinfo abs %d len %d\n",
			    pkt->rm_len, pktinfo_off, pktinfo_len);
			return;
		}

		/*
		 * Check packet info coverage.
		 */
		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
		    data_off, data_len);
		if (__predict_false(overlap)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overlap data, pktinfo abs %d len %d, "
			    "data abs %d len %d\n",
			    pktinfo_off, pktinfo_len, data_off, data_len);
			return;
		}

		/*
		 * Find useful per-packet-info.
		 */
		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
		    pktinfo_len, &info);
		if (__predict_false(error)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
			    "pktinfo\n");
			return;
		}
	}

	if (__predict_false(data_off + data_len > pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data overflow, msglen %u, data abs %d len %d\n",
		    pkt->rm_len, data_off, data_len);
		return;
	}
	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
}
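/*
 * Dispatch an incoming RNDIS message on rm_type: data messages take
 * the hot RX path, status indications feed link state updates, and
 * everything else is handed to the RNDIS control path.
 */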
static __inline void
hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_msghdr *hdr;

	if (__predict_false(dlen < sizeof(*hdr))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
		return;
	}
	hdr = data;

	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
		/* Hot data path. */
		hn_rndis_rx_data(rxr, data, dlen);
		/* Done! */
		return;
	}

	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
	else
		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
}
static void
hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
{
	const struct hn_nvs_hdr *hdr;

	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
		if_printf(sc->hn_ifp, "invalid nvs notify\n");
		return;
	}
	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);

	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
		/* Useless; ignore */
		return;
	}
	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
}
static void
hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt)
{
	struct hn_nvs_sendctx *sndc;

	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
	    VMBUS_CHANPKT_DATALEN(pkt));
	/*
	 * NOTE:
	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
	 * its callback.
	 */
}
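/*
 * An RXBUF channel packet does not carry RNDIS messages inline; it
 * carries an array of (offset, length) ranges pointing into the
 * shared RX buffer (rxr->hn_rxbuf), one range per RNDIS message.
 */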
static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

	/* Make sure that this is a RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}

	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}

	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}
		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
	}

	/*
	 * Ack the consumed RXBUF associated w/ this channel packet,
	 * so that this RXBUF can be recycled by the hypervisor.
	 */
	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}
static void
hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    uint64_t tid)
{
	struct hn_nvs_rndis_ack ack;
	int retries, error;

	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
	ack.nvs_status = HN_NVS_STATUS_OK;

	retries = 0;
again:
	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
	if (__predict_false(error == EAGAIN)) {
		/*
		 * NOTE:
		 * This should _not_ happen in real world, since the
		 * consumption of the TX bufring from the TX path is
		 * controlled.
		 */
		if (rxr->hn_ack_failed == 0)
			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
		rxr->hn_ack_failed++;
		retries++;
		if (retries < 10) {
			DELAY(100);
			goto again;
		}
		/* RXBUF leaks! */
		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
	}
}
static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
	struct hn_rx_ring *rxr = xrxr;
	struct hn_softc *sc = rxr->hn_ifp->if_softc;

	for (;;) {
		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
		int error, pktlen;

		pktlen = rxr->hn_pktbuf_len;
		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
		if (__predict_false(error == ENOBUFS)) {
			void *nbuf;
			int nlen;

			/*
			 * Expand channel packet buffer.
			 *
			 * XXX
			 * Use M_WAITOK here, since allocation failure
			 * is fatal.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);

			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;

		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}
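/*
 * For illustration of the pktbuf expansion in hn_chan_callback()
 * above: with hn_pktbuf_len 4096 and a 9000-byte channel packet
 * pending, nlen starts at 2 * 4096 = 8192 and is doubled once more
 * to 16384 to cover the packet, before the buffer is reallocated
 * and the receive is retried.
 */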
static void
hn_tx_taskq_create(void *arg __unused)
{
	int i;

	/*
	 * Fix the # of TX taskqueues.
	 */
	if (hn_tx_taskq_cnt <= 0)
		hn_tx_taskq_cnt = 1;
	else if (hn_tx_taskq_cnt > mp_ncpus)
		hn_tx_taskq_cnt = mp_ncpus;

	/*
	 * Fix the TX taskqueue mode.
	 */
	switch (hn_tx_taskq_mode) {
	case HN_TX_TASKQ_M_INDEP:
	case HN_TX_TASKQ_M_GLOBAL:
	case HN_TX_TASKQ_M_EVTTQ:
		break;
	default:
		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
		break;
	}

	if (vm_guest != VM_GUEST_HV)
		return;

	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
		return;

	/* Create the global TX taskqueues. */
	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
	    M_DEVBUF, M_WAITOK);
	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
		    "hn tx%d", i);
	}
}
SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_create, NULL);
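/*
 * The taskqueue count and mode fixed up in hn_tx_taskq_create() are
 * presumably set through the hw.hn.tx_taskq_cnt and
 * hw.hn.tx_taskq_mode tunables defined earlier in this file; the
 * global TX taskqueues are only created when the mode is
 * HN_TX_TASKQ_M_GLOBAL and the system is running on Hyper-V.
 */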
static void
hn_tx_taskq_destroy(void *arg __unused)
{

	if (hn_tx_taskque != NULL) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(hn_tx_taskque[i]);
		free(hn_tx_taskque, M_DEVBUF);
	}
}
SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_destroy, NULL);