/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/ethernet.h>
#include <net/if_arp.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"

#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512
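
/*
 * Worst-case size of the RNDIS packet message this driver builds in
 * front of a payload: the fixed header plus the per-packet-info for
 * the hash value, VLAN tag, LSOv2 and TX checksum, i.e. every piece
 * of metadata the TX path below may attach.
 */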
121 #define HN_RNDIS_PKT_LEN \
122 (sizeof(struct rndis_packet_msg) + \
123 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \
124 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \
125 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \
126 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
127 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE
128 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)					\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)	sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)	sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)		sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK		(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK	(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
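
/*
 * Estimated on-the-wire footprint of a packet: the worst-case RNDIS
 * packet message prepended to the payload (or, for HN_PKTSIZE_MIN, to
 * a minimum Ethernet frame), rounded up to the chimney sending buffer
 * alignment.
 */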
#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0

static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);

static void			hn_stop(struct hn_softc *);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);

static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct vmbus_channel *);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static int			hn_set_rxfilter(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *, int);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on the host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust TCP segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on the host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust UDP datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on the host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust IP packet verification on host side, "
    "when csum info is missing (global setting)");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

/* Use shared TX taskqueue */
static int hn_share_tx_taskq = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN,
    &hn_share_tx_taskq, 0, "Enable shared TX taskqueue");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

/* Bind TX taskqueue to the target CPU */
static int hn_bind_tx_taskq = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
    &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue *hn_tx_taskq;	/* shared TX taskqueue */
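
/*
 * Default 40-byte Toeplitz RSS hash key.  This appears to be the same
 * well-known key that many RSS-capable drivers ship; it can be
 * overridden at runtime through the rss_key sysctl created below.
 */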
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif
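
/*
 * Two transmission paths exist: hn_txpkt_chim() sends a packet that
 * has already been copied into a pre-posted "chimney" sending buffer
 * slot by referencing the slot index, while hn_txpkt_sglist() hands
 * the host a scatter-gather list of guest physical address ranges.
 */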
static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
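
/*
 * Chimney sending buffer slots are tracked in a bitmap shared by all
 * TX rings.  hn_chim_alloc() scans for a clear bit with ffsl() and
 * claims it with atomic_testandset_long(), so no lock is needed even
 * though multiple rings may allocate concurrently; hn_chim_free()
 * simply clears the bit again.
 */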
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
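
/*
 * NDIS LSOv2 (TSO) expects the guest to prime the headers of a TSO
 * packet before handing it to the host: the protocol headers are
 * pulled up into the first mbuf and the TCP checksum is seeded with
 * the pseudo header checksum, for both the IPv4 and IPv6 cases below.
 */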
#if defined(INET6) || defined(INET)
/*
 * NOTE: If this function fails, the m_head will have been freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
#undef PULLUP_HDR
}
#endif	/* INET6 || INET */
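
/*
 * Translate the ifnet flags into an NDIS packet filter: promiscuous
 * mode maps to NDIS_PACKET_TYPE_PROMISCUOUS, otherwise directed plus
 * optional broadcast/all-multicast bits.  The filter is pushed to the
 * host through RNDIS only when it actually changes.
 */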
static int
hn_set_rxfilter(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (ifp->if_flags & IFF_PROMISC) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}
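
/*
 * Negotiate the effective TX aggregation limits: the user tunables
 * (hn_agg_size/hn_agg_pkts) are clamped by what RNDIS offered and by
 * the chimney sending buffer size, then the results are propagated
 * to every TX ring under its lock.
 */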
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}

static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}

static void
hn_rss_ind_fixup(struct hn_softc *sc, int nchan)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i;

	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}

static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
	0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};

static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
	    &g_net_vsc_device_type) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}

static void
hn_cpuset_setthread_task(void *xmask, int pending __unused)
{
	cpuset_t *mask = xmask;
	int error;

	error = cpuset_setthread(curthread->td_tid, mask);
	if (error) {
		panic("curthread=%ju: can't pin; error=%d",
		    (uintmax_t)curthread->td_tid, error);
	}
}

static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq == NULL) {
		sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
		    taskqueue_thread_enqueue, &sc->hn_tx_taskq);
		taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx",
		    device_get_nameunit(dev));
		if (hn_bind_tx_taskq >= 0) {
			int cpu = hn_bind_tx_taskq;
			struct task cpuset_task;
			cpuset_t cpu_set;

			if (cpu > mp_ncpus - 1)
				cpu = mp_ncpus - 1;
			CPU_SETOF(cpu, &cpu_set);
			TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task,
			    &cpu_set);
			taskqueue_enqueue(sc->hn_tx_taskq, &cpuset_task);
			taskqueue_drain(sc->hn_tx_taskq, &cpuset_task);
		}
	} else {
		sc->hn_tx_taskq = hn_tx_taskq;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions called after ether_ifattach().
	 */
	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed if an error happens later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is the same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only a limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fix up TX settings after the synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);

	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");

	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */
#ifdef __LP64__
	ifp->if_baudrate = IF_Gbps(10);
#else
	/* if_baudrate is 32 bits on 32-bit systems. */
	ifp->if_baudrate = IF_Gbps(1);
#endif
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
#ifdef foo
	/* We can't distinguish IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default; they can still
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	return (0);
failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}

static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp;

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * tasks have to be suspended manually here.
			 */
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);
		}
		HN_UNLOCK(sc);
		ether_ifdetach(ifp);
	}

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	if (sc->hn_tx_taskq != hn_tx_taskq)
		taskqueue_free(sc->hn_tx_taskq);
	taskqueue_free(sc->hn_mgmt_taskq0);

	if (sc->hn_xact != NULL)
		vmbus_xact_ctx_destroy(sc->hn_xact);

	if_free(ifp);

	HN_LOCK_DESTROY(sc);
	return (0);
}

static int
hn_shutdown(device_t dev)
{

	return (0);
}

static void
hn_link_status(struct hn_softc *sc)
{
	uint32_t link_status;
	int error;

	error = hn_rndis_get_linkstatus(sc, &link_status);
	if (error) {
		/* XXX what to do? */
		return;
	}

	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
	else
		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp,
	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
	    LINK_STATE_UP : LINK_STATE_DOWN);
}

static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		return;
	hn_link_status(sc);
}

static void
hn_netchg_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Prevent any link status checks from running. */
	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;

	/*
	 * Fake up a [link down --> link up] state change; a 5 second
	 * delay is used, which closely simulates the miibus reaction
	 * to a link down event.
	 */
	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
	    &sc->hn_netchg_status, 5 * hz);
}

static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Re-allow link status checks. */
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
	hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}

static int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m = *m_head;
	int error;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		struct mbuf *m_new;

		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		if (m_new == NULL)
			return ENOBUFS;
		else
			*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	}
	if (!error) {
		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
		    BUS_DMASYNC_PREWRITE);
		txd->flags |= HN_TXD_FLAG_DMAMAP;
	}
	return error;
}
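
/*
 * Release one reference on a txdesc.  On the last reference the
 * descriptor is torn down: any txdescs aggregated onto it are freed
 * first, then its chimney slot or DMA map is released, and finally it
 * goes back onto the per-ring free list (or buf_ring).
 */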
static int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return 0;

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("recursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
	txr->hn_txdesc_avail++;
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	atomic_add_int(&txr->hn_txdesc_avail, 1);
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif

	return 1;
}

static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	txd = SLIST_FIRST(&txr->hn_txlist);
	if (txd != NULL) {
		KASSERT(txr->hn_txdesc_avail > 0,
		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
		txr->hn_txdesc_avail--;
		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
	}
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

	if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
		KASSERT(txd->m == NULL && txd->refs == 0 &&
		    STAILQ_EMPTY(&txd->agg_list) &&
		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
		    txd->chim_size == 0 &&
		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
		txd->flags &= ~HN_TXD_FLAG_ONLIST;
		txd->refs = 1;
	}
	return txd;
}

static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

	/* 0->1 transition will never work */
	KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
	atomic_add_int(&txd->refs, 1);
}

static __inline void
hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
{

	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("recursive aggregation on aggregating txdesc"));

	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("already aggregated"));
	KASSERT(STAILQ_EMPTY(&txd->agg_list),
	    ("recursive aggregation on to-be-aggregated txdesc"));

	txd->flags |= HN_TXD_FLAG_ONAGG;
	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
}

static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
	bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
		pending = true;
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	if (!buf_ring_full(txr->hn_txdesc_br))
		pending = true;
#endif
	return (pending);
}

static __inline void
hn_txeof(struct hn_tx_ring *txr)
{

	txr->hn_has_txeof = 0;
	txr->hn_txeof(txr);
}

static void
hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
	struct hn_txdesc *txd = sndc->hn_cbarg;
	struct hn_tx_ring *txr;

	txr = txd->txr;
	KASSERT(txr->hn_chan == chan,
	    ("channel mismatch, on chan%u, should be chan%u",
	     vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan)));

	txr->hn_has_txeof = 1;
	hn_txdesc_put(txr, txd);

	++txr->hn_txdone_cnt;
	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
		txr->hn_txdone_cnt = 0;
		if (txr->hn_oactive)
			hn_txeof(txr);
	}
}

static void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
	struct lro_ctrl *lro = &rxr->hn_lro;
	struct lro_entry *queued;

	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
		SLIST_REMOVE_HEAD(&lro->lro_active, next);
		tcp_lro_flush(lro, queued);
	}
#endif

	/*
	 * NOTE:
	 * 'txr' could be NULL, if multiple channels and the
	 * ifnet.if_start method are enabled.
	 */
	if (txr == NULL || !txr->hn_has_txeof)
		return;

	txr->hn_txdone_cnt = 0;
	hn_txeof(txr);
}

static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
	    ("invalid RNDIS packet msg offset %u", ofs));
	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}
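
/*
 * Append one per-packet-info (PPI) record to an RNDIS packet message
 * under construction.  The PPI area only grows at its tail, so the
 * data offset and the total message length are simply bumped by the
 * record size; the caller receives a pointer to the record's data
 * area to fill in.
 */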
static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
{
	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
	struct rndis_pktinfo *pi;

	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

	/*
	 * Per-packet-info does not move; it only grows.
	 *
	 * NOTE:
	 * rm_pktinfooffset in this phase counts from the beginning
	 * of rndis_packet_msg.
	 */
	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
	    pkt->rm_pktinfolen);
	pkt->rm_pktinfolen += pi_size;

	pi->rm_size = pi_size;
	pi->rm_type = pi_type;
	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

	/* Data immediately follow per-packet-info. */
	pkt->rm_dataoffset += pi_size;

	/* Update RNDIS packet msg length */
	pkt->rm_len += pi_size;

	return (pi->rm_data);
}

static __inline int
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;
	struct mbuf *m;
	int error, pkts;

	txd = txr->hn_agg_txd;
	KASSERT(txd != NULL, ("no aggregate txdesc"));

	/*
	 * Since hn_txpkt() will reset this temporary stat, save
	 * it now, so that oerrors can be updated properly, if
	 * hn_txpkt() ever fails.
	 */
	pkts = txr->hn_stat_pkts;

	/*
	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
	 * failure, save it for later freeing, if hn_txpkt() ever
	 * fails.
	 */
	m = txd->m;
	error = hn_txpkt(ifp, txr, txd);
	if (__predict_false(error)) {
		/* txd is freed, but m is not. */
		m_freem(m);

		txr->hn_flush_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
	}

	/* Reset all aggregation states. */
	txr->hn_agg_txd = NULL;
	txr->hn_agg_szleft = 0;
	txr->hn_agg_pktleft = 0;
	txr->hn_agg_prevpkt = NULL;

	return (error);
}
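
/*
 * Decide how the given packet (pktsize bytes, RNDIS message included)
 * should be sent: if an aggregating txdesc is open and has room, the
 * packet is appended to it; otherwise a chimney sending buffer slot
 * is allocated and, when the aggregation limits allow, left open for
 * the packets that follow.  Returns a pointer into the chimney buffer
 * where the packet should be built, or NULL if no chimney slot is
 * available and the packet must go out through the sglist path.
 */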
static void *
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    int pktsize)
{
	void *chim;

	if (txr->hn_agg_txd != NULL) {
		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
			int olen;

			/*
			 * Update the previous RNDIS packet's total length,
			 * it can be increased due to the mandatory alignment
			 * padding for this RNDIS packet.  And update the
			 * aggregating txdesc's chimney sending buffer size
			 * accordingly.
			 *
			 * NOTE:
			 * Zero-out the padding, as required by the RNDIS spec.
			 */
			olen = pkt->rm_len;
			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
			agg_txd->chim_size += pkt->rm_len - olen;

			/* Link this txdesc to the parent. */
			hn_txdesc_agg(agg_txd, txd);

			chim = (uint8_t *)pkt + pkt->rm_len;
			/* Save the current packet for later fixup. */
			txr->hn_agg_prevpkt = chim;

			txr->hn_agg_pktleft--;
			txr->hn_agg_szleft -= pktsize;
			if (txr->hn_agg_szleft <=
			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
				/*
				 * Probably can't aggregate more packets,
				 * flush this aggregating txdesc proactively.
				 */
				txr->hn_agg_pktleft = 0;
			}
			return (chim);
		}
		hn_flush_txagg(ifp, txr);
	}
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	txr->hn_tx_chimney_tried++;
	txd->chim_index = hn_chim_alloc(txr->hn_sc);
	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
		return (NULL);
	txr->hn_tx_chimney++;

	chim = txr->hn_sc->hn_chim +
	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);

	if (txr->hn_agg_pktmax > 1 &&
	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
		txr->hn_agg_txd = txd;
		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
		txr->hn_agg_prevpkt = chim;
	}
	return (chim);
}
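
/*
 * Build the RNDIS packet message in front of an outgoing mbuf:
 * per-packet-info for the hash value, VLAN tag, LSOv2 or TX checksum
 * is appended as needed, then the packet is either copied into a
 * chimney sending buffer or DMA-loaded into a guest physical address
 * list for the sglist path.
 */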
/*
 * NOTE:
 * If this function fails, then both txd and m_head0 will be freed.
 */
static int
hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head0)
{
	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
	int error, nsegs, i;
	struct mbuf *m_head = *m_head0;
	struct rndis_packet_msg *pkt;
	uint32_t *pi_data;
	void *chim = NULL;
	int pkt_hlen, pkt_size;

	pkt = txd->rndis_pkt;
	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
	if (pkt_size < txr->hn_chim_size) {
		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
		if (chim != NULL)
			pkt = chim;
	} else {
		if (txr->hn_agg_txd != NULL)
			hn_flush_txagg(ifp, txr);
	}

	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
	pkt->rm_dataoffset = sizeof(*pkt);
	pkt->rm_datalen = m_head->m_pkthdr.len;
	pkt->rm_oobdataoffset = 0;
	pkt->rm_oobdatalen = 0;
	pkt->rm_oobdataelements = 0;
	pkt->rm_pktinfooffset = sizeof(*pkt);
	pkt->rm_pktinfolen = 0;
	pkt->rm_vchandle = 0;
	pkt->rm_reserved = 0;

	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
		/*
		 * Set the hash value for this packet, so that the host could
		 * dispatch the TX done event for this packet back to this TX
		 * ring's channel.
		 */
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
		*pi_data = txr->hn_tx_idx;
	}

	if (m_head->m_flags & M_VLANTAG) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
		*pi_data = NDIS_VLAN_INFO_MAKE(
		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
	}

	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
#ifdef INET
		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET6
		{
			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#endif	/* INET6 || INET */
	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
		if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
			*pi_data = NDIS_TXCSUM_INFO_IPV6;
		} else {
			*pi_data = NDIS_TXCSUM_INFO_IPV4;
			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
		}

		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
		else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP6_UDP))
			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
	}

	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
	/* Convert RNDIS packet message offsets */
	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);

	/*
	 * Fast path: Chimney sending.
	 */
	if (chim != NULL) {
		struct hn_txdesc *tgt_txd = txd;

		if (txr->hn_agg_txd != NULL)
			tgt_txd = txr->hn_agg_txd;

		KASSERT(pkt == chim,
		    ("RNDIS pkt not in chimney sending buffer"));
		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
		    ("chimney sending buffer is not used"));
		tgt_txd->chim_size += pkt->rm_len;

		m_copydata(m_head, 0, m_head->m_pkthdr.len,
		    ((uint8_t *)chim) + pkt_hlen);

		txr->hn_gpa_cnt = 0;
		txr->hn_sendpkt = hn_txpkt_chim;
		goto done;
	}

	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
	    ("chimney buffer is used"));
	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));

	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
	if (__predict_false(error)) {
		int freed;

		/*
		 * This mbuf is not linked w/ the txd yet, so free it now.
		 */
		m_freem(m_head);
		*m_head0 = NULL;

		freed = hn_txdesc_put(txr, txd);
		KASSERT(freed,
		    ("fail to free txd upon txdma error"));

		txr->hn_txdma_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
		return error;
	}
	*m_head0 = m_head;

	/* +1 RNDIS packet message */
	txr->hn_gpa_cnt = nsegs + 1;

	/* send packet with page buffer */
	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
	txr->hn_gpa[0].gpa_len = pkt_hlen;

	/*
	 * Fill the page buffers with mbuf info after the page
	 * buffer for RNDIS packet message.
	 */
	for (i = 0; i < nsegs; ++i) {
		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];

		gpa->gpa_page = atop(segs[i].ds_addr);
		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
		gpa->gpa_len = segs[i].ds_len;
	}

	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
	txd->chim_size = 0;
	txr->hn_sendpkt = hn_txpkt_sglist;
done:
	txd->m = m_head;

	/* Set the completion routine */
	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);

	/* Update temporary stats for later use. */
	txr->hn_stat_pkts++;
	txr->hn_stat_size += m_head->m_pkthdr.len;
	if (m_head->m_flags & M_MCAST)
		txr->hn_stat_mcasts++;

	return 0;
}

/*
 * NOTE:
 * If this function fails, then txd will be freed, but the mbuf
 * associated w/ the txd will _not_ be freed.
 */
static int
hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	int error, send_failed = 0;

again:
	/*
	 * Make sure that this txd and any aggregated txds are not freed
	 * before ETHER_BPF_MTAP.
	 */
	hn_txdesc_hold(txd);
	error = txr->hn_sendpkt(txr, txd);
	if (!error) {
		if (bpf_peers_present(ifp->if_bpf)) {
			const struct hn_txdesc *tmp_txd;

			ETHER_BPF_MTAP(ifp, txd->m);
			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
				ETHER_BPF_MTAP(ifp, tmp_txd->m);
		}

		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
#ifdef HN_IFSTART_SUPPORT
		if (!hn_use_if_start)
#endif
		{
			if_inc_counter(ifp, IFCOUNTER_OBYTES,
			    txr->hn_stat_size);
			if (txr->hn_stat_mcasts != 0) {
				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
				    txr->hn_stat_mcasts);
			}
		}
		txr->hn_pkts += txr->hn_stat_pkts;
	}
	hn_txdesc_put(txr, txd);

	if (__predict_false(error)) {
		int freed;

		/*
		 * This should "really rarely" happen.
		 *
		 * XXX Too many RX to be acked or too many sideband
		 * commands to run?  Ask netvsc_channel_rollup()
		 * to kick start later.
		 */
		txr->hn_has_txeof = 1;
		if (!send_failed) {
			txr->hn_send_failed++;
			send_failed = 1;
			/*
			 * Try sending again after setting hn_has_txeof,
			 * in case we missed the last
			 * netvsc_channel_rollup().
			 */
			goto again;
		}
		if_printf(ifp, "send failed\n");

		/*
		 * Caller will perform further processing on the
		 * associated mbuf, so don't free it in hn_txdesc_put();
		 * only unload it from the DMA map in hn_txdesc_put(),
		 * if it was loaded.
		 */
		txd->m = NULL;
		freed = hn_txdesc_put(txr, txd);
		KASSERT(freed,
		    ("fail to free txd upon send error"));

		txr->hn_send_failed++;
	}

	/* Reset temporary stats, after this sending is done. */
	txr->hn_stat_size = 0;
	txr->hn_stat_pkts = 0;
	txr->hn_stat_mcasts = 0;

	return (error);
}

/*
 * Append the specified data to the indicated mbuf chain.  Extend the
 * mbuf chain if the new data does not fit in existing space.
 *
 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
 * There should be an equivalent in the kernel mbuf code,
 * but there does not appear to be one yet.
 *
 * Differs from m_append() in that additional mbufs are
 * allocated with cluster size MJUMPAGESIZE, and filled
 * accordingly.
 *
 * Return 1 if able to complete the job; otherwise 0.
 */
static int
hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
{
	struct mbuf *m, *n;
	int remainder, space;

	for (m = m0; m->m_next != NULL; m = m->m_next)
		;
	remainder = len;
	space = M_TRAILINGSPACE(m);
	if (space > 0) {
		/*
		 * Copy into available space.
		 */
		if (space > remainder)
			space = remainder;
		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
		m->m_len += space;
		cp += space;
		remainder -= space;
	}
	while (remainder > 0) {
		/*
		 * Allocate a new mbuf; could check space
		 * and allocate a cluster instead.
		 */
		n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
		if (n == NULL)
			break;
		n->m_len = min(MJUMPAGESIZE, remainder);
		bcopy(cp, mtod(n, caddr_t), n->m_len);
		cp += n->m_len;
		remainder -= n->m_len;
		m->m_next = n;
		m = n;
	}
	if (m0->m_flags & M_PKTHDR)
		m0->m_pkthdr.len += len - remainder;

	return (remainder == 0);
}

#if defined(INET) || defined(INET6)
static int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
	if (hn_lro_mbufq_depth) {
		tcp_lro_queue_mbuf(lc, m);
		return 0;
	}
#endif
	return tcp_lro_rx(lc, m, 0);
}
#endif
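
/*
 * Turn one received RNDIS data packet into an mbuf and feed it to the
 * stack.  Small packets are copied into a plain mbuf, larger ones into
 * a 2K or 4K cluster; host-verified checksums are honored, and when
 * checksum info is missing the hw.hn.trust_host* tunables decide
 * whether the host's verdict is trusted anyway.
 */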
static int
hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
    const struct hn_rxinfo *info)
{
	struct ifnet *ifp = rxr->hn_ifp;
	struct mbuf *m_new;
	int size, do_lro = 0, do_csum = 1;
	int hash_type = M_HASHTYPE_OPAQUE;

	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
		return (0);

	/*
	 * Bail out if packet contains more data than the configured MTU.
	 */
	if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
		return (0);
	} else if (dlen <= MHLEN) {
		m_new = m_gethdr(M_NOWAIT, MT_DATA);
		if (m_new == NULL) {
			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
			return (ENOBUFS);
		}
		memcpy(mtod(m_new, void *), data, dlen);
		m_new->m_pkthdr.len = m_new->m_len = dlen;
		rxr->hn_small_pkts++;
	} else {
		/*
		 * Get an mbuf with a cluster.  For packets 2K or less,
		 * get a standard 2K cluster.  For anything larger, get a
		 * 4K cluster.  Any buffers larger than 4K can cause problems
		 * if looped around to the Hyper-V TX channel, so avoid them.
		 */
		size = MCLBYTES;
		if (dlen > MCLBYTES) {
			/* 4096 */
			size = MJUMPAGESIZE;
		}

		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
		if (m_new == NULL) {
			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
			return (ENOBUFS);
		}

		hv_m_append(m_new, dlen, data);
	}
	m_new->m_pkthdr.rcvif = ifp;

	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
		do_csum = 0;

	/* receive side checksum offload */
	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
		/* IP csum offload */
		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
			m_new->m_pkthdr.csum_flags |=
			    (CSUM_IP_CHECKED | CSUM_IP_VALID);
			rxr->hn_csum_ip++;
		}

		/* TCP/UDP csum offload */
		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
		     NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
			m_new->m_pkthdr.csum_flags |=
			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
			m_new->m_pkthdr.csum_data = 0xffff;
			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
				rxr->hn_csum_tcp++;
			else
				rxr->hn_csum_udp++;
		}

		/*
		 * XXX
		 * As of this writing (Oct 28th, 2016), the host side will
		 * turn on only TCPCS_OK and IPCS_OK even for UDP datagrams,
		 * so the do_lro setting here is actually _not_ accurate.
		 * We depend on the RSS hash type check to reset do_lro.
		 */
		if ((info->csum_info &
		     (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
			do_lro = 1;
	} else {
		const struct ether_header *eh;
		uint16_t etype;
		int hoff;

		hoff = sizeof(*eh);
		if (m_new->m_len < hoff)
			goto skip;
		eh = mtod(m_new, struct ether_header *);
		etype = ntohs(eh->ether_type);
		if (etype == ETHERTYPE_VLAN) {
			const struct ether_vlan_header *evl;

			hoff = sizeof(*evl);
			if (m_new->m_len < hoff)
				goto skip;
			evl = mtod(m_new, struct ether_vlan_header *);
			etype = ntohs(evl->evl_proto);
		}

		if (etype == ETHERTYPE_IP) {
			int pr;

			pr = hn_check_iplen(m_new, hoff);
			if (pr == IPPROTO_TCP) {
				if (do_csum &&
				    (rxr->hn_trust_hcsum &
				     HN_TRUST_HCSUM_TCP)) {
					rxr->hn_csum_trusted++;
					m_new->m_pkthdr.csum_flags |=
					    (CSUM_IP_CHECKED | CSUM_IP_VALID |
					     CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
					m_new->m_pkthdr.csum_data = 0xffff;
				}
				do_lro = 1;
			} else if (pr == IPPROTO_UDP) {
				if (do_csum &&
				    (rxr->hn_trust_hcsum &
				     HN_TRUST_HCSUM_UDP)) {
					rxr->hn_csum_trusted++;
					m_new->m_pkthdr.csum_flags |=
					    (CSUM_IP_CHECKED | CSUM_IP_VALID |
					     CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
					m_new->m_pkthdr.csum_data = 0xffff;
				}
			} else if (pr != IPPROTO_DONE && do_csum &&
			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
				rxr->hn_csum_trusted++;
				m_new->m_pkthdr.csum_flags |=
				    (CSUM_IP_CHECKED | CSUM_IP_VALID);
			}
		}
	}
skip:
2197 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2198 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2199 NDIS_VLAN_INFO_ID(info->vlan_info),
2200 NDIS_VLAN_INFO_PRI(info->vlan_info),
2201 NDIS_VLAN_INFO_CFI(info->vlan_info));
2202 m_new->m_flags |= M_VLANTAG;
2205 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2207 m_new->m_pkthdr.flowid = info->hash_value;
2208 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2209 NDIS_HASH_FUNCTION_TOEPLITZ) {
2210 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2214 * do_lro is reset if the hash types are not TCP
2215 * related.  See the comment in the csum_flags setup section above.
2219 case NDIS_HASH_IPV4:
2220 hash_type = M_HASHTYPE_RSS_IPV4;
2224 case NDIS_HASH_TCP_IPV4:
2225 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2228 case NDIS_HASH_IPV6:
2229 hash_type = M_HASHTYPE_RSS_IPV6;
2233 case NDIS_HASH_IPV6_EX:
2234 hash_type = M_HASHTYPE_RSS_IPV6_EX;
2238 case NDIS_HASH_TCP_IPV6:
2239 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2242 case NDIS_HASH_TCP_IPV6_EX:
2243 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2248 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2250 M_HASHTYPE_SET(m_new, hash_type);
2253 * Note: Moved RX completion back to hv_nv_on_receive() so all
2254 * messages (not just data messages) will trigger a response.
2260 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2261 #if defined(INET) || defined(INET6)
2262 struct lro_ctrl *lro = &rxr->hn_lro;
2265 rxr->hn_lro_tried++;
2266 if (hn_lro_rx(lro, m_new) == 0) {
2274 /* We're not holding the lock here, so don't release it */
2275 (*ifp->if_input)(ifp, m_new);
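/*
 * Illustrative sketch (not part of the original driver): how the flowid
 * and hash type recorded above are consumed later.  hn_transmit() below
 * uses the same modulo mapping; the helper name is hypothetical and the
 * block is compiled out.
 */
#if 0
static u_int
hn_flow_to_ring_example(const struct mbuf *m, u_int nring)
{
	/* Packets without flow classification fall back to ring 0. */
	if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE)
		return (0);
	return (m->m_pkthdr.flowid % nring);
}
#endif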
2281 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2283 struct hn_softc *sc = ifp->if_softc;
2284 struct ifreq *ifr = (struct ifreq *)data;
2285 int mask, error = 0;
2289 if (ifr->ifr_mtu > HN_MTU_MAX) {
2296 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2301 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2302 /* Can't change MTU */
2308 if (ifp->if_mtu == ifr->ifr_mtu) {
2314 * Suspend this interface before the synthetic parts are detached.
2320 * Detach the synthetic parts, i.e. NVS and RNDIS.
2322 hn_synth_detach(sc);
2325 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2326 * with the new MTU setting.
2328 error = hn_synth_attach(sc, ifr->ifr_mtu);
2335 * Commit the requested MTU, after the synthetic parts
2336 * have been successfully attached.
2338 ifp->if_mtu = ifr->ifr_mtu;
2341 * Make sure that various parameters based on MTU are
2342 * still valid after the MTU change.
2344 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2345 hn_set_chim_size(sc, sc->hn_chim_szmax);
2346 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2347 #if __FreeBSD_version >= 1100099
2348 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2349 HN_LRO_LENLIM_MIN(ifp))
2350 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2354 * All done! Resume the interface now.
2364 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2369 if (ifp->if_flags & IFF_UP) {
2370 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2372 * Caller might hold a mutex, e.g.
2373 * bpf; use busy-wait for the RNDIS reply.
2377 hn_set_rxfilter(sc);
2383 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2386 sc->hn_if_flags = ifp->if_flags;
2393 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2395 if (mask & IFCAP_TXCSUM) {
2396 ifp->if_capenable ^= IFCAP_TXCSUM;
2397 if (ifp->if_capenable & IFCAP_TXCSUM)
2398 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2400 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2402 if (mask & IFCAP_TXCSUM_IPV6) {
2403 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2404 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2405 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2407 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2410 /* TODO: flip RNDIS offload parameters for RXCSUM. */
2411 if (mask & IFCAP_RXCSUM)
2412 ifp->if_capenable ^= IFCAP_RXCSUM;
2414 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
2415 if (mask & IFCAP_RXCSUM_IPV6)
2416 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2419 if (mask & IFCAP_LRO)
2420 ifp->if_capenable ^= IFCAP_LRO;
2422 if (mask & IFCAP_TSO4) {
2423 ifp->if_capenable ^= IFCAP_TSO4;
2424 if (ifp->if_capenable & IFCAP_TSO4)
2425 ifp->if_hwassist |= CSUM_IP_TSO;
2427 ifp->if_hwassist &= ~CSUM_IP_TSO;
2429 if (mask & IFCAP_TSO6) {
2430 ifp->if_capenable ^= IFCAP_TSO6;
2431 if (ifp->if_capenable & IFCAP_TSO6)
2432 ifp->if_hwassist |= CSUM_IP6_TSO;
2434 ifp->if_hwassist &= ~CSUM_IP6_TSO;
2444 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2448 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2450 * Multicast uses a mutex; use busy-wait for the RNDIS reply.
2454 hn_set_rxfilter(sc);
2463 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2467 error = ether_ioctl(ifp, cmd, data);
2474 hn_stop(struct hn_softc *sc)
2476 struct ifnet *ifp = sc->hn_ifp;
2481 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2482 ("synthetic parts were not attached"));
2484 /* Clear RUNNING bit _before_ hn_suspend_data() */
2485 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2486 hn_suspend_data(sc);
2488 /* Clear OACTIVE bit. */
2489 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2490 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2491 sc->hn_tx_ring[i].hn_oactive = 0;
2495 hn_init_locked(struct hn_softc *sc)
2497 struct ifnet *ifp = sc->hn_ifp;
2502 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2505 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2508 /* Configure RX filter */
2509 hn_set_rxfilter(sc);
2511 /* Clear OACTIVE bit. */
2512 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2513 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2514 sc->hn_tx_ring[i].hn_oactive = 0;
2516 /* Clear TX 'suspended' bit. */
2517 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2519 /* Everything is ready; unleash! */
2520 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2526 struct hn_softc *sc = xsc;
2533 #if __FreeBSD_version >= 1100099
2536 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2538 struct hn_softc *sc = arg1;
2539 unsigned int lenlim;
2542 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2543 error = sysctl_handle_int(oidp, &lenlim, 0, req);
2544 if (error || req->newptr == NULL)
2548 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2549 lenlim > TCP_LRO_LENGTH_MAX) {
2553 hn_set_lro_lenlim(sc, lenlim);
2560 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2562 struct hn_softc *sc = arg1;
2563 int ackcnt, error, i;
2566 * lro_ackcnt_lim is the append count limit;
2567 * +1 turns it into the aggregation limit.
2569 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2570 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2571 if (error || req->newptr == NULL)
2574 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2578 * Convert the aggregation limit back to the append count limit.
2583 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2584 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2592 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2594 struct hn_softc *sc = arg1;
2599 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2602 error = sysctl_handle_int(oidp, &on, 0, req);
2603 if (error || req->newptr == NULL)
2607 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2608 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2611 rxr->hn_trust_hcsum |= hcsum;
2613 rxr->hn_trust_hcsum &= ~hcsum;
2620 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2622 struct hn_softc *sc = arg1;
2623 int chim_size, error;
2625 chim_size = sc->hn_tx_ring[0].hn_chim_size;
2626 error = sysctl_handle_int(oidp, &chim_size, 0, req);
2627 if (error || req->newptr == NULL)
2630 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2634 hn_set_chim_size(sc, chim_size);
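/*
 * Illustrative sketch (not part of the original driver): the shape
 * shared by the integer sysctl handlers in this file -- report the
 * current value, let sysctl_handle_int() run the request, bail on error
 * or on a read-only access (req->newptr == NULL), validate, then apply.
 * Compiled out; compare with hn_txagg_pkts_sysctl() below.
 */
#if 0
static int
hn_example_int_sysctl(SYSCTL_HANDLER_ARGS)
{
	struct hn_softc *sc = arg1;
	int val, error;

	val = sc->hn_agg_pkts;			/* report current value */
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || req->newptr == NULL)	/* error, or read-only */
		return (error);
	if (val < 0)				/* validate before applying */
		return (EINVAL);
	sc->hn_agg_pkts = val;
	return (0);
}
#endif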
2639 #if __FreeBSD_version < 1100095
2641 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2643 struct hn_softc *sc = arg1;
2644 int ofs = arg2, i, error;
2645 struct hn_rx_ring *rxr;
2649 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2650 rxr = &sc->hn_rx_ring[i];
2651 stat += *((int *)((uint8_t *)rxr + ofs));
2654 error = sysctl_handle_64(oidp, &stat, 0, req);
2655 if (error || req->newptr == NULL)
2658 /* Zero out this stat. */
2659 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2660 rxr = &sc->hn_rx_ring[i];
2661 *((int *)((uint8_t *)rxr + ofs)) = 0;
2667 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2669 struct hn_softc *sc = arg1;
2670 int ofs = arg2, i, error;
2671 struct hn_rx_ring *rxr;
2675 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2676 rxr = &sc->hn_rx_ring[i];
2677 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2680 error = sysctl_handle_64(oidp, &stat, 0, req);
2681 if (error || req->newptr == NULL)
2684 /* Zero out this stat. */
2685 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2686 rxr = &sc->hn_rx_ring[i];
2687 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2695 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2697 struct hn_softc *sc = arg1;
2698 int ofs = arg2, i, error;
2699 struct hn_rx_ring *rxr;
2703 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2704 rxr = &sc->hn_rx_ring[i];
2705 stat += *((u_long *)((uint8_t *)rxr + ofs));
2708 error = sysctl_handle_long(oidp, &stat, 0, req);
2709 if (error || req->newptr == NULL)
2712 /* Zero out this stat. */
2713 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2714 rxr = &sc->hn_rx_ring[i];
2715 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
2721 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2723 struct hn_softc *sc = arg1;
2724 int ofs = arg2, i, error;
2725 struct hn_tx_ring *txr;
2729 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2730 txr = &sc->hn_tx_ring[i];
2731 stat += *((u_long *)((uint8_t *)txr + ofs));
2734 error = sysctl_handle_long(oidp, &stat, 0, req);
2735 if (error || req->newptr == NULL)
2738 /* Zero out this stat. */
2739 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2740 txr = &sc->hn_tx_ring[i];
2741 *((u_long *)((uint8_t *)txr + ofs)) = 0;
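/*
 * Note (added for clarity): the aggregate-stat handlers above receive a
 * field offset in arg2, passed as __offsetof(...) at SYSCTL_ADD_PROC()
 * time (see hn_create_rx_data() and hn_create_tx_data() below), and use
 * byte-pointer arithmetic to visit the same field in every ring, e.g.
 *
 *	stat += *((u_long *)((uint8_t *)&sc->hn_tx_ring[i] + ofs));
 *
 * so a single handler can serve many per-ring counters.
 */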
2747 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2749 struct hn_softc *sc = arg1;
2750 int ofs = arg2, i, error, conf;
2751 struct hn_tx_ring *txr;
2753 txr = &sc->hn_tx_ring[0];
2754 conf = *((int *)((uint8_t *)txr + ofs));
2756 error = sysctl_handle_int(oidp, &conf, 0, req);
2757 if (error || req->newptr == NULL)
2761 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2762 txr = &sc->hn_tx_ring[i];
2763 *((int *)((uint8_t *)txr + ofs)) = conf;
2771 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2773 struct hn_softc *sc = arg1;
2776 size = sc->hn_agg_size;
2777 error = sysctl_handle_int(oidp, &size, 0, req);
2778 if (error || req->newptr == NULL)
2782 sc->hn_agg_size = size;
2790 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
2792 struct hn_softc *sc = arg1;
2795 pkts = sc->hn_agg_pkts;
2796 error = sysctl_handle_int(oidp, &pkts, 0, req);
2797 if (error || req->newptr == NULL)
2801 sc->hn_agg_pkts = pkts;
2809 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
2811 struct hn_softc *sc = arg1;
2814 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
2815 return (sysctl_handle_int(oidp, &pkts, 0, req));
2819 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
2821 struct hn_softc *sc = arg1;
2824 align = sc->hn_tx_ring[0].hn_agg_align;
2825 return (sysctl_handle_int(oidp, &align, 0, req));
2829 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2831 struct hn_softc *sc = arg1;
2834 snprintf(verstr, sizeof(verstr), "%u.%u",
2835 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2836 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2837 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2841 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2843 struct hn_softc *sc = arg1;
2850 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2851 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2855 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2857 struct hn_softc *sc = arg1;
2858 char assist_str[128];
2862 hwassist = sc->hn_ifp->if_hwassist;
2864 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2865 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2869 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2871 struct hn_softc *sc = arg1;
2872 char filter_str[128];
2876 filter = sc->hn_rx_filter;
2878 snprintf(filter_str, sizeof(filter_str), "%b", filter,
2880 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
2884 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2886 struct hn_softc *sc = arg1;
2891 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2892 if (error || req->newptr == NULL)
2895 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2898 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
2900 if (sc->hn_rx_ring_inuse > 1) {
2901 error = hn_rss_reconfig(sc);
2903 /* Not RSS capable, at least for now; just save the RSS key. */
2912 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
2914 struct hn_softc *sc = arg1;
2919 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2920 if (error || req->newptr == NULL)
2924 * Don't allow RSS indirect table changes if this interface is
2925 * not currently RSS capable.
2927 if (sc->hn_rx_ring_inuse == 1) {
2932 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2935 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
2937 hn_rss_ind_fixup(sc, sc->hn_rx_ring_inuse);
2938 error = hn_rss_reconfig(sc);
2945 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
2947 struct hn_softc *sc = arg1;
2952 hash = sc->hn_rss_hash;
2954 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
2955 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
2959 hn_check_iplen(const struct mbuf *m, int hoff)
2961 const struct ip *ip;
2962 int len, iphlen, iplen;
2963 const struct tcphdr *th;
2964 int thoff; /* TCP data offset */
2966 len = hoff + sizeof(struct ip);
2968 /* The packet must be at least the size of an IP header. */
2969 if (m->m_pkthdr.len < len)
2970 return IPPROTO_DONE;
2972 /* The fixed IP header must reside completely in the first mbuf. */
2974 return IPPROTO_DONE;
2976 ip = mtodo(m, hoff);
2978 /* Bound check the packet's stated IP header length. */
2979 iphlen = ip->ip_hl << 2;
2980 if (iphlen < sizeof(struct ip)) /* minimum header length */
2981 return IPPROTO_DONE;
2983 /* The full IP header must reside completely in the one mbuf. */
2984 if (m->m_len < hoff + iphlen)
2985 return IPPROTO_DONE;
2987 iplen = ntohs(ip->ip_len);
2990 * Check that the amount of data in the buffers is at least
2991 * as much as the IP header would have us expect.
2993 if (m->m_pkthdr.len < hoff + iplen)
2994 return IPPROTO_DONE;
2997 * Ignore IP fragments.
2999 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3000 return IPPROTO_DONE;
3003 * The TCP/IP or UDP/IP header must be entirely contained within
3004 * the first fragment of a packet.
3008 if (iplen < iphlen + sizeof(struct tcphdr))
3009 return IPPROTO_DONE;
3010 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3011 return IPPROTO_DONE;
3012 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3013 thoff = th->th_off << 2;
3014 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3015 return IPPROTO_DONE;
3016 if (m->m_len < hoff + iphlen + thoff)
3017 return IPPROTO_DONE;
3020 if (iplen < iphlen + sizeof(struct udphdr))
3021 return IPPROTO_DONE;
3022 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3023 return IPPROTO_DONE;
3027 return IPPROTO_DONE;
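/*
 * Worked example (added for clarity): for a plain Ethernet + IPv4 + TCP
 * frame, hoff = ETHER_HDR_LEN = 14, ip_hl = 5 gives iphlen = 20, and
 * th_off = 5 gives thoff = 20, so the checks above require at least
 * 14 + 20 + 20 = 54 contiguous bytes in the first mbuf before
 * IPPROTO_TCP is returned; anything short of that yields IPPROTO_DONE
 * and the host checksum is not trusted.
 */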
3034 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3036 struct sysctl_oid_list *child;
3037 struct sysctl_ctx_list *ctx;
3038 device_t dev = sc->hn_dev;
3039 #if defined(INET) || defined(INET6)
3040 #if __FreeBSD_version >= 1100095
3047 * Create RXBUF for reception.
3050 * - It is shared by all channels.
3051 * - A large enough buffer is allocated; certain versions of NVS
3052 * may further limit the usable space.
3054 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3055 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3056 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3057 if (sc->hn_rxbuf == NULL) {
3058 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3062 sc->hn_rx_ring_cnt = ring_cnt;
3063 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3065 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3066 M_DEVBUF, M_WAITOK | M_ZERO);
3068 #if defined(INET) || defined(INET6)
3069 #if __FreeBSD_version >= 1100095
3070 lroent_cnt = hn_lro_entry_count;
3071 if (lroent_cnt < TCP_LRO_ENTRIES)
3072 lroent_cnt = TCP_LRO_ENTRIES;
3074 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3076 #endif /* INET || INET6 */
3078 ctx = device_get_sysctl_ctx(dev);
3079 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3081 /* Create dev.hn.UNIT.rx sysctl tree */
3082 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3083 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3085 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3086 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3088 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3089 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3090 &rxr->hn_br_dma, BUS_DMA_WAITOK);
3091 if (rxr->hn_br == NULL) {
3092 device_printf(dev, "allocate bufring failed\n");
3096 if (hn_trust_hosttcp)
3097 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3098 if (hn_trust_hostudp)
3099 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3100 if (hn_trust_hostip)
3101 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3102 rxr->hn_ifp = sc->hn_ifp;
3103 if (i < sc->hn_tx_ring_cnt)
3104 rxr->hn_txr = &sc->hn_tx_ring[i];
3105 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3106 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3108 rxr->hn_rxbuf = sc->hn_rxbuf;
3113 #if defined(INET) || defined(INET6)
3114 #if __FreeBSD_version >= 1100095
3115 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3116 hn_lro_mbufq_depth);
3118 tcp_lro_init(&rxr->hn_lro);
3119 rxr->hn_lro.ifp = sc->hn_ifp;
3121 #if __FreeBSD_version >= 1100099
3122 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3123 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3125 #endif /* INET || INET6 */
3127 if (sc->hn_rx_sysctl_tree != NULL) {
3131 * Create per RX ring sysctl tree:
3132 * dev.hn.UNIT.rx.RINGID
3134 snprintf(name, sizeof(name), "%d", i);
3135 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3136 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3137 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3139 if (rxr->hn_rx_sysctl_tree != NULL) {
3140 SYSCTL_ADD_ULONG(ctx,
3141 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3142 OID_AUTO, "packets", CTLFLAG_RW,
3143 &rxr->hn_pkts, "# of packets received");
3144 SYSCTL_ADD_ULONG(ctx,
3145 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3146 OID_AUTO, "rss_pkts", CTLFLAG_RW,
3148 "# of packets w/ RSS info received");
3150 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3151 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3152 &rxr->hn_pktbuf_len, 0,
3153 "Temporary channel packet buffer length");
3158 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3159 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3160 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3161 #if __FreeBSD_version < 1100095
3162 hn_rx_stat_int_sysctl,
3164 hn_rx_stat_u64_sysctl,
3166 "LU", "LRO queued");
3167 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3168 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3169 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3170 #if __FreeBSD_version < 1100095
3171 hn_rx_stat_int_sysctl,
3173 hn_rx_stat_u64_sysctl,
3175 "LU", "LRO flushed");
3176 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3177 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3178 __offsetof(struct hn_rx_ring, hn_lro_tried),
3179 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3180 #if __FreeBSD_version >= 1100099
3181 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3182 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3183 hn_lro_lenlim_sysctl, "IU",
3184 "Max # of data bytes to be aggregated by LRO");
3185 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3186 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3187 hn_lro_ackcnt_sysctl, "I",
3188 "Max # of ACKs to be aggregated by LRO");
3190 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3191 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3192 hn_trust_hcsum_sysctl, "I",
3193 "Trust tcp segement verification on host side, "
3194 "when csum info is missing");
3195 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3196 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3197 hn_trust_hcsum_sysctl, "I",
3198 "Trust udp datagram verification on host side, "
3199 "when csum info is missing");
3200 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3201 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3202 hn_trust_hcsum_sysctl, "I",
3203 "Trust ip packet verification on host side, "
3204 "when csum info is missing");
3205 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3206 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3207 __offsetof(struct hn_rx_ring, hn_csum_ip),
3208 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3209 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3210 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3211 __offsetof(struct hn_rx_ring, hn_csum_tcp),
3212 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3213 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3214 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3215 __offsetof(struct hn_rx_ring, hn_csum_udp),
3216 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3217 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3218 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3219 __offsetof(struct hn_rx_ring, hn_csum_trusted),
3220 hn_rx_stat_ulong_sysctl, "LU",
3221 "# of packets that we trust host's csum verification");
3222 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3223 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3224 __offsetof(struct hn_rx_ring, hn_small_pkts),
3225 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3226 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3227 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3228 __offsetof(struct hn_rx_ring, hn_ack_failed),
3229 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3230 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3231 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3232 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3233 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3239 hn_destroy_rx_data(struct hn_softc *sc)
3243 if (sc->hn_rxbuf != NULL) {
3244 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3245 sc->hn_rxbuf = NULL;
3248 if (sc->hn_rx_ring_cnt == 0)
3251 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3252 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3254 if (rxr->hn_br == NULL)
3256 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3259 #if defined(INET) || defined(INET6)
3260 tcp_lro_free(&rxr->hn_lro);
3262 free(rxr->hn_pktbuf, M_DEVBUF);
3264 free(sc->hn_rx_ring, M_DEVBUF);
3265 sc->hn_rx_ring = NULL;
3267 sc->hn_rx_ring_cnt = 0;
3268 sc->hn_rx_ring_inuse = 0;
3272 hn_tx_ring_create(struct hn_softc *sc, int id)
3274 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3275 device_t dev = sc->hn_dev;
3276 bus_dma_tag_t parent_dtag;
3280 txr->hn_tx_idx = id;
3282 #ifndef HN_USE_TXDESC_BUFRING
3283 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3285 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3287 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3288 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3289 M_DEVBUF, M_WAITOK | M_ZERO);
3290 #ifndef HN_USE_TXDESC_BUFRING
3291 SLIST_INIT(&txr->hn_txlist);
3293 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3294 M_WAITOK, &txr->hn_tx_lock);
3297 txr->hn_tx_taskq = sc->hn_tx_taskq;
3299 #ifdef HN_IFSTART_SUPPORT
3300 if (hn_use_if_start) {
3301 txr->hn_txeof = hn_start_txeof;
3302 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3303 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3309 txr->hn_txeof = hn_xmit_txeof;
3310 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3311 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3313 br_depth = hn_get_txswq_depth(txr);
3314 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3315 M_WAITOK, &txr->hn_tx_lock);
3318 txr->hn_direct_tx_size = hn_direct_tx_size;
3321 * Always schedule transmission instead of trying to do direct
3322 * transmission. This one gives the best performance so far.
3324 txr->hn_sched_tx = 1;
3326 parent_dtag = bus_get_dma_tag(dev);
3328 /* DMA tag for RNDIS packet messages. */
3329 error = bus_dma_tag_create(parent_dtag, /* parent */
3330 HN_RNDIS_PKT_ALIGN, /* alignment */
3331 HN_RNDIS_PKT_BOUNDARY, /* boundary */
3332 BUS_SPACE_MAXADDR, /* lowaddr */
3333 BUS_SPACE_MAXADDR, /* highaddr */
3334 NULL, NULL, /* filter, filterarg */
3335 HN_RNDIS_PKT_LEN, /* maxsize */
3337 HN_RNDIS_PKT_LEN, /* maxsegsize */
3339 NULL, /* lockfunc */
3340 NULL, /* lockfuncarg */
3341 &txr->hn_tx_rndis_dtag);
3343 device_printf(dev, "failed to create rndis dmatag\n");
3347 /* DMA tag for data. */
3348 error = bus_dma_tag_create(parent_dtag, /* parent */
3350 HN_TX_DATA_BOUNDARY, /* boundary */
3351 BUS_SPACE_MAXADDR, /* lowaddr */
3352 BUS_SPACE_MAXADDR, /* highaddr */
3353 NULL, NULL, /* filter, filterarg */
3354 HN_TX_DATA_MAXSIZE, /* maxsize */
3355 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
3356 HN_TX_DATA_SEGSIZE, /* maxsegsize */
3358 NULL, /* lockfunc */
3359 NULL, /* lockfuncarg */
3360 &txr->hn_tx_data_dtag);
3362 device_printf(dev, "failed to create data dmatag\n");
3366 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3367 struct hn_txdesc *txd = &txr->hn_txdesc[i];
3370 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3371 STAILQ_INIT(&txd->agg_list);
3374 * Allocate and load RNDIS packet message.
3376 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3377 (void **)&txd->rndis_pkt,
3378 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3379 &txd->rndis_pkt_dmap);
3382 "failed to allocate rndis_packet_msg, %d\n", i);
3386 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3387 txd->rndis_pkt_dmap,
3388 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3389 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3393 "failed to load rndis_packet_msg, %d\n", i);
3394 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3395 txd->rndis_pkt, txd->rndis_pkt_dmap);
3399 /* DMA map for TX data. */
3400 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3404 "failed to allocate tx data dmamap\n");
3405 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3406 txd->rndis_pkt_dmap);
3407 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3408 txd->rndis_pkt, txd->rndis_pkt_dmap);
3412 /* All set; put it on the list. */
3413 txd->flags |= HN_TXD_FLAG_ONLIST;
3414 #ifndef HN_USE_TXDESC_BUFRING
3415 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3417 buf_ring_enqueue(txr->hn_txdesc_br, txd);
3420 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3422 if (sc->hn_tx_sysctl_tree != NULL) {
3423 struct sysctl_oid_list *child;
3424 struct sysctl_ctx_list *ctx;
3428 * Create per TX ring sysctl tree:
3429 * dev.hn.UNIT.tx.RINGID
3431 ctx = device_get_sysctl_ctx(dev);
3432 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3434 snprintf(name, sizeof(name), "%d", id);
3435 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3436 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3438 if (txr->hn_tx_sysctl_tree != NULL) {
3439 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3441 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3442 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3443 "# of available TX descs");
3444 #ifdef HN_IFSTART_SUPPORT
3445 if (!hn_use_if_start)
3448 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3449 CTLFLAG_RD, &txr->hn_oactive, 0,
3452 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3453 CTLFLAG_RW, &txr->hn_pkts,
3454 "# of packets transmitted");
3455 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3456 CTLFLAG_RW, &txr->hn_sends, "# of sends");
3464 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3466 struct hn_tx_ring *txr = txd->txr;
3468 KASSERT(txd->m == NULL, ("still has mbuf installed"));
3469 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3471 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3472 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3473 txd->rndis_pkt_dmap);
3474 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3478 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3480 struct hn_txdesc *txd;
3482 if (txr->hn_txdesc == NULL)
3485 #ifndef HN_USE_TXDESC_BUFRING
3486 while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) {
3487 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
3488 hn_txdesc_dmamap_destroy(txd);
3491 mtx_lock(&txr->hn_tx_lock);
3492 while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL)
3493 hn_txdesc_dmamap_destroy(txd);
3494 mtx_unlock(&txr->hn_tx_lock);
3497 if (txr->hn_tx_data_dtag != NULL)
3498 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3499 if (txr->hn_tx_rndis_dtag != NULL)
3500 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3502 #ifdef HN_USE_TXDESC_BUFRING
3503 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3506 free(txr->hn_txdesc, M_DEVBUF);
3507 txr->hn_txdesc = NULL;
3509 if (txr->hn_mbuf_br != NULL)
3510 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3512 #ifndef HN_USE_TXDESC_BUFRING
3513 mtx_destroy(&txr->hn_txlist_spin);
3515 mtx_destroy(&txr->hn_tx_lock);
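/*
 * Illustrative sketch (not part of the original driver): the busdma
 * lifecycle hn_tx_ring_create() and hn_tx_ring_destroy() follow -- tag
 * create, dmamem alloc, map load, then strict reverse order on teardown.
 * The sizes and the helper name are hypothetical; compiled out.
 */
#if 0
static int
hn_busdma_lifecycle_example(device_t dev)
{
	bus_dma_tag_t dtag;
	bus_dmamap_t dmap;
	bus_addr_t paddr;
	void *vaddr;
	int error;

	error = bus_dma_tag_create(bus_get_dma_tag(dev),
	    PAGE_SIZE, 0,			/* alignment, no boundary */
	    BUS_SPACE_MAXADDR,			/* lowaddr */
	    BUS_SPACE_MAXADDR,			/* highaddr */
	    NULL, NULL,				/* filter, filterarg */
	    PAGE_SIZE, 1, PAGE_SIZE,		/* maxsize, nseg, maxsegsz */
	    0, NULL, NULL, &dtag);
	if (error)
		return (error);
	error = bus_dmamem_alloc(dtag, &vaddr,
	    BUS_DMA_WAITOK | BUS_DMA_ZERO, &dmap);
	if (error)
		goto destroy_tag;
	error = bus_dmamap_load(dtag, dmap, vaddr, PAGE_SIZE,
	    hyperv_dma_map_paddr, &paddr, BUS_DMA_NOWAIT);
	if (error)
		goto free_mem;

	/* ... use vaddr (CPU) and paddr (device) here ... */

	bus_dmamap_unload(dtag, dmap);
free_mem:
	bus_dmamem_free(dtag, vaddr, dmap);
destroy_tag:
	bus_dma_tag_destroy(dtag);
	return (error);
}
#endif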
3519 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3521 struct sysctl_oid_list *child;
3522 struct sysctl_ctx_list *ctx;
3526 * Create TXBUF for chimney sending.
3528 * NOTE: It is shared by all channels.
3530 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3531 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3532 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3533 if (sc->hn_chim == NULL) {
3534 device_printf(sc->hn_dev, "allocate txbuf failed\n");
3538 sc->hn_tx_ring_cnt = ring_cnt;
3539 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3541 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3542 M_DEVBUF, M_WAITOK | M_ZERO);
3544 ctx = device_get_sysctl_ctx(sc->hn_dev);
3545 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3547 /* Create dev.hn.UNIT.tx sysctl tree */
3548 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3549 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3551 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3554 error = hn_tx_ring_create(sc, i);
3559 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3560 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3561 __offsetof(struct hn_tx_ring, hn_no_txdescs),
3562 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3563 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3564 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3565 __offsetof(struct hn_tx_ring, hn_send_failed),
3566 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
3567 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3568 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3569 __offsetof(struct hn_tx_ring, hn_txdma_failed),
3570 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
3571 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3572 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3573 __offsetof(struct hn_tx_ring, hn_flush_failed),
3574 hn_tx_stat_ulong_sysctl, "LU",
3575 "# of packet transmission aggregation flush failure");
3576 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3577 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3578 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3579 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3580 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3581 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3582 __offsetof(struct hn_tx_ring, hn_tx_chimney),
3583 hn_tx_stat_ulong_sysctl, "LU", "# of chimney sends");
3584 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3585 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3586 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3587 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3588 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3589 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3590 "# of total TX descs");
3591 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3592 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3593 "Chimney send packet size upper boundary");
3594 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3595 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3596 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3597 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3598 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3599 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3600 hn_tx_conf_int_sysctl, "I",
3601 "Size of the packet for direct transmission");
3602 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3603 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3604 __offsetof(struct hn_tx_ring, hn_sched_tx),
3605 hn_tx_conf_int_sysctl, "I",
3606 "Always schedule transmission "
3607 "instead of doing direct transmission");
3608 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3609 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3610 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3611 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3612 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3613 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3614 "Applied packet transmission aggregation size");
3615 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3616 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3617 hn_txagg_pktmax_sysctl, "I",
3618 "Applied packet transmission aggregation packets");
3619 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3620 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3621 hn_txagg_align_sysctl, "I",
3622 "Applied packet transmission aggregation alignment");
3628 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3632 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3633 sc->hn_tx_ring[i].hn_chim_size = chim_size;
3637 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3639 struct ifnet *ifp = sc->hn_ifp;
3642 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3645 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3646 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3647 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3649 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3650 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3651 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3653 if (tso_maxlen < tso_minlen)
3654 tso_maxlen = tso_minlen;
3655 else if (tso_maxlen > IP_MAXPACKET)
3656 tso_maxlen = IP_MAXPACKET;
3657 if (tso_maxlen > sc->hn_ndis_tso_szmax)
3658 tso_maxlen = sc->hn_ndis_tso_szmax;
3659 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3661 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
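/*
 * Worked example (added for clarity): with hn_ndis_tso_sgmin = 2 and
 * mtu = 1500, tso_minlen = 3000; a requested tso_maxlen of 65535 equals
 * IP_MAXPACKET, is then capped by hn_ndis_tso_szmax if the host reports
 * a smaller limit, and the final if_hw_tsomax subtracts the 18 bytes of
 * Ethernet + VLAN encapsulation (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN).
 */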
3665 hn_fixup_tx_data(struct hn_softc *sc)
3667 uint64_t csum_assist;
3670 hn_set_chim_size(sc, sc->hn_chim_szmax);
3671 if (hn_tx_chimney_size > 0 &&
3672 hn_tx_chimney_size < sc->hn_chim_szmax)
3673 hn_set_chim_size(sc, hn_tx_chimney_size);
3676 if (sc->hn_caps & HN_CAP_IPCS)
3677 csum_assist |= CSUM_IP;
3678 if (sc->hn_caps & HN_CAP_TCP4CS)
3679 csum_assist |= CSUM_IP_TCP;
3680 if (sc->hn_caps & HN_CAP_UDP4CS)
3681 csum_assist |= CSUM_IP_UDP;
3682 if (sc->hn_caps & HN_CAP_TCP6CS)
3683 csum_assist |= CSUM_IP6_TCP;
3684 if (sc->hn_caps & HN_CAP_UDP6CS)
3685 csum_assist |= CSUM_IP6_UDP;
3686 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3687 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3689 if (sc->hn_caps & HN_CAP_HASHVAL) {
3691 * Support HASHVAL pktinfo on TX path.
3694 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3695 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3696 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3701 hn_destroy_tx_data(struct hn_softc *sc)
3705 if (sc->hn_chim != NULL) {
3706 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3710 if (sc->hn_tx_ring_cnt == 0)
3713 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3714 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3716 free(sc->hn_tx_ring, M_DEVBUF);
3717 sc->hn_tx_ring = NULL;
3719 sc->hn_tx_ring_cnt = 0;
3720 sc->hn_tx_ring_inuse = 0;
3723 #ifdef HN_IFSTART_SUPPORT
3726 hn_start_taskfunc(void *xtxr, int pending __unused)
3728 struct hn_tx_ring *txr = xtxr;
3730 mtx_lock(&txr->hn_tx_lock);
3731 hn_start_locked(txr, 0);
3732 mtx_unlock(&txr->hn_tx_lock);
3736 hn_start_locked(struct hn_tx_ring *txr, int len)
3738 struct hn_softc *sc = txr->hn_sc;
3739 struct ifnet *ifp = sc->hn_ifp;
3742 KASSERT(hn_use_if_start,
3743 ("hn_start_locked is called, when if_start is disabled"));
3744 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3745 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3746 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3748 if (__predict_false(txr->hn_suspended))
3751 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
3755 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
3756 struct hn_txdesc *txd;
3757 struct mbuf *m_head;
3760 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
3764 if (len > 0 && m_head->m_pkthdr.len > len) {
3766 * This sending could be time consuming; let callers
3767 * dispatch this packet sending (and sending of any
3768 * follow-up packets) to the tx taskqueue.
3770 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3775 #if defined(INET6) || defined(INET)
3776 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3777 m_head = hn_tso_fixup(m_head);
3778 if (__predict_false(m_head == NULL)) {
3779 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3785 txd = hn_txdesc_get(txr);
3787 txr->hn_no_txdescs++;
3788 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3789 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3793 error = hn_encap(ifp, txr, txd, &m_head);
3795 /* Both txd and m_head are freed */
3796 KASSERT(txr->hn_agg_txd == NULL,
3797 ("encap failed w/ pending aggregating txdesc"));
3801 if (txr->hn_agg_pktleft == 0) {
3802 if (txr->hn_agg_txd != NULL) {
3803 KASSERT(m_head == NULL,
3804 ("pending mbuf for aggregating txdesc"));
3805 error = hn_flush_txagg(ifp, txr);
3806 if (__predict_false(error)) {
3807 atomic_set_int(&ifp->if_drv_flags,
3812 KASSERT(m_head != NULL, ("mbuf was freed"));
3813 error = hn_txpkt(ifp, txr, txd);
3814 if (__predict_false(error)) {
3815 /* txd is freed, but m_head is not */
3816 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3817 atomic_set_int(&ifp->if_drv_flags,
3825 KASSERT(txr->hn_agg_txd != NULL,
3826 ("no aggregating txdesc"));
3827 KASSERT(m_head == NULL,
3828 ("pending mbuf for aggregating txdesc"));
3833 /* Flush pending aggregated transmission. */
3834 if (txr->hn_agg_txd != NULL)
3835 hn_flush_txagg(ifp, txr);
3840 hn_start(struct ifnet *ifp)
3842 struct hn_softc *sc = ifp->if_softc;
3843 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
3845 if (txr->hn_sched_tx)
3848 if (mtx_trylock(&txr->hn_tx_lock)) {
3851 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3852 mtx_unlock(&txr->hn_tx_lock);
3857 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
3861 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
3863 struct hn_tx_ring *txr = xtxr;
3865 mtx_lock(&txr->hn_tx_lock);
3866 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
3867 hn_start_locked(txr, 0);
3868 mtx_unlock(&txr->hn_tx_lock);
3872 hn_start_txeof(struct hn_tx_ring *txr)
3874 struct hn_softc *sc = txr->hn_sc;
3875 struct ifnet *ifp = sc->hn_ifp;
3877 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3879 if (txr->hn_sched_tx)
3882 if (mtx_trylock(&txr->hn_tx_lock)) {
3885 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3886 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3887 mtx_unlock(&txr->hn_tx_lock);
3889 taskqueue_enqueue(txr->hn_tx_taskq,
3895 * Release the OACTIVE flag earlier, in the hope that
3896 * others can catch up. The task will clear the
3897 * flag again, with hn_tx_lock held, to avoid possible races.
3900 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3901 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3905 #endif /* HN_IFSTART_SUPPORT */
3908 hn_xmit(struct hn_tx_ring *txr, int len)
3910 struct hn_softc *sc = txr->hn_sc;
3911 struct ifnet *ifp = sc->hn_ifp;
3912 struct mbuf *m_head;
3915 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3916 #ifdef HN_IFSTART_SUPPORT
3917 KASSERT(hn_use_if_start == 0,
3918 ("hn_xmit is called, when if_start is enabled"));
3920 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3922 if (__predict_false(txr->hn_suspended))
3925 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
3928 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
3929 struct hn_txdesc *txd;
3932 if (len > 0 && m_head->m_pkthdr.len > len) {
3934 * This sending could be time consuming; let callers
3935 * dispatch this packet sending (and sending of any
3936 * follow-up packets) to the tx taskqueue.
3938 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3943 txd = hn_txdesc_get(txr);
3945 txr->hn_no_txdescs++;
3946 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3947 txr->hn_oactive = 1;
3951 error = hn_encap(ifp, txr, txd, &m_head);
3953 /* Both txd and m_head are freed; discard */
3954 KASSERT(txr->hn_agg_txd == NULL,
3955 ("encap failed w/ pending aggregating txdesc"));
3956 drbr_advance(ifp, txr->hn_mbuf_br);
3960 if (txr->hn_agg_pktleft == 0) {
3961 if (txr->hn_agg_txd != NULL) {
3962 KASSERT(m_head == NULL,
3963 ("pending mbuf for aggregating txdesc"));
3964 error = hn_flush_txagg(ifp, txr);
3965 if (__predict_false(error)) {
3966 txr->hn_oactive = 1;
3970 KASSERT(m_head != NULL, ("mbuf was freed"));
3971 error = hn_txpkt(ifp, txr, txd);
3972 if (__predict_false(error)) {
3973 /* txd is freed, but m_head is not */
3974 drbr_putback(ifp, txr->hn_mbuf_br,
3976 txr->hn_oactive = 1;
3983 KASSERT(txr->hn_agg_txd != NULL,
3984 ("no aggregating txdesc"));
3985 KASSERT(m_head == NULL,
3986 ("pending mbuf for aggregating txdesc"));
3991 drbr_advance(ifp, txr->hn_mbuf_br);
3994 /* Flush pending aggregated transmission. */
3995 if (txr->hn_agg_txd != NULL)
3996 hn_flush_txagg(ifp, txr);
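/*
 * Illustrative sketch (not part of the original driver): the
 * drbr_peek()/drbr_advance()/drbr_putback() protocol hn_xmit() above
 * follows.  peek() leaves the mbuf on the ring; advance() consumes it
 * once the send can no longer fail; putback() returns it for a later
 * retry.  hn_try_send_example() is hypothetical; compiled out.
 */
#if 0
static void
hn_drbr_drain_example(struct ifnet *ifp, struct buf_ring *br)
{
	struct mbuf *m;

	while ((m = drbr_peek(ifp, br)) != NULL) {
		if (hn_try_send_example(m) != 0) {
			drbr_putback(ifp, br, m);	/* retry later */
			break;
		}
		drbr_advance(ifp, br);			/* committed */
	}
}
#endif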
4001 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4003 struct hn_softc *sc = ifp->if_softc;
4004 struct hn_tx_ring *txr;
4007 #if defined(INET6) || defined(INET)
4009 * Perform TSO packet header fixup now, since the TSO
4010 * packet header should be cache-hot.
4012 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4013 m = hn_tso_fixup(m);
4014 if (__predict_false(m == NULL)) {
4015 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4022 * Select the TX ring based on flowid
4024 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
4025 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4026 txr = &sc->hn_tx_ring[idx];
4028 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4030 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4034 if (txr->hn_oactive)
4037 if (txr->hn_sched_tx)
4040 if (mtx_trylock(&txr->hn_tx_lock)) {
4043 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4044 mtx_unlock(&txr->hn_tx_lock);
4049 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4054 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4058 mtx_lock(&txr->hn_tx_lock);
4059 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4061 mtx_unlock(&txr->hn_tx_lock);
4065 hn_xmit_qflush(struct ifnet *ifp)
4067 struct hn_softc *sc = ifp->if_softc;
4070 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4071 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4076 hn_xmit_txeof(struct hn_tx_ring *txr)
4079 if (txr->hn_sched_tx)
4082 if (mtx_trylock(&txr->hn_tx_lock)) {
4085 txr->hn_oactive = 0;
4086 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4087 mtx_unlock(&txr->hn_tx_lock);
4089 taskqueue_enqueue(txr->hn_tx_taskq,
4095 * Release oactive earlier, in the hope that
4096 * others can catch up. The task will clear
4097 * oactive again, with hn_tx_lock held, to avoid possible races.
4100 txr->hn_oactive = 0;
4101 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4106 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4108 struct hn_tx_ring *txr = xtxr;
4110 mtx_lock(&txr->hn_tx_lock);
4112 mtx_unlock(&txr->hn_tx_lock);
4116 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4118 struct hn_tx_ring *txr = xtxr;
4120 mtx_lock(&txr->hn_tx_lock);
4121 txr->hn_oactive = 0;
4123 mtx_unlock(&txr->hn_tx_lock);
4127 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4129 struct vmbus_chan_br cbr;
4130 struct hn_rx_ring *rxr;
4131 struct hn_tx_ring *txr = NULL;
4134 idx = vmbus_chan_subidx(chan);
4137 * Link this channel to RX/TX ring.
4139 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4140 ("invalid channel index %d, should > 0 && < %d",
4141 idx, sc->hn_rx_ring_inuse));
4142 rxr = &sc->hn_rx_ring[idx];
4143 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4144 ("RX ring %d already attached", idx));
4145 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4148 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4149 idx, vmbus_chan_id(chan));
4152 if (idx < sc->hn_tx_ring_inuse) {
4153 txr = &sc->hn_tx_ring[idx];
4154 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4155 ("TX ring %d already attached", idx));
4156 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4158 txr->hn_chan = chan;
4160 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4161 idx, vmbus_chan_id(chan));
4165 /* Bind this channel to a proper CPU. */
4166 vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus);
4171 cbr.cbr = rxr->hn_br;
4172 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4173 cbr.cbr_txsz = HN_TXBR_SIZE;
4174 cbr.cbr_rxsz = HN_RXBR_SIZE;
4175 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4177 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4178 vmbus_chan_id(chan), error);
4179 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4181 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
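/*
 * Note (added for clarity, based on the cbr setup above): the single
 * per-ring hn_br block is handed to vmbus_chan_open_br() as one
 * contiguous allocation that the channel splits into a TX bufring
 * (cbr_txsz = HN_TXBR_SIZE) and an RX bufring (cbr_rxsz = HN_RXBR_SIZE),
 * which is why hn_create_rx_data() sized it HN_TXBR_SIZE + HN_RXBR_SIZE.
 */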
4187 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4189 struct hn_rx_ring *rxr;
4192 idx = vmbus_chan_subidx(chan);
4195 * Link this channel to RX/TX ring.
4197 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4198 ("invalid channel index %d, should > 0 && < %d",
4199 idx, sc->hn_rx_ring_inuse));
4200 rxr = &sc->hn_rx_ring[idx];
4201 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4202 ("RX ring %d is not attached", idx));
4203 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4205 if (idx < sc->hn_tx_ring_inuse) {
4206 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4208 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4209 ("TX ring %d is not attached attached", idx));
4210 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4214 * Close this channel.
4217 * Channel closing does _not_ destroy the target channel.
4219 vmbus_chan_close(chan);
4223 hn_attach_subchans(struct hn_softc *sc)
4225 struct vmbus_channel **subchans;
4226 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4229 if (subchan_cnt == 0)
4232 /* Attach the sub-channels. */
4233 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4234 for (i = 0; i < subchan_cnt; ++i) {
4235 error = hn_chan_attach(sc, subchans[i]);
4239 vmbus_subchan_rel(subchans, subchan_cnt);
4242 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4245 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4253 hn_detach_allchans(struct hn_softc *sc)
4255 struct vmbus_channel **subchans;
4256 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4259 if (subchan_cnt == 0)
4262 /* Detach the sub-channels. */
4263 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4264 for (i = 0; i < subchan_cnt; ++i)
4265 hn_chan_detach(sc, subchans[i]);
4266 vmbus_subchan_rel(subchans, subchan_cnt);
4270 * Detach the primary channel, _after_ all sub-channels are detached.
4273 hn_chan_detach(sc, sc->hn_prichan);
4275 /* Wait for sub-channels to be destroyed, if any. */
4276 vmbus_subchan_drain(sc->hn_prichan);
4279 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4280 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4281 HN_RX_FLAG_ATTACHED) == 0,
4282 ("%dth RX ring is still attached", i));
4284 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4285 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4286 HN_TX_FLAG_ATTACHED) == 0,
4287 ("%dth TX ring is still attached", i));
4293 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4295 struct vmbus_channel **subchans;
4296 int nchan, rxr_cnt, error;
4298 nchan = *nsubch + 1;
4301 * Multiple RX/TX rings are not requested.
4308 * Query RSS capabilities, e.g. # of RX rings, and # of indirect table entries.
4311 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4313 /* No RSS; this is benign. */
4318 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4322 if (nchan > rxr_cnt)
4325 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4331 * Allocate sub-channels from NVS.
4333 *nsubch = nchan - 1;
4334 error = hn_nvs_alloc_subchans(sc, nsubch);
4335 if (error || *nsubch == 0) {
4336 /* Failed to allocate sub-channels. */
4342 * Wait for all sub-channels to become ready before moving on.
4344 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4345 vmbus_subchan_rel(subchans, *nsubch);
4350 hn_synth_attach(struct hn_softc *sc, int mtu)
4352 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4353 int error, nsubch, nchan, i;
4356 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4357 ("synthetic parts were attached"));
4359 /* Save capabilities for later verification. */
4360 old_caps = sc->hn_caps;
4363 /* Clear RSS stuffs. */
4364 sc->hn_rss_ind_size = 0;
4365 sc->hn_rss_hash = 0;
4368 * Attach the primary channel _before_ attaching NVS and RNDIS.
4370 error = hn_chan_attach(sc, sc->hn_prichan);
4377 error = hn_nvs_attach(sc, mtu);
4382 * Attach RNDIS _after_ NVS is attached.
4384 error = hn_rndis_attach(sc, mtu);
4389 * Make sure capabilities are not changed.
4391 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4392 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4393 old_caps, sc->hn_caps);
4394 /* Restore old capabilities and abort. */
4395 sc->hn_caps = old_caps;
4400 * Allocate sub-channels for multi-TX/RX rings.
4403 * The # of RX rings that can be used is equivalent to the # of
4404 * channels to be requested.
4406 nsubch = sc->hn_rx_ring_cnt - 1;
4407 error = hn_synth_alloc_subchans(sc, &nsubch);
4413 /* Only the primary channel can be used; done */
4418 * Configure RSS key and indirect table _after_ all sub-channels are allocated.
4422 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4424 * RSS key is not set yet; set it to the default RSS key.
4427 if_printf(sc->hn_ifp, "setup default RSS key\n");
4428 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4429 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4432 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4434 * RSS indirect table is not set yet; set it up in round-robin fashion.
4438 if_printf(sc->hn_ifp, "setup default RSS indirect "
4441 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
4442 rss->rss_ind[i] = i % nchan;
4443 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4446 * The # of usable channels may have changed, so we have to
4447 * make sure that all entries in the RSS indirect table are valid.
4450 hn_rss_ind_fixup(sc, nchan);
4453 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4456 * Failed to configure RSS key or indirect table; only
4457 * the primary channel can be used.
4463 * Set the # of TX/RX rings that could be used according to
4464 * the # of channels that NVS offered.
4466 hn_set_ring_inuse(sc, nchan);
4469 * Attach the sub-channels, if any.
4471 error = hn_attach_subchans(sc);
4476 * Fixup transmission aggregation setup.
4480 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4486 * The interface must have been suspended through hn_suspend() before
4487 * this function gets called.
4490 hn_synth_detach(struct hn_softc *sc)
4494 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4495 ("synthetic parts were not attached"));
4497 /* Detach the RNDIS first. */
4498 hn_rndis_detach(sc);
4503 /* Detach all of the channels. */
4504 hn_detach_allchans(sc);
4506 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4510 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4512 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4513 ("invalid ring count %d", ring_cnt));
4515 if (sc->hn_tx_ring_cnt > ring_cnt)
4516 sc->hn_tx_ring_inuse = ring_cnt;
4518 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4519 sc->hn_rx_ring_inuse = ring_cnt;
4522 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4523 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
4528 hn_chan_drain(struct vmbus_channel *chan)
4531 while (!vmbus_chan_rx_empty(chan) || !vmbus_chan_tx_empty(chan))
4533 vmbus_chan_intr_drain(chan);
4537 hn_suspend_data(struct hn_softc *sc)
4539 struct vmbus_channel **subch = NULL;
4547 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4548 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4550 mtx_lock(&txr->hn_tx_lock);
4551 txr->hn_suspended = 1;
4552 mtx_unlock(&txr->hn_tx_lock);
4553 /* No one can send more packets now. */
4555 /* Wait for all pending sends to finish. */
4556 while (hn_tx_ring_pending(txr))
4557 pause("hnwtx", 1 /* 1 tick */);
4559 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
4560 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
4564 * Disable RX by clearing RX filter.
4566 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
4567 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);
4570 * Give RNDIS enough time to flush all pending data packets.
4572 pause("waitrx", (200 * hz) / 1000);
4575 * Drain RX/TX bufrings and interrupts.
4577 nsubch = sc->hn_rx_ring_inuse - 1;
4579 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4581 if (subch != NULL) {
4582 for (i = 0; i < nsubch; ++i)
4583 hn_chan_drain(subch[i]);
4585 hn_chan_drain(sc->hn_prichan);
4588 vmbus_subchan_rel(subch, nsubch);
4592 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
4595 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
4599 hn_suspend_mgmt(struct hn_softc *sc)
4606 * Make sure that hn_mgmt_taskq0 can no longer be accessed
4607 * through hn_mgmt_taskq.
4609 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
4610 vmbus_chan_run_task(sc->hn_prichan, &task);
4613 * Make sure that all pending management tasks are completed.
4615 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
4616 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
4617 taskqueue_drain_all(sc->hn_mgmt_taskq0);
4621 hn_suspend(struct hn_softc *sc)
4624 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4625 hn_suspend_data(sc);
4626 hn_suspend_mgmt(sc);
4630 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
4634 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
4635 ("invalid TX ring count %d", tx_ring_cnt));
4637 for (i = 0; i < tx_ring_cnt; ++i) {
4638 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4640 mtx_lock(&txr->hn_tx_lock);
4641 txr->hn_suspended = 0;
4642 mtx_unlock(&txr->hn_tx_lock);
4647 hn_resume_data(struct hn_softc *sc)
4656 hn_set_rxfilter(sc);
4659 * Make sure to clear suspend status on "all" TX rings,
4660 * since hn_tx_ring_inuse can be changed after
4661 * hn_suspend_data().
4663 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
4665 #ifdef HN_IFSTART_SUPPORT
4666 if (!hn_use_if_start)
4670 * Flush unused drbrs, since hn_tx_ring_inuse may have changed after hn_suspend_data().
4673 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
4674 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4680 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4681 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4684 * Use the txeof task, so that any pending oactive can be cleared properly.
4687 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4692 hn_resume_mgmt(struct hn_softc *sc)
4695 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
4698 * Kick off network change detection, if it was pending.
4699 * If no network change was pending, start link status
4700 * checks, which are more lightweight than network change detection.
4703 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
4704 hn_change_network(sc);
4706 hn_update_link_status(sc);
4710 hn_resume(struct hn_softc *sc)
4713 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4719 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
4721 const struct rndis_status_msg *msg;
4724 if (dlen < sizeof(*msg)) {
4725 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
4730 switch (msg->rm_status) {
4731 case RNDIS_STATUS_MEDIA_CONNECT:
4732 case RNDIS_STATUS_MEDIA_DISCONNECT:
4733 hn_update_link_status(sc);
4736 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
4737 /* Not really useful; ignore. */
4740 case RNDIS_STATUS_NETWORK_CHANGE:
4741 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
4742 if (dlen < ofs + msg->rm_stbuflen ||
4743 msg->rm_stbuflen < sizeof(uint32_t)) {
4744 if_printf(sc->hn_ifp, "network changed\n");
4748 memcpy(&change, ((const uint8_t *)msg) + ofs,
4750 if_printf(sc->hn_ifp, "network changed, change %u\n",
4753 hn_change_network(sc);
4757 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
static int
hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
{
	const struct rndis_pktinfo *pi = info_data;
	uint32_t mask = 0;

	while (info_dlen != 0) {
		const void *data;
		uint32_t dlen;

		if (__predict_false(info_dlen < sizeof(*pi)))
			return (EINVAL);
		if (__predict_false(info_dlen < pi->rm_size))
			return (EINVAL);
		info_dlen -= pi->rm_size;

		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
			return (EINVAL);
		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
			return (EINVAL);
		dlen = pi->rm_size - pi->rm_pktinfooffset;
		data = pi->rm_data;

		switch (pi->rm_type) {
		case NDIS_PKTINFO_TYPE_VLAN:
			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
				return (EINVAL);
			info->vlan_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_VLAN;
			break;

		case NDIS_PKTINFO_TYPE_CSUM:
			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
				return (EINVAL);
			info->csum_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_CSUM;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
				return (EINVAL);
			info->hash_value = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHVAL;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHINF:
			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
				return (EINVAL);
			info->hash_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHINF;
			break;

		default:
			goto next;
		}

		if (mask == HN_RXINFO_ALL) {
			/* All found; done */
			break;
		}
next:
		pi = (const struct rndis_pktinfo *)
		    ((const uint8_t *)pi + pi->rm_size);
	}

	/*
	 * Final fixup.
	 * - If there is no hash value, invalidate the hash info.
	 */
	if ((mask & HN_RXINFO_HASHVAL) == 0)
		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
	return (0);
}

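/*
 * hn_rndis_check_overlap() below answers "do the half-open byte
 * ranges [off, off + len) and [check_off, check_off + check_len)
 * intersect?".  E.g. off 0/len 14 against check_off 14/len 4 do not
 * overlap (14 <= 14), while off 0/len 16 against check_off 14/len 4
 * do.  It is used to reject RNDIS messages whose data, OOB and
 * pktinfo regions are not mutually disjoint.
 */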
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off < check_off) {
		if (__predict_true(off + len <= check_off))
			return (false);
	} else if (off > check_off) {
		if (__predict_true(check_off + check_len <= off))
			return (false);
	}
	return (true);
}

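/*
 * An RNDIS data message validated below looks, schematically (field
 * layout abridged):
 *
 *	+---------------------------------+
 *	| rm_type, rm_len                  |
 *	+---------------------------------+
 *	| rm_dataoffset,    rm_datalen     |  <- offsets are relative;
 *	| rm_oobdataoffset, rm_oobdatalen  |     RNDIS_PACKET_MSG_OFFSET_ABS()
 *	| rm_pktinfooffset, rm_pktinfolen  |     makes them message-absolute
 *	+---------------------------------+
 *	| data / OOB / pktinfo regions     |
 *	+---------------------------------+
 *
 * All offsets and lengths come from the (untrusted) host, so each
 * region is bounds-checked against rm_len and checked for mutual
 * overlap before any byte of it is used; only then is the contained
 * Ethernet frame handed to hn_rxpkt().
 */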
static void
hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_packet_msg *pkt;
	struct hn_rxinfo info;
	int data_off, pktinfo_off, data_len, pktinfo_len;

	/*
	 * Check length.
	 */
	if (__predict_false(dlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
		return;
	}
	pkt = data;

	if (__predict_false(dlen < pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
		return;
	}
	if (__predict_false(pkt->rm_len <
	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
		    "msglen %u, data %u, oob %u, pktinfo %u\n",
		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
		    pkt->rm_pktinfolen);
		return;
	}
	if (__predict_false(pkt->rm_datalen == 0)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
		return;
	}

	/*
	 * Check offsets.
	 */
#define IS_OFFSET_INVALID(ofs)			\
	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

	/* XXX Hyper-V does not meet data offset alignment requirement */
	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data offset %u\n", pkt->rm_dataoffset);
		return;
	}
	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "oob offset %u\n", pkt->rm_oobdataoffset);
		return;
	}
	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
		return;
	}

#undef IS_OFFSET_INVALID

	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
	data_len = pkt->rm_datalen;
	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
	pktinfo_len = pkt->rm_pktinfolen;

	/*
	 * Check OOB coverage.
	 */
	if (__predict_false(pkt->rm_oobdatalen != 0)) {
		int oob_off, oob_len;

		if_printf(rxr->hn_ifp, "got oobdata\n");
		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
		oob_len = pkt->rm_oobdatalen;

		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overflow, msglen %u, oob abs %d len %d\n",
			    pkt->rm_len, oob_off, oob_len);
			return;
		}

		/*
		 * Check against data.
		 */
		if (hn_rndis_check_overlap(oob_off, oob_len,
		    data_off, data_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps data, oob abs %d len %d, "
			    "data abs %d len %d\n",
			    oob_off, oob_len, data_off, data_len);
			return;
		}

		/*
		 * Check against pktinfo.
		 */
		if (pktinfo_len != 0 &&
		    hn_rndis_check_overlap(oob_off, oob_len,
		    pktinfo_off, pktinfo_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps pktinfo, oob abs %d len %d, "
			    "pktinfo abs %d len %d\n",
			    oob_off, oob_len, pktinfo_off, pktinfo_len);
			return;
		}
	}

	/*
	 * Check per-packet-info coverage and find useful per-packet-info.
	 */
	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
	if (__predict_true(pktinfo_len != 0)) {
		bool overlap;
		int error;

		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overflow, msglen %u, "
			    "pktinfo abs %d len %d\n",
			    pkt->rm_len, pktinfo_off, pktinfo_len);
			return;
		}

		/*
		 * Check packet info coverage.
		 */
		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
		    data_off, data_len);
		if (__predict_false(overlap)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overlap data, pktinfo abs %d len %d, "
			    "data abs %d len %d\n",
			    pktinfo_off, pktinfo_len, data_off, data_len);
			return;
		}

		/*
		 * Find useful per-packet-info.
		 */
		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
		    pktinfo_len, &info);
		if (__predict_false(error)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
			    "pktinfo\n");
			return;
		}
	}

	if (__predict_false(data_off + data_len > pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data overflow, msglen %u, data abs %d len %d\n",
		    pkt->rm_len, data_off, data_len);
		return;
	}

	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
}

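/*
 * Dispatch on the RNDIS message type: data messages take the hot path
 * straight into hn_rndis_rx_data(), status indications go to
 * hn_rndis_rx_status(), and everything else is handed to the control
 * message path.
 */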
static __inline void
hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_msghdr *hdr;

	if (__predict_false(dlen < sizeof(*hdr))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
		return;
	}
	hdr = data;

	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
		/* Hot data path. */
		hn_rndis_rx_data(rxr, data, dlen);
		/* Done! */
		return;
	}

	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
	else
		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
}

static void
hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
{
	const struct hn_nvs_hdr *hdr;

	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
		if_printf(sc->hn_ifp, "invalid nvs notify\n");
		return;
	}
	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);

	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
		/* Useless; ignore */
		return;
	}
	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
}

static void
hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt)
{
	struct hn_nvs_sendctx *sndc;

	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
	    VMBUS_CHANPKT_DATALEN(pkt));
	/*
	 * NOTE:
	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
	 * its callback.
	 */
}

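/*
 * NOTE:
 * The completion handler above recovers the hn_nvs_sendctx pointer
 * directly from the channel packet's 64-bit transaction id; the send
 * path presumably stored the pointer there when the request was
 * submitted, which is why the tid must be echoed back by the host
 * unmodified.
 */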
static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

	/* Make sure that this is a RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}

	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}

	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}
		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
	}

	/*
	 * Ack the consumed RXBUF associated w/ this channel packet,
	 * so that this RXBUF can be recycled by the hypervisor.
	 */
	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}

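/*
 * The RXBUF ranges processed above live in a buffer shared with the
 * hypervisor; the host will not reuse that region until the guest
 * acks the channel packet.  hn_nvs_ack_rxbuf() sends that ack as a
 * completion packet carrying the original transaction id; failing to
 * send it would leak the RXBUF region on the host side.
 */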
static void
hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    uint64_t tid)
{
	struct hn_nvs_rndis_ack ack;
	int retries, error;

	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
	ack.nvs_status = HN_NVS_STATUS_OK;

	retries = 0;
again:
	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
	if (__predict_false(error == EAGAIN)) {
		/*
		 * NOTE:
		 * This should _not_ happen in real world, since the
		 * consumption of the TX bufring from the TX path is
		 * controlled.
		 */
		if (rxr->hn_ack_failed == 0)
			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
		rxr->hn_ack_failed++;
		retries++;
		if (retries < 10) {
			DELAY(100);
			goto again;
		}
		/* RXBUF leaks! */
		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
	}
}

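/*
 * hn_chan_callback() below drains the channel with a fixed receive
 * buffer that is grown on demand: on ENOBUFS the buffer size is
 * doubled until it covers the reported packet length, e.g. a 4KB
 * hn_pktbuf and a 9KB channel packet would yield a 16KB replacement
 * buffer (illustrative sizes only), after which the receive is
 * retried.
 */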
static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
	struct hn_rx_ring *rxr = xrxr;
	struct hn_softc *sc = rxr->hn_ifp->if_softc;

	for (;;) {
		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
		int error, pktlen;

		pktlen = rxr->hn_pktbuf_len;
		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
		if (__predict_false(error == ENOBUFS)) {
			void *nbuf;
			int nlen;

			/*
			 * Expand channel packet buffer.
			 *
			 * XXX
			 * Use M_WAITOK here, since allocation failure
			 * is fatal.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);

			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;

		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}

static void
hn_tx_taskq_create(void *arg __unused)
{

	if (vm_guest != VM_GUEST_HV)
		return;

	if (!hn_share_tx_taskq)
		return;

	hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
	    taskqueue_thread_enqueue, &hn_tx_taskq);
	taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
	if (hn_bind_tx_taskq >= 0) {
		int cpu = hn_bind_tx_taskq;
		struct task cpuset_task;
		cpuset_t cpu_set;

		if (cpu > mp_ncpus - 1)
			cpu = mp_ncpus - 1;
		CPU_SETOF(cpu, &cpu_set);
		TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, &cpu_set);
		taskqueue_enqueue(hn_tx_taskq, &cpuset_task);
		taskqueue_drain(hn_tx_taskq, &cpuset_task);
	}
}

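/*
 * NOTE:
 * The CPU binding above works by enqueueing a task onto the freshly
 * created taskqueue and then draining it: the task runs on the
 * taskqueue thread itself, so hn_cpuset_setthread_task() can pin that
 * very thread to the requested cpuset, and the drain makes the
 * binding synchronous before the taskqueue sees any real work.
 */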
SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_create, NULL);

static void
hn_tx_taskq_destroy(void *arg __unused)
{

	if (hn_tx_taskq != NULL)
		taskqueue_free(hn_tx_taskq);
}

SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_destroy, NULL);