2 * Copyright (c) 2010-2012 Citrix Inc.
3 * Copyright (c) 2009-2012,2016 Microsoft Corp.
4 * Copyright (c) 2012 NetApp Inc.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice unmodified, this list of conditions, and the following
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * Copyright (c) 2004-2006 Kip Macy
31 * All rights reserved.
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
58 #include "opt_inet6.h"
61 #include <sys/param.h>
63 #include <sys/kernel.h>
64 #include <sys/limits.h>
65 #include <sys/malloc.h>
67 #include <sys/module.h>
69 #include <sys/queue.h>
72 #include <sys/socket.h>
73 #include <sys/sockio.h>
75 #include <sys/sysctl.h>
76 #include <sys/systm.h>
77 #include <sys/taskqueue.h>
78 #include <sys/buf_ring.h>
80 #include <machine/atomic.h>
81 #include <machine/in_cksum.h>
84 #include <net/ethernet.h>
86 #include <net/if_arp.h>
87 #include <net/if_media.h>
88 #include <net/if_types.h>
89 #include <net/if_var.h>
90 #include <net/if_vlan_var.h>
91 #include <net/rndis.h>
93 #include <netinet/in_systm.h>
94 #include <netinet/in.h>
95 #include <netinet/ip.h>
96 #include <netinet/ip6.h>
97 #include <netinet/tcp.h>
98 #include <netinet/tcp_lro.h>
99 #include <netinet/udp.h>
101 #include <dev/hyperv/include/hyperv.h>
102 #include <dev/hyperv/include/hyperv_busdma.h>
103 #include <dev/hyperv/include/vmbus.h>
104 #include <dev/hyperv/include/vmbus_xact.h>
106 #include <dev/hyperv/netvsc/ndis.h>
107 #include <dev/hyperv/netvsc/if_hnreg.h>
108 #include <dev/hyperv/netvsc/if_hnvar.h>
109 #include <dev/hyperv/netvsc/hn_nvs.h>
110 #include <dev/hyperv/netvsc/hn_rndis.h>
112 #include "vmbus_if.h"
114 #define HN_IFSTART_SUPPORT
116 #define HN_RING_CNT_DEF_MAX 8
118 /* YYY should get it from the underlying channel */
119 #define HN_TX_DESC_CNT 512
121 #define HN_RNDIS_PKT_LEN \
122 (sizeof(struct rndis_packet_msg) + \
123 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \
124 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \
125 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \
126 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
127 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE
128 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE
130 #define HN_TX_DATA_BOUNDARY PAGE_SIZE
131 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET
132 #define HN_TX_DATA_SEGSIZE PAGE_SIZE
133 /* -1 for RNDIS packet message */
134 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1)
136 #define HN_DIRECT_TX_SIZE_DEF 128
138 #define HN_EARLY_TXEOF_THRESH 8
140 #define HN_PKTBUF_LEN_DEF (16 * 1024)
142 #define HN_LROENT_CNT_DEF 128
144 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU)
145 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU)
146 /* YYY 2*MTU is a bit rough, but should be good enough. */
147 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu)
149 #define HN_LRO_ACKCNT_DEF 1
151 #define HN_LOCK_INIT(sc) \
152 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
153 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock)
154 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED)
155 #define HN_LOCK(sc) \
157 while (sx_try_xlock(&(sc)->hn_lock) == 0) \
160 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock)
162 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
163 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP)
164 #define HN_CSUM_IP_HWASSIST(sc) \
165 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
166 #define HN_CSUM_IP6_HWASSIST(sc) \
167 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
169 #define HN_PKTSIZE_MIN(align) \
170 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
171 HN_RNDIS_PKT_LEN, (align))
172 #define HN_PKTSIZE(m, align) \
173 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
176 #ifndef HN_USE_TXDESC_BUFRING
177 SLIST_ENTRY(hn_txdesc) link;
179 STAILQ_ENTRY(hn_txdesc) agg_link;
181 /* Aggregated txdescs, in sending order. */
182 STAILQ_HEAD(, hn_txdesc) agg_list;
184 /* The oldest packet, if transmission aggregation happens. */
186 struct hn_tx_ring *txr;
188 uint32_t flags; /* HN_TXD_FLAG_ */
189 struct hn_nvs_sendctx send_ctx;
193 bus_dmamap_t data_dmap;
195 bus_addr_t rndis_pkt_paddr;
196 struct rndis_packet_msg *rndis_pkt;
197 bus_dmamap_t rndis_pkt_dmap;
200 #define HN_TXD_FLAG_ONLIST 0x0001
201 #define HN_TXD_FLAG_DMAMAP 0x0002
202 #define HN_TXD_FLAG_ONAGG 0x0004
211 #define HN_RXINFO_VLAN 0x0001
212 #define HN_RXINFO_CSUM 0x0002
213 #define HN_RXINFO_HASHINF 0x0004
214 #define HN_RXINFO_HASHVAL 0x0008
215 #define HN_RXINFO_ALL \
218 HN_RXINFO_HASHINF | \
221 #define HN_NDIS_VLAN_INFO_INVALID 0xffffffff
222 #define HN_NDIS_RXCSUM_INFO_INVALID 0
223 #define HN_NDIS_HASH_INFO_INVALID 0
225 static int hn_probe(device_t);
226 static int hn_attach(device_t);
227 static int hn_detach(device_t);
228 static int hn_shutdown(device_t);
229 static void hn_chan_callback(struct vmbus_channel *,
232 static void hn_init(void *);
233 static int hn_ioctl(struct ifnet *, u_long, caddr_t);
234 #ifdef HN_IFSTART_SUPPORT
235 static void hn_start(struct ifnet *);
237 static int hn_transmit(struct ifnet *, struct mbuf *);
238 static void hn_xmit_qflush(struct ifnet *);
239 static int hn_ifmedia_upd(struct ifnet *);
240 static void hn_ifmedia_sts(struct ifnet *,
241 struct ifmediareq *);
243 static int hn_rndis_rxinfo(const void *, int,
245 static void hn_rndis_rx_data(struct hn_rx_ring *,
247 static void hn_rndis_rx_status(struct hn_softc *,
250 static void hn_nvs_handle_notify(struct hn_softc *,
251 const struct vmbus_chanpkt_hdr *);
252 static void hn_nvs_handle_comp(struct hn_softc *,
253 struct vmbus_channel *,
254 const struct vmbus_chanpkt_hdr *);
255 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *,
256 struct vmbus_channel *,
257 const struct vmbus_chanpkt_hdr *);
258 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *,
259 struct vmbus_channel *, uint64_t);
261 #if __FreeBSD_version >= 1100099
262 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
263 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
265 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
266 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
267 #if __FreeBSD_version < 1100095
268 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
270 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
272 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
273 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
274 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
275 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
276 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
277 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
278 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
279 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
280 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
281 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
282 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
283 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
284 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
285 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
287 static void hn_stop(struct hn_softc *);
288 static void hn_init_locked(struct hn_softc *);
289 static int hn_chan_attach(struct hn_softc *,
290 struct vmbus_channel *);
291 static void hn_chan_detach(struct hn_softc *,
292 struct vmbus_channel *);
293 static int hn_attach_subchans(struct hn_softc *);
294 static void hn_detach_allchans(struct hn_softc *);
295 static void hn_chan_rollup(struct hn_rx_ring *,
296 struct hn_tx_ring *);
297 static void hn_set_ring_inuse(struct hn_softc *, int);
298 static int hn_synth_attach(struct hn_softc *, int);
299 static void hn_synth_detach(struct hn_softc *);
300 static int hn_synth_alloc_subchans(struct hn_softc *,
302 static void hn_suspend(struct hn_softc *);
303 static void hn_suspend_data(struct hn_softc *);
304 static void hn_suspend_mgmt(struct hn_softc *);
305 static void hn_resume(struct hn_softc *);
306 static void hn_resume_data(struct hn_softc *);
307 static void hn_resume_mgmt(struct hn_softc *);
308 static void hn_suspend_mgmt_taskfunc(void *, int);
309 static void hn_chan_drain(struct vmbus_channel *);
311 static void hn_update_link_status(struct hn_softc *);
312 static void hn_change_network(struct hn_softc *);
313 static void hn_link_taskfunc(void *, int);
314 static void hn_netchg_init_taskfunc(void *, int);
315 static void hn_netchg_status_taskfunc(void *, int);
316 static void hn_link_status(struct hn_softc *);
318 static int hn_create_rx_data(struct hn_softc *, int);
319 static void hn_destroy_rx_data(struct hn_softc *);
320 static int hn_check_iplen(const struct mbuf *, int);
321 static int hn_set_rxfilter(struct hn_softc *);
322 static int hn_rss_reconfig(struct hn_softc *);
323 static void hn_rss_ind_fixup(struct hn_softc *, int);
324 static int hn_rxpkt(struct hn_rx_ring *, const void *,
325 int, const struct hn_rxinfo *);
327 static int hn_tx_ring_create(struct hn_softc *, int);
328 static void hn_tx_ring_destroy(struct hn_tx_ring *);
329 static int hn_create_tx_data(struct hn_softc *, int);
330 static void hn_fixup_tx_data(struct hn_softc *);
331 static void hn_destroy_tx_data(struct hn_softc *);
332 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *);
333 static int hn_encap(struct ifnet *, struct hn_tx_ring *,
334 struct hn_txdesc *, struct mbuf **);
335 static int hn_txpkt(struct ifnet *, struct hn_tx_ring *,
337 static void hn_set_chim_size(struct hn_softc *, int);
338 static void hn_set_tso_maxsize(struct hn_softc *, int, int);
339 static bool hn_tx_ring_pending(struct hn_tx_ring *);
340 static void hn_tx_ring_qflush(struct hn_tx_ring *);
341 static void hn_resume_tx(struct hn_softc *, int);
342 static void hn_set_txagg(struct hn_softc *);
343 static void *hn_try_txagg(struct ifnet *,
344 struct hn_tx_ring *, struct hn_txdesc *,
346 static int hn_get_txswq_depth(const struct hn_tx_ring *);
347 static void hn_txpkt_done(struct hn_nvs_sendctx *,
348 struct hn_softc *, struct vmbus_channel *,
350 static int hn_txpkt_sglist(struct hn_tx_ring *,
352 static int hn_txpkt_chim(struct hn_tx_ring *,
354 static int hn_xmit(struct hn_tx_ring *, int);
355 static void hn_xmit_taskfunc(void *, int);
356 static void hn_xmit_txeof(struct hn_tx_ring *);
357 static void hn_xmit_txeof_taskfunc(void *, int);
358 #ifdef HN_IFSTART_SUPPORT
359 static int hn_start_locked(struct hn_tx_ring *, int);
360 static void hn_start_taskfunc(void *, int);
361 static void hn_start_txeof(struct hn_tx_ring *);
362 static void hn_start_txeof_taskfunc(void *, int);
365 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
366 "Hyper-V network interface");
368 /* Trust tcp segment verification on host side. */
369 static int hn_trust_hosttcp = 1;
370 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
371 &hn_trust_hosttcp, 0,
372 "Trust tcp segement verification on host side, "
373 "when csum info is missing (global setting)");
375 /* Trust udp datagrams verification on host side. */
376 static int hn_trust_hostudp = 1;
377 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
378 &hn_trust_hostudp, 0,
379 "Trust udp datagram verification on host side, "
380 "when csum info is missing (global setting)");
382 /* Trust ip packets verification on host side. */
383 static int hn_trust_hostip = 1;
384 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
386 "Trust ip packet verification on host side, "
387 "when csum info is missing (global setting)");
389 /* Limit TSO burst size */
390 static int hn_tso_maxlen = IP_MAXPACKET;
391 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
392 &hn_tso_maxlen, 0, "TSO burst limit");
394 /* Limit chimney send size */
395 static int hn_tx_chimney_size = 0;
396 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
397 &hn_tx_chimney_size, 0, "Chimney send packet size limit");
399 /* Limit the size of packet for direct transmission */
400 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
401 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
402 &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
404 /* # of LRO entries per RX ring */
405 #if defined(INET) || defined(INET6)
406 #if __FreeBSD_version >= 1100095
407 static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
408 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
409 &hn_lro_entry_count, 0, "LRO entry count");
413 /* Use shared TX taskqueue */
414 static int hn_share_tx_taskq = 0;
415 SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN,
416 &hn_share_tx_taskq, 0, "Enable shared TX taskqueue");
418 #ifndef HN_USE_TXDESC_BUFRING
419 static int hn_use_txdesc_bufring = 0;
421 static int hn_use_txdesc_bufring = 1;
423 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
424 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
426 /* Bind TX taskqueue to the target CPU */
427 static int hn_bind_tx_taskq = -1;
428 SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
429 &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");
431 #ifdef HN_IFSTART_SUPPORT
432 /* Use ifnet.if_start instead of ifnet.if_transmit */
433 static int hn_use_if_start = 0;
434 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
435 &hn_use_if_start, 0, "Use if_start TX method");
438 /* # of channels to use */
439 static int hn_chan_cnt = 0;
440 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
442 "# of channels to use; each channel has one RX ring and one TX ring");
444 /* # of transmit rings to use */
445 static int hn_tx_ring_cnt = 0;
446 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
447 &hn_tx_ring_cnt, 0, "# of TX rings to use");
449 /* Software TX ring depth */
450 static int hn_tx_swq_depth = 0;
451 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
452 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
454 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
455 #if __FreeBSD_version >= 1100095
456 static u_int hn_lro_mbufq_depth = 0;
457 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
458 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
461 /* Packet transmission aggregation size limit */
462 static int hn_tx_agg_size = -1;
463 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
464 &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
466 /* Packet transmission aggregation count limit */
467 static int hn_tx_agg_pkts = 0;
468 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
469 &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
471 static u_int hn_cpu_index; /* next CPU for channel */
472 static struct taskqueue *hn_tx_taskq; /* shared TX taskqueue */
475 hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
476 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
477 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
478 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
479 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
480 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
483 static device_method_t hn_methods[] = {
484 /* Device interface */
485 DEVMETHOD(device_probe, hn_probe),
486 DEVMETHOD(device_attach, hn_attach),
487 DEVMETHOD(device_detach, hn_detach),
488 DEVMETHOD(device_shutdown, hn_shutdown),
492 static driver_t hn_driver = {
495 sizeof(struct hn_softc)
498 static devclass_t hn_devclass;
500 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
501 MODULE_VERSION(hn, 1);
502 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
504 #if __FreeBSD_version >= 1100099
/*
 * Propagate the LRO aggregated-length limit (bytes) to every RX ring
 * currently in use.  Called e.g. when multiple RX rings are enabled to
 * shorten aggregation and improve ACK timeliness.
 */
506 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
510 for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
511 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
/*
 * Transmit an RNDIS data packet using the scatter/gather (GPA) list
 * prepared in the TX ring — the non-chimney send path.  The txd must
 * NOT own a chimney buffer (asserted below).  Returns the hn_nvs_send
 * result; completion is reported through txd->send_ctx.
 */
516 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
519 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
520 txd->chim_size == 0, ("invalid rndis sglist txd"));
521 return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
522 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
/*
 * Transmit an RNDIS data packet that has already been copied into a
 * chimney (pre-posted send) buffer; only the small hn_nvs_rndis
 * descriptor referencing that buffer goes over the channel.  The txd
 * must own a valid chimney buffer (asserted below).
 */
526 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
528 struct hn_nvs_rndis rndis;
530 KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
531 txd->chim_size > 0, ("invalid rndis chim txd"));
533 rndis.nvs_type = HN_NVS_TYPE_RNDIS;
534 rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
535 rndis.nvs_chim_idx = txd->chim_index;
536 rndis.nvs_chim_sz = txd->chim_size;
/* VMBUS_CHANPKT_FLAG_RC: request a completion so the txd can be freed. */
538 return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
539 &rndis, sizeof(rndis), &txd->send_ctx));
/*
 * Allocate one chimney send buffer slot from the softc's bitmap.
 * Lock-free: a free bit is located with ffsl() on the inverted word and
 * claimed with an atomic test-and-set; if another CPU claims it first,
 * the search continues.  Returns the slot index, or
 * HN_NVS_CHIM_IDX_INVALID when no slot is available.
 */
542 static __inline uint32_t
543 hn_chim_alloc(struct hn_softc *sc)
545 int i, bmap_cnt = sc->hn_chim_bmap_cnt;
546 u_long *bmap = sc->hn_chim_bmap;
547 uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
549 for (i = 0; i < bmap_cnt; ++i) {
/* ffsl on the complement finds the lowest clear (free) bit. */
552 idx = ffsl(~bmap[i]);
556 --idx; /* ffsl is 1-based */
557 KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
558 ("invalid i %d and idx %d", i, idx));
/* Lost the race for this bit: another thread set it first. */
560 if (atomic_testandset_long(&bmap[i], idx))
563 ret = i * LONG_BIT + idx;
/*
 * Return a chimney send buffer slot to the softc's bitmap.  The slot
 * must currently be marked allocated (asserted); the release itself is
 * an atomic bit clear, pairing with hn_chim_alloc().
 */
570 hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
575 idx = chim_idx / LONG_BIT;
576 KASSERT(idx < sc->hn_chim_bmap_cnt,
577 ("invalid chimney index 0x%x", chim_idx));
579 mask = 1UL << (chim_idx % LONG_BIT);
580 KASSERT(sc->hn_chim_bmap[idx] & mask,
581 ("index bitmap 0x%lx, chimney index %u, "
582 "bitmap idx %d, bitmask 0x%lx",
583 sc->hn_chim_bmap[idx], chim_idx, idx, mask));
585 atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
588 #if defined(INET6) || defined(INET)
590 * NOTE: If this function fails, the m_head will have been freed.
/*
 * Fix up a TSO mbuf chain before handing it to the host: locate the
 * L2/L3/L4 headers (pulling them into the first mbuf as needed) and
 * seed th_sum with the pseudo-header checksum the host side expects.
 * Handles both IPv4 (CSUM_IP_TSO) and IPv6 TSO paths.
 */
592 static __inline struct mbuf *
593 hn_tso_fixup(struct mbuf *m_head)
595 struct ether_vlan_header *evl;
599 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
/* Ensure at least 'len' contiguous bytes at the front of 'm'. */
601 #define PULLUP_HDR(m, len) \
603 if (__predict_false((m)->m_len < (len))) { \
604 (m) = m_pullup((m), (len)); \
/* Determine the Ethernet header length (VLAN-tagged or plain). */
610 PULLUP_HDR(m_head, sizeof(*evl));
611 evl = mtod(m_head, struct ether_vlan_header *);
612 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
613 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
615 ehlen = ETHER_HDR_LEN;
618 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
/* IPv4 TSO: header length is variable (ip_hl). */
622 PULLUP_HDR(m_head, ehlen + sizeof(*ip));
623 ip = mtodo(m_head, ehlen);
624 iphlen = ip->ip_hl << 2;
626 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
627 th = mtodo(m_head, ehlen + iphlen);
/* Pseudo-header checksum over src/dst addresses and protocol. */
631 th->th_sum = in_pseudo(ip->ip_src.s_addr,
632 ip->ip_dst.s_addr, htons(IPPROTO_TCP));
635 #if defined(INET6) && defined(INET)
/* IPv6 TSO: fixed 40-byte header; extension headers unsupported. */
642 PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
643 ip6 = mtodo(m_head, ehlen);
644 if (ip6->ip6_nxt != IPPROTO_TCP) {
649 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
650 th = mtodo(m_head, ehlen + sizeof(*ip6));
653 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
660 #endif /* INET6 || INET */
/*
 * Translate the ifnet flags (promiscuous/broadcast/multicast) into an
 * NDIS packet filter and program it into the host via RNDIS, but only
 * when it differs from the currently installed filter.  Returns the
 * RNDIS set result (0 on success or no change needed).
 */
663 hn_set_rxfilter(struct hn_softc *sc)
665 struct ifnet *ifp = sc->hn_ifp;
671 if (ifp->if_flags & IFF_PROMISC) {
672 filter = NDIS_PACKET_TYPE_PROMISCUOUS;
674 filter = NDIS_PACKET_TYPE_DIRECTED;
675 if (ifp->if_flags & IFF_BROADCAST)
676 filter |= NDIS_PACKET_TYPE_BROADCAST;
677 /* TODO: support multicast list */
/* Any multicast membership falls back to all-multicast for now. */
678 if ((ifp->if_flags & IFF_ALLMULTI) ||
679 !TAILQ_EMPTY(&ifp->if_multiaddrs))
680 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
/* Only touch the host when the filter actually changed. */
683 if (sc->hn_rx_filter != filter) {
684 error = hn_rndis_set_rxfilter(sc, filter);
686 sc->hn_rx_filter = filter;
/*
 * Compute the effective TX packet-aggregation limits (size in bytes and
 * packet count) from the administratively configured values
 * (sc->hn_agg_size/hn_agg_pkts) clamped by what the host's RNDIS
 * offered (sc->hn_rndis_agg_*), then install them into every TX ring
 * under the per-ring TX lock.  Aggregation uses only chimney buffers,
 * so the size is further capped by the chimney buffer maximum.
 */
692 hn_set_txagg(struct hn_softc *sc)
698 * Setup aggregation size.
700 if (sc->hn_agg_size < 0)
703 size = sc->hn_agg_size;
705 if (sc->hn_rndis_agg_size < size)
706 size = sc->hn_rndis_agg_size;
/* Too small to fit even two minimum-sized packets: disable. */
708 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
715 /* NOTE: Type of the per TX ring setting is 'int'. */
719 /* NOTE: We only aggregate packets using chimney sending buffers. */
720 if (size > (uint32_t)sc->hn_chim_szmax)
721 size = sc->hn_chim_szmax;
724 * Setup aggregation packet count.
726 if (sc->hn_agg_pkts < 0)
729 pkts = sc->hn_agg_pkts;
731 if (sc->hn_rndis_agg_pkts < pkts)
732 pkts = sc->hn_rndis_agg_pkts;
741 /* NOTE: Type of the per TX ring setting is 'short'. */
746 /* NOTE: Type of the per TX ring setting is 'short'. */
747 if (sc->hn_rndis_agg_align > SHRT_MAX) {
754 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
755 size, pkts, sc->hn_rndis_agg_align);
/* Publish the computed limits to all TX rings. */
758 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
759 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
761 mtx_lock(&txr->hn_tx_lock);
762 txr->hn_agg_szmax = size;
763 txr->hn_agg_pktmax = pkts;
764 txr->hn_agg_align = sc->hn_rndis_agg_align;
765 mtx_unlock(&txr->hn_tx_lock);
/*
 * Return the software TX queue depth for a ring: the hn_tx_swq_depth
 * tunable, but never less than the ring's TX descriptor count.  The TX
 * ring must already be set up (asserted).
 */
770 hn_get_txswq_depth(const struct hn_tx_ring *txr)
773 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
774 if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
775 return txr->hn_txdesc_cnt;
776 return hn_tx_swq_depth;
/*
 * Re-apply the RSS configuration (key and/or indirect table) to the
 * host.  The host does not honor in-place updates via the UNCHG flags,
 * so RSS is fully disabled first and then re-enabled with the updated
 * parameters.  No-op unless the synthetic parts are attached.  Returns
 * 0 on success or the RNDIS error.
 */
780 hn_rss_reconfig(struct hn_softc *sc)
786 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
793 * Direct reconfiguration by setting the UNCHG flags does
794 * _not_ work properly.
797 if_printf(sc->hn_ifp, "disable RSS\n");
798 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
800 if_printf(sc->hn_ifp, "RSS disable failed\n");
805 * Reenable the RSS w/ the updated RSS key or indirect
809 if_printf(sc->hn_ifp, "reconfig RSS\n");
810 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
812 if_printf(sc->hn_ifp, "RSS reconfig failed\n");
/*
 * Clamp every entry of the RSS indirect table to the valid channel
 * range [0, nchan), logging each fixup.  Needed when the number of
 * usable channels shrinks below what the table was built for.
 */
819 hn_rss_ind_fixup(struct hn_softc *sc, int nchan)
821 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
824 KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
827 * Check indirect table to make sure that all channels in it
830 for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
831 if (rss->rss_ind[i] >= nchan) {
832 if_printf(sc->hn_ifp,
833 "RSS indirect table %d fixup: %u -> %d\n",
834 i, rss->rss_ind[i], nchan - 1);
835 rss->rss_ind[i] = nchan - 1;
/*
 * ifmedia "change media" callback.  The synthetic NIC exposes a single
 * emulated media type, so there is nothing to change.
 */
841 hn_ifmedia_upd(struct ifnet *ifp __unused)
/*
 * ifmedia "status" callback: report link state from the softc's
 * HN_LINK_FLAG_LINKUP flag.  The 10G full-duplex media reported here is
 * synthetic — the real speed depends on the host.
 */
848 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
850 struct hn_softc *sc = ifp->if_softc;
852 ifmr->ifm_status = IFM_AVALID;
853 ifmr->ifm_active = IFM_ETHER;
855 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
856 ifmr->ifm_active |= IFM_NONE;
859 ifmr->ifm_status |= IFM_ACTIVE;
860 ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
863 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
/*
 * VMBus device-type GUID of the Hyper-V synthetic network device.
 * NOTE(review): the first three GUID fields appear byte-swapped
 * (little-endian on the wire), matching the hyperv_guid convention.
 */
864 static const struct hyperv_guid g_net_vsc_device_type = {
865 .hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
866 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
/*
 * device_probe method: match the VMBus child against the synthetic
 * network device-type GUID and set the device description on success.
 */
870 hn_probe(device_t dev)
873 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
874 &g_net_vsc_device_type) == 0) {
875 device_set_desc(dev, "Hyper-V Network Interface");
876 return BUS_PROBE_DEFAULT;
/*
 * Taskqueue callback that pins the executing taskqueue thread to the
 * CPU set passed in xmask.  Enqueued (and drained) from hn_attach() so
 * the TX taskqueue thread itself runs it; failure to pin is fatal.
 */
882 hn_cpuset_setthread_task(void *xmask, int pending __unused)
884 cpuset_t *mask = xmask;
887 error = cpuset_setthread(curthread->td_tid, mask);
889 panic("curthread=%ju: can't pin; error=%d",
890 (uintmax_t)curthread->td_tid, error);
/*
 * device_attach method.  Ordering matters:
 *   1. snapshot tunables and create the TX/management taskqueues;
 *   2. allocate and name the ifnet early so if_printf works;
 *   3. size the RX/TX rings from tunables and CPU count;
 *   4. create ring data, the NVS/RNDIS transaction context, and attach
 *      the synthetic parts (NVS + RNDIS);
 *   5. register sysctl nodes, finish ifmedia/ifnet setup, attach
 *      Ethernet, and kick off the initial link-status check.
 * On failure the synthetic parts are detached before unwinding.
 */
895 hn_attach(device_t dev)
897 struct hn_softc *sc = device_get_softc(dev);
898 struct sysctl_oid_list *child;
899 struct sysctl_ctx_list *ctx;
900 uint8_t eaddr[ETHER_ADDR_LEN];
901 struct ifnet *ifp = NULL;
902 int error, ring_cnt, tx_ring_cnt;
905 sc->hn_prichan = vmbus_get_channel(dev);
909 * Initialize these tunables once.
911 sc->hn_agg_size = hn_tx_agg_size;
912 sc->hn_agg_pkts = hn_tx_agg_pkts;
915 * Setup taskqueue for transmission.
917 if (hn_tx_taskq == NULL) {
918 sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
919 taskqueue_thread_enqueue, &sc->hn_tx_taskq);
920 taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx",
921 device_get_nameunit(dev));
/* Optionally pin the private TX taskqueue thread to one CPU. */
922 if (hn_bind_tx_taskq >= 0) {
923 int cpu = hn_bind_tx_taskq;
924 struct task cpuset_task;
927 if (cpu > mp_ncpus - 1)
929 CPU_SETOF(cpu, &cpu_set);
930 TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task,
/* Drain so the pinning completes before we proceed. */
932 taskqueue_enqueue(sc->hn_tx_taskq, &cpuset_task);
933 taskqueue_drain(sc->hn_tx_taskq, &cpuset_task);
936 sc->hn_tx_taskq = hn_tx_taskq;
940 * Setup taskqueue for management tasks, e.g. link status.
942 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
943 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
944 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
945 device_get_nameunit(dev));
946 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
947 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
948 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
949 hn_netchg_status_taskfunc, sc);
952 * Allocate ifnet and setup its name earlier, so that if_printf
953 * can be used by functions, which will be called after
956 ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
958 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
961 * Initialize ifmedia earlier so that it can be unconditionally
962 * destroyed, if error happened later on.
964 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
967 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
968 * to use (tx_ring_cnt).
971 * The # of RX rings to use is same as the # of channels to use.
973 ring_cnt = hn_chan_cnt;
977 if (ring_cnt > HN_RING_CNT_DEF_MAX)
978 ring_cnt = HN_RING_CNT_DEF_MAX;
979 } else if (ring_cnt > mp_ncpus) {
983 tx_ring_cnt = hn_tx_ring_cnt;
984 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
985 tx_ring_cnt = ring_cnt;
986 #ifdef HN_IFSTART_SUPPORT
987 if (hn_use_if_start) {
988 /* ifnet.if_start only needs one TX ring. */
994 * Set the leader CPU for channels.
996 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
999 * Create enough TX/RX rings, even if only limited number of
1000 * channels can be allocated.
1002 error = hn_create_tx_data(sc, tx_ring_cnt);
1005 error = hn_create_rx_data(sc, ring_cnt);
1010 * Create transaction context for NVS and RNDIS transactions.
1012 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
1013 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
1014 if (sc->hn_xact == NULL)
1018 * Attach the synthetic parts, i.e. NVS and RNDIS.
1020 error = hn_synth_attach(sc, ETHERMTU);
1024 error = hn_rndis_get_eaddr(sc, eaddr);
1028 #if __FreeBSD_version >= 1100099
1029 if (sc->hn_rx_ring_inuse > 1) {
1031 * Reduce TCP segment aggregation limit for multiple
1032 * RX rings to increase ACK timeliness.
1034 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
1039 * Fixup TX stuffs after synthetic parts are attached.
1041 hn_fixup_tx_data(sc);
/* Export driver/host state and knobs through per-device sysctls. */
1043 ctx = device_get_sysctl_ctx(dev);
1044 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
1045 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
1046 &sc->hn_nvs_ver, 0, "NVS version");
1047 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
1048 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1049 hn_ndis_version_sysctl, "A", "NDIS version");
1050 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
1051 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1052 hn_caps_sysctl, "A", "capabilities");
1053 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
1054 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1055 hn_hwassist_sysctl, "A", "hwassist");
1056 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
1057 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1058 hn_rxfilter_sysctl, "A", "rxfilter");
1059 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
1060 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
1061 hn_rss_hash_sysctl, "A", "RSS hash");
1062 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
1063 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
1064 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
1065 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1066 hn_rss_key_sysctl, "IU", "RSS key");
1067 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
1068 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1069 hn_rss_ind_sysctl, "IU", "RSS indirect table");
1070 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
1071 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
1072 "RNDIS offered packet transmission aggregation size limit");
1073 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
1074 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
1075 "RNDIS offered packet transmission aggregation count limit");
1076 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
1077 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
1078 "RNDIS packet transmission aggregation alignment");
1079 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
1080 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1081 hn_txagg_size_sysctl, "I",
1082 "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
1083 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
1084 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1085 hn_txagg_pkts_sysctl, "I",
1086 "Packet transmission aggregation packets, "
1087 "0 -- disable, -1 -- auto");
1090 * Setup the ifmedia, which has been initialized earlier.
1092 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
1093 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
1094 /* XXX ifmedia_set really should do this for us */
1095 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
1098 * Setup the ifnet for this interface.
1102 ifp->if_baudrate = IF_Gbps(10);
1104 /* if_baudrate is 32bits on 32bit system. */
1105 ifp->if_baudrate = IF_Gbps(1);
1107 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
1108 ifp->if_ioctl = hn_ioctl;
1109 ifp->if_init = hn_init;
1110 #ifdef HN_IFSTART_SUPPORT
1111 if (hn_use_if_start) {
1112 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
1114 ifp->if_start = hn_start;
1115 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
1116 ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
1117 IFQ_SET_READY(&ifp->if_snd);
1121 ifp->if_transmit = hn_transmit;
1122 ifp->if_qflush = hn_xmit_qflush;
1125 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
1127 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
1128 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
1130 if (sc->hn_caps & HN_CAP_VLAN) {
1131 /* XXX not sure about VLAN_MTU. */
1132 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
/* TX checksum/TSO capabilities follow what RNDIS negotiation allowed. */
1135 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
1136 if (ifp->if_hwassist & HN_CSUM_IP_MASK)
1137 ifp->if_capabilities |= IFCAP_TXCSUM;
1138 if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
1139 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
1140 if (sc->hn_caps & HN_CAP_TSO4) {
1141 ifp->if_capabilities |= IFCAP_TSO4;
1142 ifp->if_hwassist |= CSUM_IP_TSO;
1144 if (sc->hn_caps & HN_CAP_TSO6) {
1145 ifp->if_capabilities |= IFCAP_TSO6;
1146 ifp->if_hwassist |= CSUM_IP6_TSO;
1149 /* Enable all available capabilities by default. */
1150 ifp->if_capenable = ifp->if_capabilities;
1152 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
1153 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
1154 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
1155 ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
1158 ether_ifattach(ifp, eaddr);
1160 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
1161 if_printf(ifp, "TSO segcnt %u segsz %u\n",
1162 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
1165 /* Inform the upper layer about the long frame support. */
1166 ifp->if_hdrlen = sizeof(struct ether_vlan_header);
1169 * Kick off link status check.
1171 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
1172 hn_update_link_status(sc);
/* Error unwind: detach synthetic parts if they were attached. */
1176 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
1177 hn_synth_detach(sc);
/*
 * Device detach method: quiesce and detach the synthetic parts (NVS/RNDIS),
 * unhook the ifnet, then release rings, taskqueues, the VMBus transaction
 * context, and the softc lock.
 */
1183 hn_detach(device_t dev)
1185 struct hn_softc *sc = device_get_softc(dev);
1186 struct ifnet *ifp = sc->hn_ifp;
1188 if (device_is_attached(dev)) {
1190 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
1191 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
1195 * hn_stop() only suspends data, so management
1196 * stuff has to be suspended manually here.
1198 hn_suspend_mgmt(sc);
1199 hn_synth_detach(sc);
1202 ether_ifdetach(ifp);
1205 ifmedia_removeall(&sc->hn_media);
1206 hn_destroy_rx_data(sc);
1207 hn_destroy_tx_data(sc);
/* Only free the TX taskqueue when it is private to this device. */
1209 if (sc->hn_tx_taskq != hn_tx_taskq)
1210 taskqueue_free(sc->hn_tx_taskq);
1211 taskqueue_free(sc->hn_mgmt_taskq0);
1213 if (sc->hn_xact != NULL)
1214 vmbus_xact_ctx_destroy(sc->hn_xact);
1218 HN_LOCK_DESTROY(sc);
1223 hn_shutdown(device_t dev)
/*
 * Query the RNDIS link status from the host and propagate it to the
 * ifnet via if_link_state_change(), tracking it in hn_link_flags.
 */
1230 hn_link_status(struct hn_softc *sc)
1232 uint32_t link_status;
1235 error = hn_rndis_get_linkstatus(sc, &link_status);
1237 /* XXX what to do? */
1241 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
1242 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
1244 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
/* Report the (possibly unchanged) state to the network stack. */
1245 if_link_state_change(sc->hn_ifp,
1246 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
1247 LINK_STATE_UP : LINK_STATE_DOWN);
/*
 * Management taskqueue handler for link status checks; bails out while
 * a network-change sequence (HN_LINK_FLAG_NETCHG) is in flight.
 */
1251 hn_link_taskfunc(void *xsc, int pending __unused)
1253 struct hn_softc *sc = xsc;
1255 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
/*
 * First stage of a network-change event: block further link status
 * checks, report link down, and schedule the delayed status task that
 * completes the fake link-down -> link-up cycle.
 */
1261 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
1263 struct hn_softc *sc = xsc;
1265 /* Prevent any link status checks from running. */
1266 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
1269 * Fake up a [link down --> link up] state change; 5 seconds
1270 * delay is used, which closely simulates miibus reaction
1271 * upon link down event.
1273 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
1274 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
1275 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
1276 &sc->hn_netchg_status, 5 * hz);
/* Second stage of a network-change event: re-enable link status checks. */
1280 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
1282 struct hn_softc *sc = xsc;
1284 /* Re-allow link status checks. */
1285 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
/* Kick off an asynchronous link status check on the management taskqueue. */
1290 hn_update_link_status(struct hn_softc *sc)
1293 if (sc->hn_mgmt_taskq != NULL)
1294 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
/* Kick off the network-change sequence on the management taskqueue. */
1298 hn_change_network(struct hn_softc *sc)
1301 if (sc->hn_mgmt_taskq != NULL)
1302 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
/*
 * DMA-map an outgoing mbuf chain for sglist transmission.  If the chain
 * has too many segments (EFBIG), collapse it once to at most
 * HN_TX_DATA_SEGCNT_MAX segments and retry; *m_head may be replaced by
 * the collapsed chain.  On success the map is synced for device writes
 * and the txdesc is marked as owning a DMA map.
 */
1306 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
1307 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
1309 struct mbuf *m = *m_head;
/* A chimney-sending txdesc must never be DMA-mapped. */
1312 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
1314 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
1315 m, segs, nsegs, BUS_DMA_NOWAIT);
1316 if (error == EFBIG) {
1319 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
1323 *m_head = m = m_new;
1324 txr->hn_tx_collapsed++;
1326 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
1327 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
1330 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
1331 BUS_DMASYNC_PREWRITE);
1332 txd->flags |= HN_TXD_FLAG_DMAMAP;
/*
 * Drop one reference on a TX descriptor.  On the last reference, free
 * all txdescs aggregated onto it, release its chimney buffer or DMA
 * map, and return it to the ring's free list (spin-mutex SLIST) or
 * buf_ring, depending on HN_USE_TXDESC_BUFRING.  Callers use the
 * return value to assert that the descriptor was actually freed.
 */
1338 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
1341 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
1342 ("put an onlist txd %#x", txd->flags))
1343 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1344 ("put an onagg txd %#x", txd->flags));
1346 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
/* Only the transition to zero references frees the descriptor. */
1347 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
/* Recursively free every txdesc that was aggregated onto this one. */
1350 if (!STAILQ_EMPTY(&txd->agg_list)) {
1351 struct hn_txdesc *tmp_txd;
1353 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
1356 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
1357 ("resursive aggregation on aggregated txdesc"));
1358 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
1359 ("not aggregated txdesc"));
1360 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1361 ("aggregated txdesc uses dmamap"));
1362 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1363 ("aggregated txdesc consumes "
1364 "chimney sending buffer"));
1365 KASSERT(tmp_txd->chim_size == 0,
1366 ("aggregated txdesc has non-zero "
1367 "chimney sending size"));
1369 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
1370 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
1371 freed = hn_txdesc_put(txr, tmp_txd);
1372 KASSERT(freed, ("failed to free aggregated txdesc"));
/* Release the chimney buffer slot or the DMA map, whichever is held. */
1376 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
1377 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1378 ("chim txd uses dmamap"));
1379 hn_chim_free(txr->hn_sc, txd->chim_index);
1380 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1382 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
1383 bus_dmamap_sync(txr->hn_tx_data_dtag,
1384 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
1385 bus_dmamap_unload(txr->hn_tx_data_dtag,
1387 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
1390 if (txd->m != NULL) {
/* Mark free and give the descriptor back to the ring. */
1395 txd->flags |= HN_TXD_FLAG_ONLIST;
1396 #ifndef HN_USE_TXDESC_BUFRING
1397 mtx_lock_spin(&txr->hn_txlist_spin);
1398 KASSERT(txr->hn_txdesc_avail >= 0 &&
1399 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
1400 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
1401 txr->hn_txdesc_avail++;
1402 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
1403 mtx_unlock_spin(&txr->hn_txlist_spin);
1405 atomic_add_int(&txr->hn_txdesc_avail, 1);
1406 buf_ring_enqueue(txr->hn_txdesc_br, txd);
/*
 * Allocate a free TX descriptor from the ring (SLIST under a spin mutex,
 * or buf_ring, per HN_USE_TXDESC_BUFRING).  The returned descriptor is
 * asserted to be completely clean before it is taken off the free list.
 * Returns NULL-equivalent behavior when the free list is empty
 * (elided branch not visible in this chunk).
 */
1412 static __inline struct hn_txdesc *
1413 hn_txdesc_get(struct hn_tx_ring *txr)
1415 struct hn_txdesc *txd;
1417 #ifndef HN_USE_TXDESC_BUFRING
1418 mtx_lock_spin(&txr->hn_txlist_spin);
1419 txd = SLIST_FIRST(&txr->hn_txlist);
1421 KASSERT(txr->hn_txdesc_avail > 0,
1422 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
1423 txr->hn_txdesc_avail--;
1424 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
1426 mtx_unlock_spin(&txr->hn_txlist_spin);
1428 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
1432 #ifdef HN_USE_TXDESC_BUFRING
1433 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
/* A freshly dequeued txdesc must carry no state from its previous use. */
1435 KASSERT(txd->m == NULL && txd->refs == 0 &&
1436 STAILQ_EMPTY(&txd->agg_list) &&
1437 txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
1438 txd->chim_size == 0 &&
1439 (txd->flags & HN_TXD_FLAG_ONLIST) &&
1440 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
1441 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
1442 txd->flags &= ~HN_TXD_FLAG_ONLIST;
/* Take an additional reference on an already-referenced TX descriptor. */
1448 static __inline void
1449 hn_txdesc_hold(struct hn_txdesc *txd)
1452 /* 0->1 transition will never work */
1453 KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
1454 atomic_add_int(&txd->refs, 1);
/*
 * Link txd onto agg_txd's aggregation list.  Only one level of
 * aggregation is allowed; the KASSERTs reject nested aggregation in
 * either direction.
 */
1457 static __inline void
1458 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
1461 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1462 ("recursive aggregation on aggregating txdesc"));
1464 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
1465 ("already aggregated"));
1466 KASSERT(STAILQ_EMPTY(&txd->agg_list),
1467 ("recursive aggregation on to-be-aggregated txdesc"));
1469 txd->flags |= HN_TXD_FLAG_ONAGG;
1470 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
/*
 * Return true when any TX descriptor of this ring is still outstanding,
 * i.e. the free list/buf_ring does not hold the full descriptor count.
 */
1474 hn_tx_ring_pending(struct hn_tx_ring *txr)
1476 bool pending = false;
1478 #ifndef HN_USE_TXDESC_BUFRING
1479 mtx_lock_spin(&txr->hn_txlist_spin);
1480 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
1482 mtx_unlock_spin(&txr->hn_txlist_spin);
1484 if (!buf_ring_full(txr->hn_txdesc_br))
/* Acknowledge TX completions: clear the pending-txeof flag for the ring. */
1490 static __inline void
1491 hn_txeof(struct hn_tx_ring *txr)
1493 txr->hn_has_txeof = 0;
/*
 * NVS send-completion callback: release the packet's TX descriptor,
 * flag the ring for txeof processing, and perform early txeof handling
 * once enough completions have accumulated (HN_EARLY_TXEOF_THRESH)
 * while the ring is marked oactive.
 */
1498 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
1499 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
1501 struct hn_txdesc *txd = sndc->hn_cbarg;
1502 struct hn_tx_ring *txr;
1505 KASSERT(txr->hn_chan == chan,
1506 ("channel mismatch, on chan%u, should be chan%u",
1507 vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan)));
1509 txr->hn_has_txeof = 1;
1510 hn_txdesc_put(txr, txd);
1512 ++txr->hn_txdone_cnt;
1513 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
1514 txr->hn_txdone_cnt = 0;
1515 if (txr->hn_oactive)
/*
 * Per-channel rollup: flush all pending LRO aggregations on the RX ring,
 * then handle deferred TX completions on the TX ring (if any).
 */
1521 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
1523 #if defined(INET) || defined(INET6)
1524 struct lro_ctrl *lro = &rxr->hn_lro;
1525 struct lro_entry *queued;
1527 while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
1528 SLIST_REMOVE_HEAD(&lro->lro_active, next);
1529 tcp_lro_flush(lro, queued);
1535 * 'txr' could be NULL, if multiple channels and
1536 * ifnet.if_start method are enabled.
1538 if (txr == NULL || !txr->hn_has_txeof)
1541 txr->hn_txdone_cnt = 0;
/*
 * Convert an offset counted from the start of the RNDIS packet message
 * into the on-wire form, which counts from rm_dataoffset.
 */
1545 static __inline uint32_t
1546 hn_rndis_pktmsg_offset(uint32_t ofs)
1549 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
1550 ("invalid RNDIS packet msg offset %u", ofs));
1551 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
/*
 * Append a per-packet-info (PPI) record of type 'pi_type' with data
 * length 'pi_dlen' to the RNDIS packet message, growing rm_pktinfolen,
 * rm_dataoffset and rm_len accordingly.  'pktsize' bounds the buffer
 * holding the message.  Returns a pointer to the PPI data area for the
 * caller to fill in.
 */
1554 static __inline void *
1555 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1556 size_t pi_dlen, uint32_t pi_type)
1558 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
1559 struct rndis_pktinfo *pi;
1561 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
1562 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
1565 * Per-packet-info does not move; it only grows.
1568 * rm_pktinfooffset in this phase counts from the beginning
1569 * of rndis_packet_msg.
1571 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
1572 ("%u pktinfo overflows RNDIS packet msg", pi_type));
/* New PPI record goes right after the existing ones. */
1573 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
1574 pkt->rm_pktinfolen);
1575 pkt->rm_pktinfolen += pi_size;
1577 pi->rm_size = pi_size;
1578 pi->rm_type = pi_type;
1579 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
1581 /* Data immediately follow per-packet-info. */
1582 pkt->rm_dataoffset += pi_size;
1584 /* Update RNDIS packet msg length */
1585 pkt->rm_len += pi_size;
1587 return (pi->rm_data);
/*
 * Transmit the ring's current aggregating txdesc via hn_txpkt() and
 * reset all aggregation state.  On hn_txpkt() failure the aggregated
 * packet count (saved beforehand) is charged to OERRORS.
 */
1591 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
1593 struct hn_txdesc *txd;
1597 txd = txr->hn_agg_txd;
1598 KASSERT(txd != NULL, ("no aggregate txdesc"));
1601 * Since hn_txpkt() will reset this temporary stat, save
1602 * it now, so that oerrors can be updated properly, if
1603 * hn_txpkt() ever fails.
1605 pkts = txr->hn_stat_pkts;
1608 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
1609 * failure, save it for later freeing, if hn_txpkt() ever
1613 error = hn_txpkt(ifp, txr, txd);
1614 if (__predict_false(error)) {
1615 /* txd is freed, but m is not. */
1618 txr->hn_flush_failed++;
1619 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
1622 /* Reset all aggregation states. */
1623 txr->hn_agg_txd = NULL;
1624 txr->hn_agg_szleft = 0;
1625 txr->hn_agg_pktleft = 0;
1626 txr->hn_agg_prevpkt = NULL;
/*
 * Try to place 'txd' (needing 'pktsize' bytes) into the chimney sending
 * buffer.  If an aggregating txdesc exists and has room, append this
 * packet to it (after padding the previous RNDIS packet to the required
 * alignment); otherwise flush any pending aggregation, allocate a fresh
 * chimney slot, and possibly start a new aggregation with this txd.
 * Returns a pointer into the chimney buffer where the caller builds the
 * RNDIS packet, or a failure value when no chimney slot is available
 * (elided branch not visible in this chunk).
 */
1632 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1637 if (txr->hn_agg_txd != NULL) {
1638 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
1639 struct hn_txdesc *agg_txd = txr->hn_agg_txd;
1640 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
1644 * Update the previous RNDIS packet's total length,
1645 * it can be increased due to the mandatory alignment
1646 * padding for this RNDIS packet. And update the
1647 * aggregating txdesc's chimney sending buffer size
1651 * Zero-out the padding, as required by the RNDIS spec.
1654 pkt->rm_len = roundup2(olen, txr->hn_agg_align);
1655 agg_txd->chim_size += pkt->rm_len - olen;
1657 /* Link this txdesc to the parent. */
1658 hn_txdesc_agg(agg_txd, txd);
1660 chim = (uint8_t *)pkt + pkt->rm_len;
1661 /* Save the current packet for later fixup. */
1662 txr->hn_agg_prevpkt = chim;
1664 txr->hn_agg_pktleft--;
1665 txr->hn_agg_szleft -= pktsize;
1666 if (txr->hn_agg_szleft <=
1667 HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1669 * Probably can't aggregate more packets,
1670 * flush this aggregating txdesc proactively.
1672 txr->hn_agg_pktleft = 0;
/* Could not append to the current aggregation; flush it first. */
1677 hn_flush_txagg(ifp, txr);
1679 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
1681 txr->hn_tx_chimney_tried++;
1682 txd->chim_index = hn_chim_alloc(txr->hn_sc);
1683 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
1685 txr->hn_tx_chimney++;
1687 chim = txr->hn_sc->hn_chim +
1688 (txd->chim_index * txr->hn_sc->hn_chim_szmax);
/* Start a new aggregation when limits leave room for a second packet. */
1690 if (txr->hn_agg_pktmax > 1 &&
1691 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
1692 txr->hn_agg_txd = txd;
1693 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
1694 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
1695 txr->hn_agg_prevpkt = chim;
/*
 * Encapsulate an outgoing mbuf chain into an RNDIS packet message and
 * stage it for transmission: build the message header and per-packet-info
 * records (hash value, VLAN tag, LSOv2, TX checksum), then either copy
 * the packet into the chimney sending buffer (fast path, possibly
 * aggregated) or DMA-map it for sglist transmission, and finally set the
 * completion callback and update the ring's temporary stats.
 */
1702 * If this function fails, then both txd and m_head0 will be freed.
1705 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
1706 struct mbuf **m_head0)
1708 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
1709 int error, nsegs, i;
1710 struct mbuf *m_head = *m_head0;
1711 struct rndis_packet_msg *pkt;
1714 int pkt_hlen, pkt_size;
1716 pkt = txd->rndis_pkt;
/* Small packets go through the chimney sending buffer. */
1717 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
1718 if (pkt_size < txr->hn_chim_size) {
1719 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
1723 if (txr->hn_agg_txd != NULL)
1724 hn_flush_txagg(ifp, txr);
/* Build the RNDIS packet message header. */
1727 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1728 pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
1729 pkt->rm_dataoffset = sizeof(*pkt);
1730 pkt->rm_datalen = m_head->m_pkthdr.len;
1731 pkt->rm_oobdataoffset = 0;
1732 pkt->rm_oobdatalen = 0;
1733 pkt->rm_oobdataelements = 0;
1734 pkt->rm_pktinfooffset = sizeof(*pkt);
1735 pkt->rm_pktinfolen = 0;
1736 pkt->rm_vchandle = 0;
1737 pkt->rm_reserved = 0;
1739 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
1741 * Set the hash value for this packet, so that the host could
1742 * dispatch the TX done event for this packet back to this TX
1745 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1746 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
1747 *pi_data = txr->hn_tx_idx;
/* Hardware VLAN tag insertion via a VLAN PPI record. */
1750 if (m_head->m_flags & M_VLANTAG) {
1751 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1752 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1753 *pi_data = NDIS_VLAN_INFO_MAKE(
1754 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
1755 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
1756 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
/* TSO (LSOv2) or plain TX checksum offload PPI records. */
1759 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
1760 #if defined(INET6) || defined(INET)
1761 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1762 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
1764 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
1765 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
1766 m_head->m_pkthdr.tso_segsz);
1769 #if defined(INET6) && defined(INET)
1774 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
1775 m_head->m_pkthdr.tso_segsz);
1778 #endif /* INET6 || INET */
1779 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
1780 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
1781 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1782 if (m_head->m_pkthdr.csum_flags &
1783 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
1784 *pi_data = NDIS_TXCSUM_INFO_IPV6;
1786 *pi_data = NDIS_TXCSUM_INFO_IPV4;
1787 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
1788 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
1791 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
1792 *pi_data |= NDIS_TXCSUM_INFO_TCPCS;
1793 else if (m_head->m_pkthdr.csum_flags &
1794 (CSUM_IP_UDP | CSUM_IP6_UDP))
1795 *pi_data |= NDIS_TXCSUM_INFO_UDPCS;
1798 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1799 /* Convert RNDIS packet message offsets */
1800 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
1801 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
1804 * Fast path: Chimney sending.
1807 struct hn_txdesc *tgt_txd = txd;
1809 if (txr->hn_agg_txd != NULL) {
1810 tgt_txd = txr->hn_agg_txd;
1816 KASSERT(pkt == chim,
1817 ("RNDIS pkt not in chimney sending buffer"));
1818 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
1819 ("chimney sending buffer is not used"));
1820 tgt_txd->chim_size += pkt->rm_len;
/* Copy the payload into the chimney buffer after the RNDIS header. */
1822 m_copydata(m_head, 0, m_head->m_pkthdr.len,
1823 ((uint8_t *)chim) + pkt_hlen);
1825 txr->hn_gpa_cnt = 0;
1826 txr->hn_sendpkt = hn_txpkt_chim;
/* Slow path: sglist transmission via DMA mapping. */
1830 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
1831 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
1832 ("chimney buffer is used"));
1833 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
1835 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
1836 if (__predict_false(error)) {
1840 * This mbuf is not linked w/ the txd yet, so free it now.
1845 freed = hn_txdesc_put(txr, txd);
1847 ("fail to free txd upon txdma error"));
1849 txr->hn_txdma_failed++;
1850 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
1855 /* +1 RNDIS packet message */
1856 txr->hn_gpa_cnt = nsegs + 1;
1858 /* send packet with page buffer */
1859 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
1860 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
1861 txr->hn_gpa[0].gpa_len = pkt_hlen;
1864 * Fill the page buffers with mbuf info after the page
1865 * buffer for RNDIS packet message.
1867 for (i = 0; i < nsegs; ++i) {
1868 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
1870 gpa->gpa_page = atop(segs[i].ds_addr);
1871 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
1872 gpa->gpa_len = segs[i].ds_len;
1875 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
1877 txr->hn_sendpkt = hn_txpkt_sglist;
1881 /* Set the completion routine */
1882 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
1884 /* Update temporary stats for later use. */
1885 txr->hn_stat_pkts++;
1886 txr->hn_stat_size += m_head->m_pkthdr.len;
1887 if (m_head->m_flags & M_MCAST)
1888 txr->hn_stat_mcasts++;
/*
 * Send one encapsulated txdesc (and anything aggregated onto it) via the
 * ring's configured send method.  On success: tap BPF, update interface
 * counters, and drop the extra reference.  On failure: mark the ring for
 * txeof, optionally retry once, and free the txdesc while keeping the
 * mbuf for the caller.  Temporary per-send stats are reset at the end.
 */
1895 * If this function fails, then txd will be freed, but the mbuf
1896 * associated w/ the txd will _not_ be freed.
1899 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
1901 int error, send_failed = 0;
1905 * Make sure that this txd and any aggregated txds are not freed
1906 * before ETHER_BPF_MTAP.
1908 hn_txdesc_hold(txd);
1909 error = txr->hn_sendpkt(txr, txd);
/* Tap this packet and all aggregated packets for BPF listeners. */
1911 if (bpf_peers_present(ifp->if_bpf)) {
1912 const struct hn_txdesc *tmp_txd;
1914 ETHER_BPF_MTAP(ifp, txd->m);
1915 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
1916 ETHER_BPF_MTAP(ifp, tmp_txd->m);
1919 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
1920 #ifdef HN_IFSTART_SUPPORT
1921 if (!hn_use_if_start)
1924 if_inc_counter(ifp, IFCOUNTER_OBYTES,
1926 if (txr->hn_stat_mcasts != 0) {
1927 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
1928 txr->hn_stat_mcasts);
1931 txr->hn_pkts += txr->hn_stat_pkts;
/* Drop the reference taken for BPF above. */
1934 hn_txdesc_put(txr, txd);
1936 if (__predict_false(error)) {
1940 * This should "really rarely" happen.
1942 * XXX Too many RX to be acked or too many sideband
1943 * commands to run? Ask netvsc_channel_rollup()
1944 * to kick start later.
1946 txr->hn_has_txeof = 1;
1948 txr->hn_send_failed++;
1951 * Try sending again after set hn_has_txeof;
1952 * in case that we missed the last
1953 * netvsc_channel_rollup().
1957 if_printf(ifp, "send failed\n");
1960 * Caller will perform further processing on the
1961 * associated mbuf, so don't free it in hn_txdesc_put();
1962 * only unload it from the DMA map in hn_txdesc_put(),
1966 freed = hn_txdesc_put(txr, txd);
1968 ("fail to free txd upon send error"));
1970 txr->hn_send_failed++;
1973 /* Reset temporary stats, after this sending is done. */
1974 txr->hn_stat_size = 0;
1975 txr->hn_stat_pkts = 0;
1976 txr->hn_stat_mcasts = 0;
1982 * Append the specified data to the indicated mbuf chain,
1983 * Extend the mbuf chain if the new data does not fit in
1986 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
1987 * There should be an equivalent in the kernel mbuf code,
1988 * but there does not appear to be one yet.
1990 * Differs from m_append() in that additional mbufs are
1991 * allocated with cluster size MJUMPAGESIZE, and filled
1994 * Return 1 if able to complete the job; otherwise 0.
1997 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2000 int remainder, space;
/* Walk to the last mbuf of the chain. */
2002 for (m = m0; m->m_next != NULL; m = m->m_next)
2005 space = M_TRAILINGSPACE(m);
2008 * Copy into available space.
2010 if (space > remainder)
2012 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
/* Extend the chain with MJUMPAGESIZE clusters until all data fits. */
2017 while (remainder > 0) {
2019 * Allocate a new mbuf; could check space
2020 * and allocate a cluster instead.
2022 n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
2025 n->m_len = min(MJUMPAGESIZE, remainder);
2026 bcopy(cp, mtod(n, caddr_t), n->m_len);
2028 remainder -= n->m_len;
/* Keep the packet header length in sync with what was appended. */
2032 if (m0->m_flags & M_PKTHDR)
2033 m0->m_pkthdr.len += len - remainder;
2035 return (remainder == 0);
/*
 * Feed an mbuf to LRO: queue it when mbuf queueing is enabled (newer
 * FreeBSD), otherwise hand it to tcp_lro_rx() directly.
 */
2038 #if defined(INET) || defined(INET6)
2040 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
2042 #if __FreeBSD_version >= 1100095
2043 if (hn_lro_mbufq_depth) {
2044 tcp_lro_queue_mbuf(lc, m);
2048 return tcp_lro_rx(lc, m, 0);
/*
 * RX path: wrap a received packet ('data'/'dlen') into an mbuf, apply
 * host-provided checksum/VLAN/RSS-hash metadata from 'info' (trusting
 * host checksums per the ring's hn_trust_hcsum policy), and deliver the
 * packet via LRO when eligible or straight to if_input() otherwise.
 */
2053 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2054 const struct hn_rxinfo *info)
2056 struct ifnet *ifp = rxr->hn_ifp;
2058 int size, do_lro = 0, do_csum = 1;
2059 int hash_type = M_HASHTYPE_OPAQUE;
2061 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
2065 * Bail out if packet contains more data than configured MTU.
2067 if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
2069 } else if (dlen <= MHLEN) {
/* Small packet: a plain mbuf header is enough; copy directly. */
2070 m_new = m_gethdr(M_NOWAIT, MT_DATA);
2071 if (m_new == NULL) {
2072 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2075 memcpy(mtod(m_new, void *), data, dlen);
2076 m_new->m_pkthdr.len = m_new->m_len = dlen;
2077 rxr->hn_small_pkts++;
2080 * Get an mbuf with a cluster. For packets 2K or less,
2081 * get a standard 2K cluster. For anything larger, get a
2082 * 4K cluster. Any buffers larger than 4K can cause problems
2083 * if looped around to the Hyper-V TX channel, so avoid them.
2086 if (dlen > MCLBYTES) {
2088 size = MJUMPAGESIZE;
2091 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2092 if (m_new == NULL) {
2093 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2097 hv_m_append(m_new, dlen, data);
2099 m_new->m_pkthdr.rcvif = ifp;
2101 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2104 /* receive side checksum offload */
2105 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2106 /* IP csum offload */
2107 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2108 m_new->m_pkthdr.csum_flags |=
2109 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2113 /* TCP/UDP csum offload */
2114 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2115 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2116 m_new->m_pkthdr.csum_flags |=
2117 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2118 m_new->m_pkthdr.csum_data = 0xffff;
2119 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2127 * As of this write (Oct 28th, 2016), host side will turn
2128 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2129 * the do_lro setting here is actually _not_ accurate. We
2130 * depend on the RSS hash type check to reset do_lro.
2132 if ((info->csum_info &
2133 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2134 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
/* No host checksum info: parse the headers ourselves. */
2137 const struct ether_header *eh;
2142 if (m_new->m_len < hoff)
2144 eh = mtod(m_new, struct ether_header *);
2145 etype = ntohs(eh->ether_type);
2146 if (etype == ETHERTYPE_VLAN) {
2147 const struct ether_vlan_header *evl;
2149 hoff = sizeof(*evl);
2150 if (m_new->m_len < hoff)
2152 evl = mtod(m_new, struct ether_vlan_header *);
2153 etype = ntohs(evl->evl_proto);
2156 if (etype == ETHERTYPE_IP) {
2159 pr = hn_check_iplen(m_new, hoff);
2160 if (pr == IPPROTO_TCP) {
2162 (rxr->hn_trust_hcsum &
2163 HN_TRUST_HCSUM_TCP)) {
2164 rxr->hn_csum_trusted++;
2165 m_new->m_pkthdr.csum_flags |=
2166 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2167 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2168 m_new->m_pkthdr.csum_data = 0xffff;
2171 } else if (pr == IPPROTO_UDP) {
2173 (rxr->hn_trust_hcsum &
2174 HN_TRUST_HCSUM_UDP)) {
2175 rxr->hn_csum_trusted++;
2176 m_new->m_pkthdr.csum_flags |=
2177 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2178 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2179 m_new->m_pkthdr.csum_data = 0xffff;
2181 } else if (pr != IPPROTO_DONE && do_csum &&
2182 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2183 rxr->hn_csum_trusted++;
2184 m_new->m_pkthdr.csum_flags |=
2185 (CSUM_IP_CHECKED | CSUM_IP_VALID);
/* Attach the host-provided VLAN tag, if any. */
2190 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2191 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2192 NDIS_VLAN_INFO_ID(info->vlan_info),
2193 NDIS_VLAN_INFO_PRI(info->vlan_info),
2194 NDIS_VLAN_INFO_CFI(info->vlan_info));
2195 m_new->m_flags |= M_VLANTAG;
/* Use the host-provided RSS hash, mapping NDIS types to mbuf types. */
2198 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2200 m_new->m_pkthdr.flowid = info->hash_value;
2201 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2202 NDIS_HASH_FUNCTION_TOEPLITZ) {
2203 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2207 * do_lro is reset, if the hash types are not TCP
2208 * related. See the comment in the above csum_flags
2212 case NDIS_HASH_IPV4:
2213 hash_type = M_HASHTYPE_RSS_IPV4;
2217 case NDIS_HASH_TCP_IPV4:
2218 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2221 case NDIS_HASH_IPV6:
2222 hash_type = M_HASHTYPE_RSS_IPV6;
2226 case NDIS_HASH_IPV6_EX:
2227 hash_type = M_HASHTYPE_RSS_IPV6_EX;
2231 case NDIS_HASH_TCP_IPV6:
2232 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2235 case NDIS_HASH_TCP_IPV6_EX:
2236 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
/* No host hash: fall back to the RX ring index as the flowid. */
2241 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2243 M_HASHTYPE_SET(m_new, hash_type);
2246 * Note: Moved RX completion back to hv_nv_on_receive() so all
2247 * messages (not just data messages) will trigger a response.
2253 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2254 #if defined(INET) || defined(INET6)
2255 struct lro_ctrl *lro = &rxr->hn_lro;
2258 rxr->hn_lro_tried++;
2259 if (hn_lro_rx(lro, m_new) == 0) {
2267 /* We're not holding the lock here, so don't release it */
2268 (*ifp->if_input)(ifp, m_new);
/*
 * ifnet ioctl handler: MTU changes (which require detaching and
 * reattaching the synthetic parts), interface flags, capability
 * toggles (checksum/TSO/LRO, keeping if_hwassist in sync), multicast
 * updates, media ioctls, and fallthrough to ether_ioctl().
 */
2274 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2276 struct hn_softc *sc = ifp->if_softc;
2277 struct ifreq *ifr = (struct ifreq *)data;
2278 int mask, error = 0;
2282 if (ifr->ifr_mtu > HN_MTU_MAX) {
2289 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2294 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2295 /* Can't change MTU */
2301 if (ifp->if_mtu == ifr->ifr_mtu) {
2307 * Suspend this interface before the synthetic parts
2313 * Detach the synthetics parts, i.e. NVS and RNDIS.
2315 hn_synth_detach(sc);
2318 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2319 * with the new MTU setting.
2321 error = hn_synth_attach(sc, ifr->ifr_mtu);
2328 * Commit the requested MTU, after the synthetic parts
2329 * have been successfully attached.
2331 ifp->if_mtu = ifr->ifr_mtu;
2334 * Make sure that various parameters based on MTU are
2335 * still valid, after the MTU change.
2337 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2338 hn_set_chim_size(sc, sc->hn_chim_szmax);
2339 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2340 #if __FreeBSD_version >= 1100099
2341 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2342 HN_LRO_LENLIM_MIN(ifp))
2343 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2347 * All done! Resume the interface now.
2357 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2362 if (ifp->if_flags & IFF_UP) {
2363 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2365 * Caller might hold mutex, e.g.
2366 * bpf; use busy-wait for the RNDIS
2370 hn_set_rxfilter(sc);
2376 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2379 sc->hn_if_flags = ifp->if_flags;
/* SIOCSIFCAP: toggle capabilities and keep if_hwassist coherent. */
2386 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2388 if (mask & IFCAP_TXCSUM) {
2389 ifp->if_capenable ^= IFCAP_TXCSUM;
2390 if (ifp->if_capenable & IFCAP_TXCSUM)
2391 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2393 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2395 if (mask & IFCAP_TXCSUM_IPV6) {
2396 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2397 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2398 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2400 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2403 /* TODO: flip RNDIS offload parameters for RXCSUM. */
2404 if (mask & IFCAP_RXCSUM)
2405 ifp->if_capenable ^= IFCAP_RXCSUM;
2407 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
2408 if (mask & IFCAP_RXCSUM_IPV6)
2409 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2412 if (mask & IFCAP_LRO)
2413 ifp->if_capenable ^= IFCAP_LRO;
2415 if (mask & IFCAP_TSO4) {
2416 ifp->if_capenable ^= IFCAP_TSO4;
2417 if (ifp->if_capenable & IFCAP_TSO4)
2418 ifp->if_hwassist |= CSUM_IP_TSO;
2420 ifp->if_hwassist &= ~CSUM_IP_TSO;
2422 if (mask & IFCAP_TSO6) {
2423 ifp->if_capenable ^= IFCAP_TSO6;
2424 if (ifp->if_capenable & IFCAP_TSO6)
2425 ifp->if_hwassist |= CSUM_IP6_TSO;
2427 ifp->if_hwassist &= ~CSUM_IP6_TSO;
2437 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2441 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2443 * Multicast uses mutex; use busy-wait for
2447 hn_set_rxfilter(sc);
2456 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2460 error = ether_ioctl(ifp, cmd, data);
/*
 * Stop the interface's data path: clear RUNNING before suspending data
 * so no new transmissions start, then clear OACTIVE on the ifnet and
 * on every in-use TX ring.
 */
2467 hn_stop(struct hn_softc *sc)
2469 struct ifnet *ifp = sc->hn_ifp;
2474 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2475 ("synthetic parts were not attached"));
2477 /* Clear RUNNING bit _before_ hn_suspend_data() */
2478 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2479 hn_suspend_data(sc);
2481 /* Clear OACTIVE bit. */
2482 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2483 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2484 sc->hn_tx_ring[i].hn_oactive = 0;
/*
 * Bring the interface up (softc lock held): program the RX filter,
 * clear OACTIVE everywhere, resume TX, and finally set RUNNING.
 * No-op when the synthetic parts are absent or already running.
 */
2488 hn_init_locked(struct hn_softc *sc)
2490 struct ifnet *ifp = sc->hn_ifp;
2495 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2498 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2501 /* Configure RX filter */
2502 hn_set_rxfilter(sc);
2504 /* Clear OACTIVE bit. */
2505 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2506 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2507 sc->hn_tx_ring[i].hn_oactive = 0;
2509 /* Clear TX 'suspended' bit. */
2510 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2512 /* Everything is ready; unleash! */
2513 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2519 struct hn_softc *sc = xsc;
/*
 * Sysctl handler: get/set the LRO aggregated-length limit, validating
 * the new value against [HN_LRO_LENLIM_MIN, TCP_LRO_LENGTH_MAX] before
 * applying it to all RX rings.
 */
2526 #if __FreeBSD_version >= 1100099
2529 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2531 struct hn_softc *sc = arg1;
2532 unsigned int lenlim;
2535 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2536 error = sysctl_handle_int(oidp, &lenlim, 0, req);
2537 if (error || req->newptr == NULL)
2541 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2542 lenlim > TCP_LRO_LENGTH_MAX) {
2546 hn_set_lro_lenlim(sc, lenlim);
/*
 * Sysctl handler: get/set the LRO pure-ACK aggregation limit, exposed
 * as an aggregation count (append limit + 1) and applied to every
 * in-use RX ring.
 */
2553 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2555 struct hn_softc *sc = arg1;
2556 int ackcnt, error, i;
2559 * lro_ackcnt_lim is append count limit,
2560 * +1 to turn it into aggregation limit.
2562 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2563 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2564 if (error || req->newptr == NULL)
2567 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2571 * Convert aggregation limit back to append
2576 for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
2577 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
/*
 * Sysctl handler: toggle trusting the host-side checksum verification for
 * the class of packets selected by the hcsum bit(s).  Reports whether ring
 * 0 currently trusts it; on write, sets or clears the bit(s) on every
 * in-use RX ring.
 */
2585 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2587 	struct hn_softc *sc = arg1;
2592 	if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2595 	error = sysctl_handle_int(oidp, &on, 0, req);
2596 	if (error || req->newptr == NULL)
2600 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2601 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2604 			rxr->hn_trust_hcsum |= hcsum;
2606 			rxr->hn_trust_hcsum &= ~hcsum;
/*
 * Sysctl handler: get/set the chimney (host-buffer) send size limit.
 * A new value must be positive and no greater than the device-advertised
 * maximum (hn_chim_szmax) before it is applied to all TX rings.
 */
2613 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2615 	struct hn_softc *sc = arg1;
2616 	int chim_size, error;
2618 	chim_size = sc->hn_tx_ring[0].hn_chim_size;
2619 	error = sysctl_handle_int(oidp, &chim_size, 0, req);
2620 	if (error || req->newptr == NULL)
2623 	if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2627 	hn_set_chim_size(sc, chim_size);
2632 #if __FreeBSD_version < 1100095
/*
 * Sysctl handler: sum an int-typed per-RX-ring statistic across all in-use
 * rings.  arg2 is the byte offset of the field within struct hn_rx_ring.
 * Writing any value through the sysctl zeroes the stat on every ring.
 * (Pre-1100095 variant where the LRO counters are plain ints.)
 */
2634 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2636 	struct hn_softc *sc = arg1;
2637 	int ofs = arg2, i, error;
2638 	struct hn_rx_ring *rxr;
2642 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2643 		rxr = &sc->hn_rx_ring[i];
		/* Field access by offset: rxr base + ofs points at the int stat. */
2644 		stat += *((int *)((uint8_t *)rxr + ofs));
2647 	error = sysctl_handle_64(oidp, &stat, 0, req);
2648 	if (error || req->newptr == NULL)
2651 	/* Zero out this stat. */
2652 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2653 		rxr = &sc->hn_rx_ring[i];
2654 		*((int *)((uint8_t *)rxr + ofs)) = 0;
/*
 * Sysctl handler: sum a uint64_t per-RX-ring statistic across all in-use
 * rings.  arg2 is the byte offset of the field within struct hn_rx_ring.
 * Writing any value through the sysctl zeroes the stat on every ring.
 */
2660 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2662 	struct hn_softc *sc = arg1;
2663 	int ofs = arg2, i, error;
2664 	struct hn_rx_ring *rxr;
2668 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2669 		rxr = &sc->hn_rx_ring[i];
2670 		stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2673 	error = sysctl_handle_64(oidp, &stat, 0, req);
2674 	if (error || req->newptr == NULL)
2677 	/* Zero out this stat. */
2678 	for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2679 		rxr = &sc->hn_rx_ring[i];
2680 		*((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
/*
 * Sysctl handler: sum a u_long per-RX-ring statistic across ALL created
 * rings (hn_rx_ring_cnt, not just the in-use subset — NOTE(review): this
 * intentionally differs from the int/u64 variants above; confirm against
 * upstream history).  Writing any value zeroes the stat on every ring.
 */
2688 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2690 	struct hn_softc *sc = arg1;
2691 	int ofs = arg2, i, error;
2692 	struct hn_rx_ring *rxr;
2696 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2697 		rxr = &sc->hn_rx_ring[i];
2698 		stat += *((u_long *)((uint8_t *)rxr + ofs));
2701 	error = sysctl_handle_long(oidp, &stat, 0, req);
2702 	if (error || req->newptr == NULL)
2705 	/* Zero out this stat. */
2706 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2707 		rxr = &sc->hn_rx_ring[i];
2708 		*((u_long *)((uint8_t *)rxr + ofs)) = 0;
/*
 * Sysctl handler: sum a u_long per-TX-ring statistic across all in-use
 * rings.  arg2 is the byte offset of the field within struct hn_tx_ring.
 * Writing any value through the sysctl zeroes the stat on every ring.
 */
2714 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2716 	struct hn_softc *sc = arg1;
2717 	int ofs = arg2, i, error;
2718 	struct hn_tx_ring *txr;
2722 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2723 		txr = &sc->hn_tx_ring[i];
2724 		stat += *((u_long *)((uint8_t *)txr + ofs));
2727 	error = sysctl_handle_long(oidp, &stat, 0, req);
2728 	if (error || req->newptr == NULL)
2731 	/* Zero out this stat. */
2732 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2733 		txr = &sc->hn_tx_ring[i];
2734 		*((u_long *)((uint8_t *)txr + ofs)) = 0;
/*
 * Sysctl handler: get/set an int-typed TX-ring configuration knob shared
 * by all rings.  arg2 is the byte offset of the field within struct
 * hn_tx_ring; ring 0 provides the current value, and a write fans the new
 * value out to every in-use ring.
 */
2740 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2742 	struct hn_softc *sc = arg1;
2743 	int ofs = arg2, i, error, conf;
2744 	struct hn_tx_ring *txr;
2746 	txr = &sc->hn_tx_ring[0];
2747 	conf = *((int *)((uint8_t *)txr + ofs));
2749 	error = sysctl_handle_int(oidp, &conf, 0, req);
2750 	if (error || req->newptr == NULL)
2754 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2755 		txr = &sc->hn_tx_ring[i];
2756 		*((int *)((uint8_t *)txr + ofs)) = conf;
/*
 * Sysctl handler: get/set the requested TX aggregation size
 * (sc->hn_agg_size).  The value is stored on write; it is presumably
 * applied to the rings elsewhere (elided here) — verify against caller.
 */
2764 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2766 	struct hn_softc *sc = arg1;
2769 	size = sc->hn_agg_size;
2770 	error = sysctl_handle_int(oidp, &size, 0, req);
2771 	if (error || req->newptr == NULL)
2775 	sc->hn_agg_size = size;
/*
 * Sysctl handler: get/set the requested TX aggregation packet count
 * (sc->hn_agg_pkts).  Mirrors hn_txagg_size_sysctl().
 */
2783 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
2785 	struct hn_softc *sc = arg1;
2788 	pkts = sc->hn_agg_pkts;
2789 	error = sysctl_handle_int(oidp, &pkts, 0, req);
2790 	if (error || req->newptr == NULL)
2794 	sc->hn_agg_pkts = pkts;
/*
 * Sysctl handler (read-only): report the applied TX aggregation packet
 * maximum, taken from TX ring 0 (shared across rings).
 */
2802 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
2804 	struct hn_softc *sc = arg1;
2807 	pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
2808 	return (sysctl_handle_int(oidp, &pkts, 0, req));
/*
 * Sysctl handler (read-only): report the applied TX aggregation
 * alignment, taken from TX ring 0 (shared across rings).
 */
2812 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
2814 	struct hn_softc *sc = arg1;
2817 	align = sc->hn_tx_ring[0].hn_agg_align;
2818 	return (sysctl_handle_int(oidp, &align, 0, req));
/*
 * Sysctl handler (read-only): report the negotiated NDIS version as a
 * "major.minor" string decoded from sc->hn_ndis_ver.
 */
2822 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2824 	struct hn_softc *sc = arg1;
2827 	snprintf(verstr, sizeof(verstr), "%u.%u",
2828 	    HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2829 	    HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2830 	return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
/*
 * Sysctl handler (read-only): report the device capability bits as a
 * human-readable "%b"-formatted string using the HN_CAP_BITS legend.
 */
2834 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2836 	struct hn_softc *sc = arg1;
2843 	snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2844 	return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
/*
 * Sysctl handler (read-only): report the interface's hardware checksum
 * assist flags (if_hwassist) as a "%b"-formatted string (CSUM_BITS legend).
 */
2848 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2850 	struct hn_softc *sc = arg1;
2851 	char assist_str[128];
2855 	hwassist = sc->hn_ifp->if_hwassist;
2857 	snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2858 	return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
/*
 * Sysctl handler (read-only): report the current RNDIS RX packet filter
 * (sc->hn_rx_filter) as a "%b"-formatted string.
 */
2862 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2864 	struct hn_softc *sc = arg1;
2865 	char filter_str[128];
2869 	filter = sc->hn_rx_filter;
2871 	snprintf(filter_str, sizeof(filter_str), "%b", filter,
2873 	return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
/*
 * Sysctl handler: get/set the RSS hash key.  On write, marks the key as
 * user-provided (HN_FLAG_HAS_RSSKEY) and, if more than one RX ring is in
 * use, pushes the new key to the device via hn_rss_reconfig().  With a
 * single ring the key is only saved for later use.
 */
2877 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2879 	struct hn_softc *sc = arg1;
2884 	error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2885 	if (error || req->newptr == NULL)
2888 	error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2891 	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
2893 	if (sc->hn_rx_ring_inuse > 1) {
2894 		error = hn_rss_reconfig(sc);
2896 		/* Not RSS capable, at least for now; just save the RSS key. */
/*
 * Sysctl handler: get/set the RSS indirection table.  Changes are
 * rejected when only one RX ring is in use (interface not currently RSS
 * capable).  On a valid write, marks the table as user-provided
 * (HN_FLAG_HAS_RSSIND), fixes up out-of-range entries, and reprograms the
 * device via hn_rss_reconfig().
 */
2905 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
2907 	struct hn_softc *sc = arg1;
2912 	error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2913 	if (error || req->newptr == NULL)
2917 	 * Don't allow RSS indirect table change, if this interface is not
2918 	 * RSS capable currently.
2920 	if (sc->hn_rx_ring_inuse == 1) {
2925 	error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2928 	sc->hn_flags |= HN_FLAG_HAS_RSSIND;
2930 	hn_rss_ind_fixup(sc, sc->hn_rx_ring_inuse);
2931 	error = hn_rss_reconfig(sc);
/*
 * Sysctl handler (read-only): report the active RSS hash types
 * (sc->hn_rss_hash) as a "%b"-formatted string (NDIS_HASH_BITS legend).
 */
2938 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
2940 	struct hn_softc *sc = arg1;
2945 	hash = sc->hn_rss_hash;
2947 	snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
2948 	return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
/*
 * Sanity-check an IPv4 packet in mbuf 'm' whose IP header starts at byte
 * offset 'hoff'.  Verifies that the IP header (fixed and options), the
 * stated total length, and the full TCP/UDP header are all present and
 * contained in the first mbuf, and that the packet is not a fragment.
 * Returns IPPROTO_DONE when any check fails (caller then skips further
 * protocol processing); the success path returning ip_p is elided here.
 */
2952 hn_check_iplen(const struct mbuf *m, int hoff)
2954 	const struct ip *ip;
2955 	int len, iphlen, iplen;
2956 	const struct tcphdr *th;
2957 	int thoff;				/* TCP data offset */
2959 	len = hoff + sizeof(struct ip);
2961 	/* The packet must be at least the size of an IP header. */
2962 	if (m->m_pkthdr.len < len)
2963 		return IPPROTO_DONE;
2965 	/* The fixed IP header must reside completely in the first mbuf. */
2967 		return IPPROTO_DONE;
2969 	ip = mtodo(m, hoff);
2971 	/* Bound check the packet's stated IP header length. */
2972 	iphlen = ip->ip_hl << 2;
2973 	if (iphlen < sizeof(struct ip))		/* minimum header length */
2974 		return IPPROTO_DONE;
2976 	/* The full IP header must reside completely in the one mbuf. */
2977 	if (m->m_len < hoff + iphlen)
2978 		return IPPROTO_DONE;
2980 	iplen = ntohs(ip->ip_len);
2983 	 * Check that the amount of data in the buffers is as
2984 	 * at least much as the IP header would have us expect.
2986 	if (m->m_pkthdr.len < hoff + iplen)
2987 		return IPPROTO_DONE;
2990 	 * Ignore IP fragments.
2992 	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
2993 		return IPPROTO_DONE;
2996 	 * The TCP/IP or UDP/IP header must be entirely contained within
2997 	 * the first fragment of a packet.
	/* TCP: validate data offset against both header minimum and ip_len. */
3001 		if (iplen < iphlen + sizeof(struct tcphdr))
3002 			return IPPROTO_DONE;
3003 		if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3004 			return IPPROTO_DONE;
3005 		th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3006 		thoff = th->th_off << 2;
3007 		if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3008 			return IPPROTO_DONE;
3009 		if (m->m_len < hoff + iphlen + thoff)
3010 			return IPPROTO_DONE;
	/* UDP: only the fixed 8-byte header needs to be present. */
3013 		if (iplen < iphlen + sizeof(struct udphdr))
3014 			return IPPROTO_DONE;
3015 		if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3016 			return IPPROTO_DONE;
3020 		return IPPROTO_DONE;
/*
 * Allocate and initialize the RX side for 'ring_cnt' rings: the shared
 * RXBUF DMA area, the per-ring bufrings, LRO state, host-checksum trust
 * flags, and the dev.hn.UNIT.rx sysctl tree plus per-ring and aggregate
 * statistics nodes.  All rings share sc->hn_rxbuf.
 */
3027 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3029 	struct sysctl_oid_list *child;
3030 	struct sysctl_ctx_list *ctx;
3031 	device_t dev = sc->hn_dev;
3032 #if defined(INET) || defined(INET6)
3033 #if __FreeBSD_version >= 1100095
3040 	 * Create RXBUF for reception.
3043 	 * - It is shared by all channels.
3044 	 * - A large enough buffer is allocated, certain version of NVSes
3045 	 *   may further limit the usable space.
3047 	sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3048 	    PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3049 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3050 	if (sc->hn_rxbuf == NULL) {
3051 		device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3055 	sc->hn_rx_ring_cnt = ring_cnt;
3056 	sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3058 	sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3059 	    M_DEVBUF, M_WAITOK | M_ZERO);
3061 #if defined(INET) || defined(INET6)
3062 #if __FreeBSD_version >= 1100095
	/* Clamp the tunable LRO entry count to at least the stack minimum. */
3063 	lroent_cnt = hn_lro_entry_count;
3064 	if (lroent_cnt < TCP_LRO_ENTRIES)
3065 		lroent_cnt = TCP_LRO_ENTRIES;
3067 		device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3069 #endif	/* INET || INET6 */
3071 	ctx = device_get_sysctl_ctx(dev);
3072 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3074 	/* Create dev.hn.UNIT.rx sysctl tree */
3075 	sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3076 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
	/* Per-ring setup: bufring, checksum trust, LRO, and sysctl nodes. */
3078 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3079 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3081 		rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3082 		    PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3083 		    &rxr->hn_br_dma, BUS_DMA_WAITOK);
3084 		if (rxr->hn_br == NULL) {
3085 			device_printf(dev, "allocate bufring failed\n");
3089 		if (hn_trust_hosttcp)
3090 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3091 		if (hn_trust_hostudp)
3092 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3093 		if (hn_trust_hostip)
3094 			rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3095 		rxr->hn_ifp = sc->hn_ifp;
		/* Pair each RX ring with the TX ring of the same index, if any. */
3096 		if (i < sc->hn_tx_ring_cnt)
3097 			rxr->hn_txr = &sc->hn_tx_ring[i];
3098 		rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3099 		rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3101 		rxr->hn_rxbuf = sc->hn_rxbuf;
3106 #if defined(INET) || defined(INET6)
3107 #if __FreeBSD_version >= 1100095
3108 		tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3109 		    hn_lro_mbufq_depth);
3111 		tcp_lro_init(&rxr->hn_lro);
3112 		rxr->hn_lro.ifp = sc->hn_ifp;
3114 #if __FreeBSD_version >= 1100099
3115 		rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3116 		rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3118 #endif	/* INET || INET6 */
3120 		if (sc->hn_rx_sysctl_tree != NULL) {
3124 			 * Create per RX ring sysctl tree:
3125 			 * dev.hn.UNIT.rx.RINGID
3127 			snprintf(name, sizeof(name), "%d", i);
3128 			rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3129 			    SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3130 			    OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3132 			if (rxr->hn_rx_sysctl_tree != NULL) {
3133 				SYSCTL_ADD_ULONG(ctx,
3134 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3135 				    OID_AUTO, "packets", CTLFLAG_RW,
3136 				    &rxr->hn_pkts, "# of packets received");
3137 				SYSCTL_ADD_ULONG(ctx,
3138 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3139 				    OID_AUTO, "rss_pkts", CTLFLAG_RW,
3141 				    "# of packets w/ RSS info received");
3143 				    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3144 				    OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3145 				    &rxr->hn_pktbuf_len, 0,
3146 				    "Temporary channel packet buffer length");
	/* Aggregate (all-ring) statistics and tuning knobs below. */
3151 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3152 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3153 	    __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3154 #if __FreeBSD_version < 1100095
3155 	    hn_rx_stat_int_sysctl,
3157 	    hn_rx_stat_u64_sysctl,
3159 	    "LU", "LRO queued");
3160 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3161 	    CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3162 	    __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3163 #if __FreeBSD_version < 1100095
3164 	    hn_rx_stat_int_sysctl,
3166 	    hn_rx_stat_u64_sysctl,
3168 	    "LU", "LRO flushed");
3169 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3170 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3171 	    __offsetof(struct hn_rx_ring, hn_lro_tried),
3172 	    hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3173 #if __FreeBSD_version >= 1100099
3174 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3175 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3176 	    hn_lro_lenlim_sysctl, "IU",
3177 	    "Max # of data bytes to be aggregated by LRO");
3178 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3179 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3180 	    hn_lro_ackcnt_sysctl, "I",
3181 	    "Max # of ACKs to be aggregated by LRO");
3183 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3184 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3185 	    hn_trust_hcsum_sysctl, "I",
3186 	    "Trust tcp segement verification on host side, "
3187 	    "when csum info is missing");
3188 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3189 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3190 	    hn_trust_hcsum_sysctl, "I",
3191 	    "Trust udp datagram verification on host side, "
3192 	    "when csum info is missing");
3193 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3194 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3195 	    hn_trust_hcsum_sysctl, "I",
3196 	    "Trust ip packet verification on host side, "
3197 	    "when csum info is missing");
3198 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3199 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3200 	    __offsetof(struct hn_rx_ring, hn_csum_ip),
3201 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3202 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3203 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3204 	    __offsetof(struct hn_rx_ring, hn_csum_tcp),
3205 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3206 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3207 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3208 	    __offsetof(struct hn_rx_ring, hn_csum_udp),
3209 	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3210 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3211 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3212 	    __offsetof(struct hn_rx_ring, hn_csum_trusted),
3213 	    hn_rx_stat_ulong_sysctl, "LU",
3214 	    "# of packets that we trust host's csum verification");
3215 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3216 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3217 	    __offsetof(struct hn_rx_ring, hn_small_pkts),
3218 	    hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3219 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3220 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3221 	    __offsetof(struct hn_rx_ring, hn_ack_failed),
3222 	    hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3223 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3224 	    CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3225 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3226 	    CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
/*
 * Tear down everything hn_create_rx_data() built: the shared RXBUF, each
 * ring's bufring, LRO state and packet buffer, and finally the ring array
 * itself.  Safe to call when nothing was created (ring_cnt == 0).
 */
3232 hn_destroy_rx_data(struct hn_softc *sc)
3236 	if (sc->hn_rxbuf != NULL) {
3237 		hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3238 		sc->hn_rxbuf = NULL;
3241 	if (sc->hn_rx_ring_cnt == 0)
3244 	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3245 		struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
		/* A NULL bufring means this ring never finished initializing. */
3247 		if (rxr->hn_br == NULL)
3249 		hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3252 #if defined(INET) || defined(INET6)
3253 		tcp_lro_free(&rxr->hn_lro);
3255 		free(rxr->hn_pktbuf, M_DEVBUF);
3257 	free(sc->hn_rx_ring, M_DEVBUF);
3258 	sc->hn_rx_ring = NULL;
3260 	sc->hn_rx_ring_cnt = 0;
3261 	sc->hn_rx_ring_inuse = 0;
/*
 * Initialize TX ring 'id': locks, the descriptor pool (list or bufring),
 * taskqueue hooks (if_start vs. if_transmit paths), DMA tags/maps for the
 * RNDIS packet messages and packet data, and the per-ring sysctl nodes.
 * On partial failure the already-allocated descriptors of this loop
 * iteration are unwound before bailing (full cleanup is elided here).
 */
3265 hn_tx_ring_create(struct hn_softc *sc, int id)
3267 	struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3268 	device_t dev = sc->hn_dev;
3269 	bus_dma_tag_t parent_dtag;
3273 	txr->hn_tx_idx = id;
3275 #ifndef HN_USE_TXDESC_BUFRING
3276 	mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3278 	mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3280 	txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3281 	txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3282 	    M_DEVBUF, M_WAITOK | M_ZERO);
3283 #ifndef HN_USE_TXDESC_BUFRING
3284 	SLIST_INIT(&txr->hn_txlist);
3286 	txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3287 	    M_WAITOK, &txr->hn_tx_lock);
3290 	txr->hn_tx_taskq = sc->hn_tx_taskq;
3292 #ifdef HN_IFSTART_SUPPORT
	/* Legacy if_start path: single queue, OACTIVE-based flow control. */
3293 	if (hn_use_if_start) {
3294 		txr->hn_txeof = hn_start_txeof;
3295 		TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3296 		TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
	/* if_transmit path: per-ring mbuf bufring feeding hn_xmit(). */
3302 		txr->hn_txeof = hn_xmit_txeof;
3303 		TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3304 		TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3306 		br_depth = hn_get_txswq_depth(txr);
3307 		txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3308 		    M_WAITOK, &txr->hn_tx_lock);
3311 	txr->hn_direct_tx_size = hn_direct_tx_size;
3314 	 * Always schedule transmission instead of trying to do direct
3315 	 * transmission.  This one gives the best performance so far.
3317 	txr->hn_sched_tx = 1;
3319 	parent_dtag = bus_get_dma_tag(dev);
3321 	/* DMA tag for RNDIS packet messages. */
3322 	error = bus_dma_tag_create(parent_dtag, /* parent */
3323 	    HN_RNDIS_PKT_ALIGN,		/* alignment */
3324 	    HN_RNDIS_PKT_BOUNDARY,	/* boundary */
3325 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3326 	    BUS_SPACE_MAXADDR,		/* highaddr */
3327 	    NULL, NULL,			/* filter, filterarg */
3328 	    HN_RNDIS_PKT_LEN,		/* maxsize */
3330 	    HN_RNDIS_PKT_LEN,		/* maxsegsize */
3332 	    NULL,			/* lockfunc */
3333 	    NULL,			/* lockfuncarg */
3334 	    &txr->hn_tx_rndis_dtag);
3336 		device_printf(dev, "failed to create rndis dmatag\n");
3340 	/* DMA tag for data. */
3341 	error = bus_dma_tag_create(parent_dtag, /* parent */
3343 	    HN_TX_DATA_BOUNDARY,	/* boundary */
3344 	    BUS_SPACE_MAXADDR,		/* lowaddr */
3345 	    BUS_SPACE_MAXADDR,		/* highaddr */
3346 	    NULL, NULL,			/* filter, filterarg */
3347 	    HN_TX_DATA_MAXSIZE,		/* maxsize */
3348 	    HN_TX_DATA_SEGCNT_MAX,	/* nsegments */
3349 	    HN_TX_DATA_SEGSIZE,		/* maxsegsize */
3351 	    NULL,			/* lockfunc */
3352 	    NULL,			/* lockfuncarg */
3353 	    &txr->hn_tx_data_dtag);
3355 		device_printf(dev, "failed to create data dmatag\n");
	/* Pre-allocate every TX descriptor's DMA resources up front. */
3359 	for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3360 		struct hn_txdesc *txd = &txr->hn_txdesc[i];
3363 		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3364 		STAILQ_INIT(&txd->agg_list);
3367 		 * Allocate and load RNDIS packet message.
3369 		error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3370 		    (void **)&txd->rndis_pkt,
3371 		    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3372 		    &txd->rndis_pkt_dmap);
3375 			    "failed to allocate rndis_packet_msg, %d\n", i);
3379 		error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3380 		    txd->rndis_pkt_dmap,
3381 		    txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3382 		    hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3386 			    "failed to load rndis_packet_msg, %d\n", i);
3387 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3388 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3392 		/* DMA map for TX data. */
3393 		error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3397 			    "failed to allocate tx data dmamap\n");
			/* Unwind this descriptor's RNDIS map/memory on failure. */
3398 			bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3399 			    txd->rndis_pkt_dmap);
3400 			bus_dmamem_free(txr->hn_tx_rndis_dtag,
3401 			    txd->rndis_pkt, txd->rndis_pkt_dmap);
3405 		/* All set, put it to list */
3406 		txd->flags |= HN_TXD_FLAG_ONLIST;
3407 #ifndef HN_USE_TXDESC_BUFRING
3408 		SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3410 		buf_ring_enqueue(txr->hn_txdesc_br, txd);
3413 	txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3415 	if (sc->hn_tx_sysctl_tree != NULL) {
3416 		struct sysctl_oid_list *child;
3417 		struct sysctl_ctx_list *ctx;
3421 		 * Create per TX ring sysctl tree:
3422 		 * dev.hn.UNIT.tx.RINGID
3424 		ctx = device_get_sysctl_ctx(dev);
3425 		child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3427 		snprintf(name, sizeof(name), "%d", id);
3428 		txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3429 		    name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3431 		if (txr->hn_tx_sysctl_tree != NULL) {
3432 			child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3434 			SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3435 			    CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3436 			    "# of available TX descs");
3437 #ifdef HN_IFSTART_SUPPORT
3438 			if (!hn_use_if_start)
3441 				SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3442 				    CTLFLAG_RD, &txr->hn_oactive, 0,
3445 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3446 			    CTLFLAG_RW, &txr->hn_pkts,
3447 			    "# of packets transmitted");
3448 			SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3449 			    CTLFLAG_RW, &txr->hn_sends, "# of sends");
/*
 * Release one TX descriptor's DMA resources: unload and free the RNDIS
 * packet message memory/map and destroy the data dmamap.  The descriptor
 * must be idle (no mbuf attached, not DMA-mapped).
 */
3457 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3459 	struct hn_tx_ring *txr = txd->txr;
3461 	KASSERT(txd->m == NULL, ("still has mbuf installed"));
3462 	KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3464 	bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3465 	bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3466 	    txd->rndis_pkt_dmap);
3467 	bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
/*
 * Tear down a TX ring created by hn_tx_ring_create(): drain the
 * descriptor pool (freeing each descriptor's DMA state), destroy the DMA
 * tags, free the descriptor array and mbuf bufring, and destroy the
 * locks.  No-op if the ring's descriptors were never allocated.
 */
3471 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3473 	struct hn_txdesc *txd;
3475 	if (txr->hn_txdesc == NULL)
3478 #ifndef HN_USE_TXDESC_BUFRING
3479 	while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) {
3480 		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
3481 		hn_txdesc_dmamap_destroy(txd);
	/* Bufring variant: dequeue under the TX lock until empty. */
3484 	mtx_lock(&txr->hn_tx_lock);
3485 	while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL)
3486 		hn_txdesc_dmamap_destroy(txd);
3487 	mtx_unlock(&txr->hn_tx_lock);
3490 	if (txr->hn_tx_data_dtag != NULL)
3491 		bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3492 	if (txr->hn_tx_rndis_dtag != NULL)
3493 		bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3495 #ifdef HN_USE_TXDESC_BUFRING
3496 	buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3499 	free(txr->hn_txdesc, M_DEVBUF);
3500 	txr->hn_txdesc = NULL;
3502 	if (txr->hn_mbuf_br != NULL)
3503 		buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3505 #ifndef HN_USE_TXDESC_BUFRING
3506 	mtx_destroy(&txr->hn_txlist_spin);
3508 	mtx_destroy(&txr->hn_tx_lock);
/*
 * Allocate and initialize the TX side for 'ring_cnt' rings: the shared
 * chimney (TXBUF) DMA area, the ring array, each ring via
 * hn_tx_ring_create(), and the dev.hn.UNIT.tx sysctl tree with aggregate
 * TX statistics and tuning knobs.
 */
3512 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3514 	struct sysctl_oid_list *child;
3515 	struct sysctl_ctx_list *ctx;
3519 	 * Create TXBUF for chimney sending.
3521 	 * NOTE: It is shared by all channels.
3523 	sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3524 	    PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3525 	    BUS_DMA_WAITOK | BUS_DMA_ZERO);
3526 	if (sc->hn_chim == NULL) {
3527 		device_printf(sc->hn_dev, "allocate txbuf failed\n");
3531 	sc->hn_tx_ring_cnt = ring_cnt;
3532 	sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3534 	sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3535 	    M_DEVBUF, M_WAITOK | M_ZERO);
3537 	ctx = device_get_sysctl_ctx(sc->hn_dev);
3538 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3540 	/* Create dev.hn.UNIT.tx sysctl tree */
3541 	sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3542 	    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3544 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3547 		error = hn_tx_ring_create(sc, i);
	/* Aggregate (all-ring) statistics and tuning knobs below. */
3552 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3553 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3554 	    __offsetof(struct hn_tx_ring, hn_no_txdescs),
3555 	    hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3556 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3557 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3558 	    __offsetof(struct hn_tx_ring, hn_send_failed),
3559 	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
3560 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3561 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3562 	    __offsetof(struct hn_tx_ring, hn_txdma_failed),
3563 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
3564 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3565 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3566 	    __offsetof(struct hn_tx_ring, hn_flush_failed),
3567 	    hn_tx_stat_ulong_sysctl, "LU",
3568 	    "# of packet transmission aggregation flush failure");
3569 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3570 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3571 	    __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3572 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3573 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3574 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3575 	    __offsetof(struct hn_tx_ring, hn_tx_chimney),
3576 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3577 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3578 	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3579 	    __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3580 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3581 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3582 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3583 	    "# of total TX descs");
3584 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3585 	    CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3586 	    "Chimney send packet size upper boundary");
3587 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3588 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3589 	    hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3590 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3591 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3592 	    __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3593 	    hn_tx_conf_int_sysctl, "I",
3594 	    "Size of the packet for direct transmission");
3595 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3596 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3597 	    __offsetof(struct hn_tx_ring, hn_sched_tx),
3598 	    hn_tx_conf_int_sysctl, "I",
3599 	    "Always schedule transmission "
3600 	    "instead of doing direct transmission");
3601 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3602 	    CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3603 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3604 	    CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3605 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3606 	    CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3607 	    "Applied packet transmission aggregation size");
3608 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3609 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3610 	    hn_txagg_pktmax_sysctl, "I",
3611 	    "Applied packet transmission aggregation packets");
3612 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3613 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3614 	    hn_txagg_align_sysctl, "I",
3615 	    "Applied packet transmission aggregation alignment");
/* Apply the given chimney send size limit to every in-use TX ring. */
3621 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3625 	for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3626 		sc->hn_tx_ring[i].hn_chim_size = chim_size;
/*
 * Compute and set the interface's TSO size limit (if_hw_tsomax).  The
 * requested tso_maxlen is clamped to [sgmin * mtu, min(IP_MAXPACKET,
 * NDIS-advertised szmax)], then the Ethernet + VLAN header overhead is
 * subtracted.  No-op when the interface has no TSO capability.
 */
3630 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3632 	struct ifnet *ifp = sc->hn_ifp;
3635 	if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3638 	KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3639 	    ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
	/* Lower bound: the device needs at least sgmin MTU-sized segments. */
3640 	tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3642 	KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3643 	    sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3644 	    ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3646 	if (tso_maxlen < tso_minlen)
3647 		tso_maxlen = tso_minlen;
3648 	else if (tso_maxlen > IP_MAXPACKET)
3649 		tso_maxlen = IP_MAXPACKET;
3650 	if (tso_maxlen > sc->hn_ndis_tso_szmax)
3651 		tso_maxlen = sc->hn_ndis_tso_szmax;
3652 	ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3654 		if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
/*
 * Finalize TX configuration after capability negotiation: clamp the
 * chimney size to the device maximum (optionally further limited by the
 * hn_tx_chimney_size tunable), derive the checksum-offload assist flags
 * from the negotiated capabilities, and enable HASHVAL packet info on the
 * TX path when the device supports it.
 */
3658 hn_fixup_tx_data(struct hn_softc *sc)
3660 	uint64_t csum_assist;
3663 	hn_set_chim_size(sc, sc->hn_chim_szmax);
3664 	if (hn_tx_chimney_size > 0 &&
3665 	    hn_tx_chimney_size < sc->hn_chim_szmax)
3666 		hn_set_chim_size(sc, hn_tx_chimney_size);
	/* Translate negotiated capability bits into mbuf csum-assist flags. */
3669 	if (sc->hn_caps & HN_CAP_IPCS)
3670 		csum_assist |= CSUM_IP;
3671 	if (sc->hn_caps & HN_CAP_TCP4CS)
3672 		csum_assist |= CSUM_IP_TCP;
3673 	if (sc->hn_caps & HN_CAP_UDP4CS)
3674 		csum_assist |= CSUM_IP_UDP;
3676 	if (sc->hn_caps & HN_CAP_TCP6CS)
3677 		csum_assist |= CSUM_IP6_TCP;
3678 	if (sc->hn_caps & HN_CAP_UDP6CS)
3679 		csum_assist |= CSUM_IP6_UDP;
3681 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3682 		sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3684 	if (sc->hn_caps & HN_CAP_HASHVAL) {
3686 		 * Support HASHVAL pktinfo on TX path.
3689 			if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3690 		for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3691 			sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
/*
 * Tear down everything hn_create_tx_data() built: the shared chimney
 * buffer, each TX ring, and the ring array.  Safe when nothing was
 * created (ring_cnt == 0).
 */
3696 hn_destroy_tx_data(struct hn_softc *sc)
3700 	if (sc->hn_chim != NULL) {
3701 		hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3705 	if (sc->hn_tx_ring_cnt == 0)
3708 	for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3709 		hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3711 	free(sc->hn_tx_ring, M_DEVBUF);
3712 	sc->hn_tx_ring = NULL;
3714 	sc->hn_tx_ring_cnt = 0;
3715 	sc->hn_tx_ring_inuse = 0;
3718 #ifdef HN_IFSTART_SUPPORT
/*
 * Taskqueue handler for deferred if_start transmission: take the TX lock
 * and run hn_start_locked() with no direct-send length limit.
 */
3721 hn_start_taskfunc(void *xtxr, int pending __unused)
3723 	struct hn_tx_ring *txr = xtxr;
3725 	mtx_lock(&txr->hn_tx_lock);
3726 	hn_start_locked(txr, 0);
3727 	mtx_unlock(&txr->hn_tx_lock);
/*
 * Legacy if_start transmit loop (TX ring 0 only, TX lock held).  Dequeues
 * packets from if_snd, fixes up TSO segments, encapsulates into TX
 * descriptors and sends them, honoring packet aggregation.  'len' > 0
 * bounds how large a packet may be sent directly; larger packets are
 * re-queued so the taskqueue can send them (the non-zero return telling
 * the caller to schedule is elided here).  Sets OACTIVE when descriptors
 * run out or a send fails.
 */
3731 hn_start_locked(struct hn_tx_ring *txr, int len)
3733 	struct hn_softc *sc = txr->hn_sc;
3734 	struct ifnet *ifp = sc->hn_ifp;
3737 	KASSERT(hn_use_if_start,
3738 	    ("hn_start_locked is called, when if_start is disabled"));
3739 	KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3740 	mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3741 	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3743 	if (__predict_false(txr->hn_suspended))
3746 	if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
3750 	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
3751 		struct hn_txdesc *txd;
3752 		struct mbuf *m_head;
3755 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
3759 		if (len > 0 && m_head->m_pkthdr.len > len) {
3761 			 * This sending could be time consuming; let callers
3762 			 * dispatch this packet sending (and sending of any
3763 			 * following up packets) to tx taskqueue.
3765 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3770 #if defined(INET6) || defined(INET)
3771 		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3772 			m_head = hn_tso_fixup(m_head);
3773 			if (__predict_false(m_head == NULL)) {
3774 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3780 		txd = hn_txdesc_get(txr);
		/* Out of descriptors: re-queue the packet and back-pressure. */
3782 			txr->hn_no_txdescs++;
3783 			IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3784 			atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3788 		error = hn_encap(ifp, txr, txd, &m_head);
3790 			/* Both txd and m_head are freed */
3791 			KASSERT(txr->hn_agg_txd == NULL,
3792 			    ("encap failed w/ pending aggregating txdesc"));
		/* Aggregation budget exhausted: flush or send directly. */
3796 		if (txr->hn_agg_pktleft == 0) {
3797 			if (txr->hn_agg_txd != NULL) {
3798 				KASSERT(m_head == NULL,
3799 				    ("pending mbuf for aggregating txdesc"));
3800 				error = hn_flush_txagg(ifp, txr);
3801 				if (__predict_false(error)) {
3802 					atomic_set_int(&ifp->if_drv_flags,
3807 				KASSERT(m_head != NULL, ("mbuf was freed"));
3808 				error = hn_txpkt(ifp, txr, txd);
3809 				if (__predict_false(error)) {
3810 					/* txd is freed, but m_head is not */
3811 					IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3812 					atomic_set_int(&ifp->if_drv_flags,
3820 			KASSERT(txr->hn_agg_txd != NULL,
3821 			    ("no aggregating txdesc"));
3822 			KASSERT(m_head == NULL,
3823 			    ("pending mbuf for aggregating txdesc"));
3828 	/* Flush pending aggerated transmission. */
3829 	if (txr->hn_agg_txd != NULL)
3830 		hn_flush_txagg(ifp, txr);
/*
 * if_start entry point.  If direct transmission is disabled
 * (hn_sched_tx) or the TX lock is contended, defer to the TX taskqueue;
 * otherwise send directly up to hn_direct_tx_size bytes per packet and
 * let hn_start_locked() tell us whether a deferral is still needed.
 */
3835 hn_start(struct ifnet *ifp)
3837 	struct hn_softc *sc = ifp->if_softc;
3838 	struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
3840 	if (txr->hn_sched_tx)
3843 	if (mtx_trylock(&txr->hn_tx_lock)) {
3846 		sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3847 		mtx_unlock(&txr->hn_tx_lock);
3852 	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
/*
 * Taskqueue handler run after TX completions: clear OACTIVE under the TX
 * lock (so the clear cannot race a concurrent set) and restart
 * transmission.
 */
3856 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
3858 	struct hn_tx_ring *txr = xtxr;
3860 	mtx_lock(&txr->hn_tx_lock);
3861 	atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
3862 	hn_start_locked(txr, 0);
3863 	mtx_unlock(&txr->hn_tx_lock);
/*
 * hn_start_txeof -- TX-completion kick for the if_start path (ring 0 only).
 * Tries to clear OACTIVE and restart TX inline under hn_tx_lock; on lock
 * contention or when scheduled TX is forced, defers to the txeof task.
 */
3867 hn_start_txeof(struct hn_tx_ring *txr)
3869 struct hn_softc *sc = txr->hn_sc;
3870 struct ifnet *ifp = sc->hn_ifp;
3872 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3874 if (txr->hn_sched_tx)
3877 if (mtx_trylock(&txr->hn_tx_lock)) {
3880 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3881 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3882 mtx_unlock(&txr->hn_tx_lock);
3884 taskqueue_enqueue(txr->hn_tx_taskq,
3890 * Release the OACTIVE earlier, with the hope, that
3891 * others could catch up. The task will clear the
3892 * flag again with the hn_tx_lock to avoid possible
3895 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3896 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3900 #endif /* HN_IFSTART_SUPPORT */
/*
 * hn_xmit -- drain the per-ring buf_ring (multi-queue if_transmit path)
 * with hn_tx_lock held.  'len' > 0 caps the size of packets sent directly;
 * larger packets are put back so callers can defer to the TX taskqueue.
 * Packets are peeked, encapsulated (possibly aggregated), then advanced or
 * put back on failure; hn_oactive is raised when the ring runs dry of
 * txdescs or a send fails.
 */
3903 hn_xmit(struct hn_tx_ring *txr, int len)
3905 struct hn_softc *sc = txr->hn_sc;
3906 struct ifnet *ifp = sc->hn_ifp;
3907 struct mbuf *m_head;
3910 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3911 #ifdef HN_IFSTART_SUPPORT
3912 KASSERT(hn_use_if_start == 0,
3913 ("hn_xmit is called, when if_start is enabled"));
3915 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3917 if (__predict_false(txr->hn_suspended))
3920 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
3923 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
3924 struct hn_txdesc *txd;
3927 if (len > 0 && m_head->m_pkthdr.len > len) {
3929 * This sending could be time consuming; let callers
3930 * dispatch this packet sending (and sending of any
3931 * following up packets) to tx taskqueue.
3933 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
/* Out of txdescs: put the packet back and mark the ring oactive. */
3938 txd = hn_txdesc_get(txr);
3940 txr->hn_no_txdescs++;
3941 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3942 txr->hn_oactive = 1;
3946 error = hn_encap(ifp, txr, txd, &m_head);
3948 /* Both txd and m_head are freed; discard */
3949 KASSERT(txr->hn_agg_txd == NULL,
3950 ("encap failed w/ pending aggregating txdesc"));
3951 drbr_advance(ifp, txr->hn_mbuf_br);
/* Aggregation budget exhausted: flush now (aggregated or single send). */
3955 if (txr->hn_agg_pktleft == 0) {
3956 if (txr->hn_agg_txd != NULL) {
3957 KASSERT(m_head == NULL,
3958 ("pending mbuf for aggregating txdesc"));
3959 error = hn_flush_txagg(ifp, txr);
3960 if (__predict_false(error)) {
3961 txr->hn_oactive = 1;
3965 KASSERT(m_head != NULL, ("mbuf was freed"));
3966 error = hn_txpkt(ifp, txr, txd);
3967 if (__predict_false(error)) {
3968 /* txd is freed, but m_head is not */
3969 drbr_putback(ifp, txr->hn_mbuf_br,
3971 txr->hn_oactive = 1;
3978 KASSERT(txr->hn_agg_txd != NULL,
3979 ("no aggregating txdesc"));
3980 KASSERT(m_head == NULL,
3981 ("pending mbuf for aggregating txdesc"));
3986 drbr_advance(ifp, txr->hn_mbuf_br);
3989 /* Flush pending aggregated transmission. */
3990 if (txr->hn_agg_txd != NULL)
3991 hn_flush_txagg(ifp, txr);
/*
 * hn_transmit -- if_transmit entry point (multi-queue path).  Fixes up TSO
 * headers while they are cache-hot, picks a TX ring from the mbuf flowid,
 * enqueues onto that ring's buf_ring, and kicks a direct or deferred drain.
 */
3996 hn_transmit(struct ifnet *ifp, struct mbuf *m)
3998 struct hn_softc *sc = ifp->if_softc;
3999 struct hn_tx_ring *txr;
4002 #if defined(INET6) || defined(INET)
4004 * Perform TSO packet header fixup now, since the TSO
4005 * packet header should be cache-hot.
4007 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4008 m = hn_tso_fixup(m);
4009 if (__predict_false(m == NULL)) {
4010 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4017 * Select the TX ring based on flowid
4019 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
4020 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4021 txr = &sc->hn_tx_ring[idx];
4023 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4025 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
/* Ring is backlogged; TX completion will restart the drain. */
4029 if (txr->hn_oactive)
4032 if (txr->hn_sched_tx)
4035 if (mtx_trylock(&txr->hn_tx_lock)) {
4038 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4039 mtx_unlock(&txr->hn_tx_lock);
4044 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
/*
 * hn_tx_ring_qflush -- drop every mbuf queued on this ring's buf_ring,
 * under hn_tx_lock.
 */
4049 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4053 mtx_lock(&txr->hn_tx_lock);
4054 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4056 mtx_unlock(&txr->hn_tx_lock);
/*
 * hn_xmit_qflush -- if_qflush entry point: flush every in-use TX ring.
 */
4060 hn_xmit_qflush(struct ifnet *ifp)
4062 struct hn_softc *sc = ifp->if_softc;
4065 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4066 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
/*
 * hn_xmit_txeof -- TX-completion kick for the if_transmit path.  Clears
 * hn_oactive and restarts the ring drain, inline when the TX lock is
 * available, otherwise via the txeof task.
 */
4071 hn_xmit_txeof(struct hn_tx_ring *txr)
4074 if (txr->hn_sched_tx)
4077 if (mtx_trylock(&txr->hn_tx_lock)) {
4080 txr->hn_oactive = 0;
4081 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4082 mtx_unlock(&txr->hn_tx_lock);
4084 taskqueue_enqueue(txr->hn_tx_taskq,
4090 * Release the oactive earlier, with the hope, that
4091 * others could catch up. The task will clear the
4092 * oactive again with the hn_tx_lock to avoid possible
4095 txr->hn_oactive = 0;
4096 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
/*
 * hn_xmit_taskfunc -- deferred TX task: drain the ring under hn_tx_lock.
 */
4101 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4103 struct hn_tx_ring *txr = xtxr;
4105 mtx_lock(&txr->hn_tx_lock);
4107 mtx_unlock(&txr->hn_tx_lock);
/*
 * hn_xmit_txeof_taskfunc -- deferred txeof task: clear hn_oactive under
 * hn_tx_lock, then restart the ring drain.
 */
4111 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4113 struct hn_tx_ring *txr = xtxr;
4115 mtx_lock(&txr->hn_tx_lock);
4116 txr->hn_oactive = 0;
4118 mtx_unlock(&txr->hn_tx_lock);
/*
 * hn_chan_attach -- link a vmbus channel (primary or sub-channel) to the
 * RX ring -- and TX ring, when the sub-channel index is within
 * hn_tx_ring_inuse -- bind it to a CPU, and open it with the per-ring
 * bufring; hn_chan_callback is registered with the RX ring as its arg.
 * On open failure the ATTACHED flags set above are rolled back.
 */
4122 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4124 struct vmbus_chan_br cbr;
4125 struct hn_rx_ring *rxr;
4126 struct hn_tx_ring *txr = NULL;
4129 idx = vmbus_chan_subidx(chan);
4132 * Link this channel to RX/TX ring.
4134 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4135 ("invalid channel index %d, should > 0 && < %d",
4136 idx, sc->hn_rx_ring_inuse));
4137 rxr = &sc->hn_rx_ring[idx];
4138 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4139 ("RX ring %d already attached", idx));
4140 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4143 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4144 idx, vmbus_chan_id(chan));
/* Only the first hn_tx_ring_inuse channels carry a TX ring. */
4147 if (idx < sc->hn_tx_ring_inuse) {
4148 txr = &sc->hn_tx_ring[idx];
4149 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4150 ("TX ring %d already attached", idx));
4151 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4153 txr->hn_chan = chan;
4155 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4156 idx, vmbus_chan_id(chan));
4160 /* Bind this channel to a proper CPU. */
4161 vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus);
4166 cbr.cbr = rxr->hn_br;
4167 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4168 cbr.cbr_txsz = HN_TXBR_SIZE;
4169 cbr.cbr_rxsz = HN_RXBR_SIZE;
4170 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
/* Open failed: undo the ATTACHED flags set above. */
4172 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4173 vmbus_chan_id(chan), error);
4174 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4176 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
/*
 * hn_chan_detach -- unlink a vmbus channel from its RX ring (and TX ring,
 * if the sub-channel index carries one), then close the channel.  Closing
 * does _not_ destroy a sub-channel; callers drain them separately.
 */
4182 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4184 struct hn_rx_ring *rxr;
4187 idx = vmbus_chan_subidx(chan);
4190 * Unlink this channel from its RX/TX ring.
4192 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4193 ("invalid channel index %d, should > 0 && < %d",
4194 idx, sc->hn_rx_ring_inuse));
4195 rxr = &sc->hn_rx_ring[idx];
4196 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4197 ("RX ring %d is not attached", idx));
4198 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
/* Only the first hn_tx_ring_inuse channels carry a TX ring. */
4200 if (idx < sc->hn_tx_ring_inuse) {
4201 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4203 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4204 ("TX ring %d is not attached", idx))
4205 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4209 * Close this channel.
4212 * Channel closing does _not_ destroy the target channel.
4214 vmbus_chan_close(chan);
/*
 * hn_attach_subchans -- attach every allocated sub-channel (count is
 * hn_rx_ring_inuse - 1); no-op when only the primary channel is in use.
 */
4218 hn_attach_subchans(struct hn_softc *sc)
4220 struct vmbus_channel **subchans;
4221 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4224 if (subchan_cnt == 0)
4227 /* Attach the sub-channels. */
4228 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4229 for (i = 0; i < subchan_cnt; ++i) {
4230 error = hn_chan_attach(sc, subchans[i]);
4234 vmbus_subchan_rel(subchans, subchan_cnt);
4237 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4240 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
/*
 * hn_detach_allchans -- detach the sub-channels first, then the primary
 * channel, drain sub-channel destruction, and assert that no RX/TX ring is
 * left attached.
 */
4248 hn_detach_allchans(struct hn_softc *sc)
4250 struct vmbus_channel **subchans;
4251 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4254 if (subchan_cnt == 0)
4257 /* Detach the sub-channels. */
4258 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4259 for (i = 0; i < subchan_cnt; ++i)
4260 hn_chan_detach(sc, subchans[i]);
4261 vmbus_subchan_rel(subchans, subchan_cnt);
4265 * Detach the primary channel, _after_ all sub-channels
4268 hn_chan_detach(sc, sc->hn_prichan);
4270 /* Wait for sub-channels to be destroyed, if any. */
4271 vmbus_subchan_drain(sc->hn_prichan);
4274 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4275 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4276 HN_RX_FLAG_ATTACHED) == 0,
4277 ("%dth RX ring is still attached", i));
4279 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4280 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4281 HN_TX_FLAG_ATTACHED) == 0,
4282 ("%dth TX ring is still attached", i));
/*
 * hn_synth_alloc_subchans -- negotiate the sub-channel count.  On input
 * *nsubch is the requested number of sub-channels; on return it holds how
 * many NVS actually granted (0 means primary channel only).  The request
 * is capped by the RSS capabilities the device reports.
 */
4288 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4290 struct vmbus_channel **subchans;
4291 int nchan, rxr_cnt, error;
4293 nchan = *nsubch + 1;
4296 * Multiple RX/TX rings are not requested.
4303 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4306 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4308 /* No RSS; this is benign. */
4313 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
/* Clamp the request to what the device offers. */
4317 if (nchan > rxr_cnt)
4320 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4326 * Allocate sub-channels from NVS.
4328 *nsubch = nchan - 1;
4329 error = hn_nvs_alloc_subchans(sc, nsubch);
4330 if (error || *nsubch == 0) {
4331 /* Failed to allocate sub-channels. */
4337 * Wait for all sub-channels to become ready before moving on.
4339 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4340 vmbus_subchan_rel(subchans, *nsubch);
/*
 * hn_synth_attach -- bring up the synthetic NIC: attach the primary
 * channel, then NVS, then RNDIS (order matters); verify the device caps
 * did not change across a re-attach; allocate/attach sub-channels; and
 * program the RSS key and indirect table before marking SYNTH_ATTACHED.
 */
4345 hn_synth_attach(struct hn_softc *sc, int mtu)
4347 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4348 int error, nsubch, nchan, i;
4351 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4352 ("synthetic parts were attached"));
4354 /* Save capabilities for later verification. */
4355 old_caps = sc->hn_caps;
4358 /* Clear RSS stuffs. */
4359 sc->hn_rss_ind_size = 0;
4360 sc->hn_rss_hash = 0;
4363 * Attach the primary channel _before_ attaching NVS and RNDIS.
4365 error = hn_chan_attach(sc, sc->hn_prichan);
4372 error = hn_nvs_attach(sc, mtu);
4377 * Attach RNDIS _after_ NVS is attached.
4379 error = hn_rndis_attach(sc, mtu);
4384 * Make sure capabilities are not changed.
4386 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4387 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4388 old_caps, sc->hn_caps);
4389 /* Restore old capabilities and abort. */
4390 sc->hn_caps = old_caps;
4395 * Allocate sub-channels for multi-TX/RX rings.
4398 * The # of RX rings that can be used is equivalent to the # of
4399 * channels to be requested.
4401 nsubch = sc->hn_rx_ring_cnt - 1;
4402 error = hn_synth_alloc_subchans(sc, &nsubch);
4408 /* Only the primary channel can be used; done */
4413 * Configure RSS key and indirect table _after_ all sub-channels
4417 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4419 * RSS key is not set yet; set it to the default RSS key.
4422 if_printf(sc->hn_ifp, "setup default RSS key\n");
4423 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4424 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4427 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4429 * RSS indirect table is not set yet; set it up in round-
4433 if_printf(sc->hn_ifp, "setup default RSS indirect "
4436 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
4437 rss->rss_ind[i] = i % nchan;
4438 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4441 * # of usable channels may be changed, so we have to
4442 * make sure that all entries in RSS indirect table
4445 hn_rss_ind_fixup(sc, nchan);
4448 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4451 * Failed to configure RSS key or indirect table; only
4452 * the primary channel can be used.
4458 * Set the # of TX/RX rings that could be used according to
4459 * the # of channels that NVS offered.
4461 hn_set_ring_inuse(sc, nchan);
4464 * Attach the sub-channels, if any.
4466 error = hn_attach_subchans(sc);
4471 * Fixup transmission aggregation setup.
4475 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4481 * The interface must have been suspended through hn_suspend(), before
4482 * this function get called.
/*
 * hn_synth_detach -- tear down the synthetic NIC: RNDIS first, then all
 * channels, then clear SYNTH_ATTACHED.
 */
4485 hn_synth_detach(struct hn_softc *sc)
4489 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4490 ("synthetic parts were not attached"));
4492 /* Detach the RNDIS first. */
4493 hn_rndis_detach(sc);
4498 /* Detach all of the channels. */
4499 hn_detach_allchans(sc);
4501 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
/*
 * hn_set_ring_inuse -- set the number of RX rings in use to ring_cnt, and
 * the number of TX rings in use to min(ring_cnt, hn_tx_ring_cnt).
 */
4505 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4507 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4508 ("invalid ring count %d", ring_cnt));
4510 if (sc->hn_tx_ring_cnt > ring_cnt)
4511 sc->hn_tx_ring_inuse = ring_cnt;
4513 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4514 sc->hn_rx_ring_inuse = ring_cnt;
4517 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4518 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
/*
 * hn_chan_drain -- busy-wait until the channel's RX and TX bufrings are
 * empty, then drain any pending channel interrupt/callback.
 */
4523 hn_chan_drain(struct vmbus_channel *chan)
4526 while (!vmbus_chan_rx_empty(chan) || !vmbus_chan_tx_empty(chan))
4528 vmbus_chan_intr_drain(chan);
/*
 * hn_suspend_data -- quiesce the data path: mark every in-use TX ring
 * suspended, wait for in-flight sends, drain the TX tasks, disable RX via
 * the RNDIS filter, give the device time to flush, then drain all channel
 * bufrings/interrupts (sub-channels, then the primary).
 */
4532 hn_suspend_data(struct hn_softc *sc)
4534 struct vmbus_channel **subch = NULL;
4542 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4543 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4545 mtx_lock(&txr->hn_tx_lock);
4546 txr->hn_suspended = 1;
4547 mtx_unlock(&txr->hn_tx_lock);
4548 /* No one is able to send more packets now. */
4550 /* Wait for all pending sends to finish. */
4551 while (hn_tx_ring_pending(txr))
4552 pause("hnwtx", 1 /* 1 tick */);
4554 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
4555 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
4559 * Disable RX by clearing RX filter.
4561 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
4562 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);
4565 * Give RNDIS enough time to flush all pending data packets.
4567 pause("waitrx", (200 * hz) / 1000);
4570 * Drain RX/TX bufrings and interrupts.
4572 nsubch = sc->hn_rx_ring_inuse - 1;
4574 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4576 if (subch != NULL) {
4577 for (i = 0; i < nsubch; ++i)
4578 hn_chan_drain(subch[i]);
4580 hn_chan_drain(sc->hn_prichan);
4583 vmbus_subchan_rel(subch, nsubch);
/*
 * hn_suspend_mgmt_taskfunc -- runs on the primary channel's task context
 * to null out hn_mgmt_taskq, so no new management work can be scheduled.
 */
4587 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
4590 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
/*
 * hn_suspend_mgmt -- detach hn_mgmt_taskq via a task run on the primary
 * channel (serialized with channel callbacks), then drain all outstanding
 * management work from hn_mgmt_taskq0.
 */
4594 hn_suspend_mgmt(struct hn_softc *sc)
4601 * Make sure that hn_mgmt_taskq0 can no longer be accessed
4602 * through hn_mgmt_taskq.
4604 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
4605 vmbus_chan_run_task(sc->hn_prichan, &task);
4608 * Make sure that all pending management tasks are completed.
4610 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
4611 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
4612 taskqueue_drain_all(sc->hn_mgmt_taskq0);
/*
 * hn_suspend -- suspend the data path only if the interface is running;
 * the management path is always suspended.
 */
4616 hn_suspend(struct hn_softc *sc)
4619 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4620 hn_suspend_data(sc);
4621 hn_suspend_mgmt(sc);
/*
 * hn_resume_tx -- clear the suspended flag on the first tx_ring_cnt TX
 * rings, under each ring's TX lock.
 */
4625 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
4629 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
4630 ("invalid TX ring count %d", tx_ring_cnt));
4632 for (i = 0; i < tx_ring_cnt; ++i) {
4633 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4635 mtx_lock(&txr->hn_tx_lock);
4636 txr->hn_suspended = 0;
4637 mtx_unlock(&txr->hn_tx_lock);
/*
 * hn_resume_data -- re-enable the data path: restore the RX filter,
 * un-suspend ALL TX rings (hn_tx_ring_inuse may have changed while
 * suspended), flush drbrs of rings that fell out of use, and kick each
 * in-use ring's txeof task to restart transmission.
 */
4642 hn_resume_data(struct hn_softc *sc)
4651 hn_set_rxfilter(sc);
4654 * Make sure to clear suspend status on "all" TX rings,
4655 * since hn_tx_ring_inuse can be changed after
4656 * hn_suspend_data().
4658 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
4660 #ifdef HN_IFSTART_SUPPORT
4661 if (!hn_use_if_start)
4665 * Flush unused drbrs, since hn_tx_ring_inuse may be
4668 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
4669 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4675 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4676 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4679 * Use txeof task, so that any pending oactive can be
4682 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
/*
 * hn_resume_mgmt -- re-expose the management taskqueue, then restart
 * either network-change detection (if one was pending) or the lighter
 * link-status check.
 */
4687 hn_resume_mgmt(struct hn_softc *sc)
4690 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
4693 * Kick off network change detection, if it was pending.
4694 * If no network change was pending, start link status
4695 * checks, which is more lightweight than network change
4698 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
4699 hn_change_network(sc);
4701 hn_update_link_status(sc);
/*
 * hn_resume -- counterpart of hn_suspend(); resumes the data path only
 * when the interface is running.  NOTE(review): the body is truncated in
 * this excerpt -- presumably it calls hn_resume_data()/hn_resume_mgmt();
 * verify against the full source.
 */
4705 hn_resume(struct hn_softc *sc)
4708 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
/*
 * hn_rndis_rx_status -- handle an RNDIS status indication: media
 * connect/disconnect updates link status; NETWORK_CHANGE (with an
 * optionally validated status buffer) kicks network-change processing;
 * other statuses are logged or ignored.
 */
4714 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
4716 const struct rndis_status_msg *msg;
4719 if (dlen < sizeof(*msg)) {
4720 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
4725 switch (msg->rm_status) {
4726 case RNDIS_STATUS_MEDIA_CONNECT:
4727 case RNDIS_STATUS_MEDIA_DISCONNECT:
4728 hn_update_link_status(sc);
4731 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
4732 /* Not really useful; ignore. */
4735 case RNDIS_STATUS_NETWORK_CHANGE:
4736 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
/* Status buffer absent or malformed; still treat it as a change. */
4737 if (dlen < ofs + msg->rm_stbuflen ||
4738 msg->rm_stbuflen < sizeof(uint32_t)) {
4739 if_printf(sc->hn_ifp, "network changed\n");
4743 memcpy(&change, ((const uint8_t *)msg) + ofs,
4745 if_printf(sc->hn_ifp, "network changed, change %u\n",
4748 hn_change_network(sc);
4752 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
/*
 * hn_rndis_rxinfo -- walk the RNDIS per-packet-info list, validating each
 * element's size/alignment/offset, and extract VLAN, checksum, hash value
 * and hash info into *info.  Stops early once all four are found; if no
 * hash value was present, the hash info is invalidated on exit.
 */
4759 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
4761 const struct rndis_pktinfo *pi = info_data;
4764 while (info_dlen != 0) {
/* Per-element sanity: header fits, declared size fits, aligned, offset sane. */
4768 if (__predict_false(info_dlen < sizeof(*pi)))
4770 if (__predict_false(info_dlen < pi->rm_size))
4772 info_dlen -= pi->rm_size;
4774 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
4776 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
4778 dlen = pi->rm_size - pi->rm_pktinfooffset;
4781 switch (pi->rm_type) {
4782 case NDIS_PKTINFO_TYPE_VLAN:
4783 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
4785 info->vlan_info = *((const uint32_t *)data);
4786 mask |= HN_RXINFO_VLAN;
4789 case NDIS_PKTINFO_TYPE_CSUM:
4790 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
4792 info->csum_info = *((const uint32_t *)data);
4793 mask |= HN_RXINFO_CSUM;
4796 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
4797 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
4799 info->hash_value = *((const uint32_t *)data);
4800 mask |= HN_RXINFO_HASHVAL;
4803 case HN_NDIS_PKTINFO_TYPE_HASHINF:
4804 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
4806 info->hash_info = *((const uint32_t *)data);
4807 mask |= HN_RXINFO_HASHINF;
4814 if (mask == HN_RXINFO_ALL) {
4815 /* All found; done */
4819 pi = (const struct rndis_pktinfo *)
4820 ((const uint8_t *)pi + pi->rm_size);
4825 * - If there is no hash value, invalidate the hash info.
4827 if ((mask & HN_RXINFO_HASHVAL) == 0)
4828 info->hash_info = HN_NDIS_HASH_INFO_INVALID;
/*
 * hn_rndis_check_overlap -- return whether the byte range [off, off+len)
 * overlaps [check_off, check_off+check_len).  Disjoint ranges return false
 * via the early-out branches.
 */
4832 static __inline bool
4833 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
4836 if (off < check_off) {
4837 if (__predict_true(off + len <= check_off))
4839 } else if (off > check_off) {
4840 if (__predict_true(check_off + check_len <= off))
/*
 * hn_rndis_rx_data -- validate and deliver one RNDIS data packet.  Checks
 * overall length, section lengths, offset alignment, and pairwise overlap
 * between the data, OOB-data and per-packet-info regions; then extracts
 * per-packet-info (VLAN/csum/hash) and hands the payload to hn_rxpkt().
 * Every failing check drops the packet (after logging).
 */
4847 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
4849 const struct rndis_packet_msg *pkt;
4850 struct hn_rxinfo info;
4851 int data_off, pktinfo_off, data_len, pktinfo_len;
4856 if (__predict_false(dlen < sizeof(*pkt))) {
4857 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
4862 if (__predict_false(dlen < pkt->rm_len)) {
4863 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
4864 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
4867 if (__predict_false(pkt->rm_len <
4868 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
4869 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
4870 "msglen %u, data %u, oob %u, pktinfo %u\n",
4871 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
4872 pkt->rm_pktinfolen);
4875 if (__predict_false(pkt->rm_datalen == 0)) {
4876 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
4883 #define IS_OFFSET_INVALID(ofs) \
4884 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
4885 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
4887 /* XXX Hyper-V does not meet data offset alignment requirement */
4888 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
4889 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4890 "data offset %u\n", pkt->rm_dataoffset);
4893 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
4894 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
4895 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4896 "oob offset %u\n", pkt->rm_oobdataoffset);
4899 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
4900 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
4901 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4902 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
4906 #undef IS_OFFSET_INVALID
4908 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
4909 data_len = pkt->rm_datalen;
4910 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
4911 pktinfo_len = pkt->rm_pktinfolen;
4914 * Check OOB coverage.
4916 if (__predict_false(pkt->rm_oobdatalen != 0)) {
4917 int oob_off, oob_len;
4919 if_printf(rxr->hn_ifp, "got oobdata\n");
4920 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
4921 oob_len = pkt->rm_oobdatalen;
4923 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
4924 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4925 "oob overflow, msglen %u, oob abs %d len %d\n",
4926 pkt->rm_len, oob_off, oob_len);
4931 * Check against data.
4933 if (hn_rndis_check_overlap(oob_off, oob_len,
4934 data_off, data_len)) {
4935 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4936 "oob overlaps data, oob abs %d len %d, "
4937 "data abs %d len %d\n",
4938 oob_off, oob_len, data_off, data_len);
4943 * Check against pktinfo.
4945 if (pktinfo_len != 0 &&
4946 hn_rndis_check_overlap(oob_off, oob_len,
4947 pktinfo_off, pktinfo_len)) {
4948 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4949 "oob overlaps pktinfo, oob abs %d len %d, "
4950 "pktinfo abs %d len %d\n",
4951 oob_off, oob_len, pktinfo_off, pktinfo_len);
4957 * Check per-packet-info coverage and find useful per-packet-info.
4959 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
4960 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
4961 info.hash_info = HN_NDIS_HASH_INFO_INVALID;
4962 if (__predict_true(pktinfo_len != 0)) {
4966 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
4967 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4968 "pktinfo overflow, msglen %u, "
4969 "pktinfo abs %d len %d\n",
4970 pkt->rm_len, pktinfo_off, pktinfo_len);
4975 * Check packet info coverage.
4977 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
4978 data_off, data_len);
4979 if (__predict_false(overlap)) {
4980 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
4981 "pktinfo overlap data, pktinfo abs %d len %d, "
4982 "data abs %d len %d\n",
4983 pktinfo_off, pktinfo_len, data_off, data_len);
4988 * Find useful per-packet-info.
4990 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
4991 pktinfo_len, &info);
4992 if (__predict_false(error)) {
4993 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
4999 if (__predict_false(data_off + data_len > pkt->rm_len)) {
5000 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5001 "data overflow, msglen %u, data abs %d len %d\n",
5002 pkt->rm_len, data_off, data_len);
5005 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
/*
 * hn_rndis_rxpkt -- dispatch one received RNDIS message by type: data
 * packets (hot path), status indications, or control completions.
 */
5008 static __inline void
5009 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
5011 const struct rndis_msghdr *hdr;
5013 if (__predict_false(dlen < sizeof(*hdr))) {
5014 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
5019 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
5020 /* Hot data path. */
5021 hn_rndis_rx_data(rxr, data, dlen);
5026 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
5027 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
5029 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
/*
 * hn_nvs_handle_notify -- handle an inband NVS notification; TXTBL_NOTE is
 * ignored, anything else is just logged.
 */
5033 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
5035 const struct hn_nvs_hdr *hdr;
5037 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
5038 if_printf(sc->hn_ifp, "invalid nvs notify\n");
5041 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
5043 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
5044 /* Useless; ignore */
5047 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
/*
 * hn_nvs_handle_comp -- completion packet: recover the send context from
 * the transaction id and invoke its callback with the completion data.
 */
5051 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
5052 const struct vmbus_chanpkt_hdr *pkt)
5054 struct hn_nvs_sendctx *sndc;
5056 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
5057 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
5058 VMBUS_CHANPKT_DATALEN(pkt));
5061 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
/*
 * hn_nvs_handle_rxbuf -- handle an RXBUF channel packet carrying one or
 * more RNDIS messages in the shared RX buffer.  Validates the NVS header,
 * signature and range table, dispatches each range to hn_rndis_rxpkt(),
 * and finally acks the RXBUF so the hypervisor can recycle it.
 */
5067 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5068 const struct vmbus_chanpkt_hdr *pkthdr)
5070 const struct vmbus_chanpkt_rxbuf *pkt;
5071 const struct hn_nvs_hdr *nvs_hdr;
5074 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
5075 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
5078 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
5080 /* Make sure that this is a RNDIS message. */
5081 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
5082 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
5087 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
5088 if (__predict_false(hlen < sizeof(*pkt))) {
5089 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
5092 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
5094 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
5095 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
/* Range table must fit inside the channel packet header. */
5100 count = pkt->cp_rxbuf_cnt;
5101 if (__predict_false(hlen <
5102 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
5103 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
5107 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
5108 for (i = 0; i < count; ++i) {
5111 ofs = pkt->cp_rxbuf[i].rb_ofs;
5112 len = pkt->cp_rxbuf[i].rb_len;
5113 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
5114 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
5115 "ofs %d, len %d\n", i, ofs, len);
5118 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
5122 * Ack the consumed RXBUF associated w/ this channel packet,
5123 * so that this RXBUF can be recycled by the hypervisor.
5125 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
/*
 * hn_nvs_ack_rxbuf -- send an RNDIS_ACK completion for a consumed RXBUF.
 * An EAGAIN (bufring full) is retried (logged once via hn_ack_failed);
 * other errors are logged as a hard ack failure.
 */
5129 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5132 struct hn_nvs_rndis_ack ack;
5135 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
5136 ack.nvs_status = HN_NVS_STATUS_OK;
5140 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
5141 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
5142 if (__predict_false(error == EAGAIN)) {
5145 * This should _not_ happen in real world, since the
5146 * consumption of the TX bufring from the TX path is
5149 if (rxr->hn_ack_failed == 0)
5150 if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
5151 rxr->hn_ack_failed++;
5158 if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
/*
 * hn_chan_callback -- per-channel receive callback.  Pulls channel packets
 * into the ring's pktbuf (growing it with M_WAITOK on ENOBUFS), dispatches
 * by packet type (completion / rxbuf / inband notify), and rolls up RX/TX
 * work when the channel is drained (EAGAIN).
 */
5163 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
5165 struct hn_rx_ring *rxr = xrxr;
5166 struct hn_softc *sc = rxr->hn_ifp->if_softc;
5169 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
5172 pktlen = rxr->hn_pktbuf_len;
5173 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
5174 if (__predict_false(error == ENOBUFS)) {
5179 * Expand channel packet buffer.
5182 * Use M_WAITOK here, since allocation failure
/* Double until the reported packet length fits. */
5185 nlen = rxr->hn_pktbuf_len * 2;
5186 while (nlen < pktlen)
5188 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
5190 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
5191 rxr->hn_pktbuf_len, nlen);
5193 free(rxr->hn_pktbuf, M_DEVBUF);
5194 rxr->hn_pktbuf = nbuf;
5195 rxr->hn_pktbuf_len = nlen;
5198 } else if (__predict_false(error == EAGAIN)) {
5199 /* No more channel packets; done! */
5202 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
5204 switch (pkt->cph_type) {
5205 case VMBUS_CHANPKT_TYPE_COMP:
5206 hn_nvs_handle_comp(sc, chan, pkt);
5209 case VMBUS_CHANPKT_TYPE_RXBUF:
5210 hn_nvs_handle_rxbuf(rxr, chan, pkt);
5213 case VMBUS_CHANPKT_TYPE_INBAND:
5214 hn_nvs_handle_notify(sc, pkt);
5218 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
5223 hn_chan_rollup(rxr, rxr->hn_txr);
/*
 * hn_tx_taskq_create -- SYSINIT hook: on Hyper-V guests, optionally create
 * a single shared TX taskqueue and, if hn_bind_tx_taskq selects a valid
 * CPU, pin its thread to that CPU via a one-shot cpuset task.
 */
5227 hn_tx_taskq_create(void *arg __unused)
5230 if (vm_guest != VM_GUEST_HV)
5233 if (!hn_share_tx_taskq)
5236 hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
5237 taskqueue_thread_enqueue, &hn_tx_taskq);
5238 taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
5239 if (hn_bind_tx_taskq >= 0) {
5240 int cpu = hn_bind_tx_taskq;
5241 struct task cpuset_task;
/* Clamp the requested CPU into the valid range. */
5244 if (cpu > mp_ncpus - 1)
5246 CPU_SETOF(cpu, &cpu_set);
5247 TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, &cpu_set);
5248 taskqueue_enqueue(hn_tx_taskq, &cpuset_task);
5249 taskqueue_drain(hn_tx_taskq, &cpuset_task);
5252 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5253 hn_tx_taskq_create, NULL);
/*
 * hn_tx_taskq_destroy -- SYSUNINIT hook: free the shared TX taskqueue if
 * one was created.
 */
5256 hn_tx_taskq_destroy(void *arg __unused)
5259 if (hn_tx_taskq != NULL)
5260 taskqueue_free(hn_tx_taskq);
5262 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
5263 hn_tx_taskq_destroy, NULL);