/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/ethernet.h>
#include <net/if_arp.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"
#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE
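
/*
 * NOTE:
 * HN_RNDIS_PKT_LEN reserves worst case space for an RNDIS packet
 * message plus every per-packet-info this driver may attach in
 * hn_encap(): hash value, VLAN tag, LSO and TX checksum.
 */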
#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1
#define HN_LOCK_INIT(sc)					\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)			sx_xlock(&(sc)->hn_lock)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
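
/*
 * Example: with a 64-byte aggregation alignment, a 1514-byte frame
 * consumes HN_PKTSIZE() == roundup2(1514 + HN_RNDIS_PKT_LEN, 64)
 * bytes of the chimney sending buffer when it is aggregated.
 */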
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);

static void			hn_stop(struct hn_softc *);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);

static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct vmbus_channel *);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static int			hn_set_rxfilter(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *, int);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");
/* Trust tcp segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust udp datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");
/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

/* Use shared TX taskqueue */
static int hn_share_tx_taskq = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN,
    &hn_share_tx_taskq, 0, "Enable shared TX taskqueue");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

/* Bind TX taskqueue to the target CPU */
static int hn_bind_tx_taskq = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
    &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue *hn_tx_taskq;	/* shared TX taskqueue */
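
/*
 * The default RSS key below is the 40-byte Toeplitz key suggested by
 * Microsoft's RSS specification.
 */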
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif
static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}
static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
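
	/*
	 * Scan the chimney bitmap lock-free: ffsl() on the inverted
	 * word locates the first clear bit and atomic_testandset_long()
	 * claims it; if another CPU wins the race, simply keep scanning.
	 */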
	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}
static void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
#if defined(INET6) || defined(INET)

/*
 * NOTE: If this function fails, the m_head will have been freed.
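 *
 * The host performs the actual segmentation (LSO); it expects th_sum
 * to be preset to the pseudo header checksum, computed without the
 * TCP payload length, which is what this fixup provides.
 */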
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
#undef PULLUP_HDR
}
#endif	/* INET6 || INET */
static int
hn_set_rxfilter(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (ifp->if_flags & IFF_PROMISC) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
#ifdef notyet
		/*
		 * See the comment in SIOCADDMULTI/SIOCDELMULTI.
		 */
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
#else
		/* Always enable ALLMULTI */
		filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
#endif
	}

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}
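
/*
 * Clamp the TX aggregation size and packet count to what both the
 * local tunables and the host's RNDIS-offered limits allow, then
 * propagate the final settings to each TX ring.
 */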
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return (txr->hn_txdesc_cnt);
	return (hn_tx_swq_depth);
}
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
static void
hn_rss_ind_fixup(struct hn_softc *sc, int nchan)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i;

	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return (EOPNOTSUPP);
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}
/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
	    0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};
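
/*
 * NOTE:
 * The first three GUID fields are stored little-endian, which is why
 * the byte order above differs from the GUID string in the comment.
 */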
static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
	    &g_net_vsc_device_type) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return (BUS_PROBE_DEFAULT);
	}
	return (ENXIO);
}
static void
hn_cpuset_setthread_task(void *xmask, int pending __unused)
{
	cpuset_t *mask = xmask;
	int error;

	error = cpuset_setthread(curthread->td_tid, mask);
	if (error) {
		panic("curthread=%ju: can't pin; error=%d",
		    (uintmax_t)curthread->td_tid, error);
	}
}
static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq == NULL) {
		sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
		    taskqueue_thread_enqueue, &sc->hn_tx_taskq);
		taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx",
		    device_get_nameunit(dev));
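		/*
		 * hn_cpuset_setthread_task() pins the calling thread,
		 * so enqueue it on the TX taskqueue and drain it; the
		 * task then runs on, and pins, the taskqueue thread.
		 */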
		if (hn_bind_tx_taskq >= 0) {
			int cpu = hn_bind_tx_taskq;
			struct task cpuset_task;
			cpuset_t cpu_set;

			if (cpu > mp_ncpus - 1)
				cpu = mp_ncpus - 1;
			CPU_SETOF(cpu, &cpu_set);
			TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task,
			    &cpu_set);
			taskqueue_enqueue(sc->hn_tx_taskq, &cpuset_task);
			taskqueue_drain(sc->hn_tx_taskq, &cpuset_task);
		}
	} else {
		sc->hn_tx_taskq = hn_tx_taskq;
	}
	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions, which will be called after
	 * synthetic parts are attached.
	 */
	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if error happened later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fixup TX stuff after synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);
	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */

#ifdef __LP64__
	ifp->if_baudrate = IF_Gbps(10);
#else
	/* if_baudrate is 32bits on 32bit system. */
	ifp->if_baudrate = IF_Gbps(1);
#endif
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
#ifdef foo
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	return (0);
failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}
static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp;

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * stuff has to be suspended manually here.
			 */
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);
		}
		HN_UNLOCK(sc);
		ether_ifdetach(ifp);
	}

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	if (sc->hn_tx_taskq != hn_tx_taskq)
		taskqueue_free(sc->hn_tx_taskq);
	taskqueue_free(sc->hn_mgmt_taskq0);

	if (sc->hn_xact != NULL)
		vmbus_xact_ctx_destroy(sc->hn_xact);

	if_free(ifp);

	HN_LOCK_DESTROY(sc);
	return (0);
}
static int
hn_shutdown(device_t dev)
{

	return (0);
}
static void
hn_link_status(struct hn_softc *sc)
{
	uint32_t link_status;
	int error;

	error = hn_rndis_get_linkstatus(sc, &link_status);
	if (error) {
		/* XXX what to do? */
		return;
	}

	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
	else
		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp,
	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
	    LINK_STATE_UP : LINK_STATE_DOWN);
}
static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		return;
	hn_link_status(sc);
}

static void
hn_netchg_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Prevent any link status checks from running. */
	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;

	/*
	 * Fake up a [link down --> link up] state change; 5 seconds
	 * delay is used, which closely simulates miibus reaction
	 * upon link down event.
	 */
	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
	    &sc->hn_netchg_status, 5 * hz);
}

static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Re-allow link status checks. */
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
	hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}
static int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m = *m_head;
	int error;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
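	/*
	 * EFBIG means the mbuf chain spans more segments than the DMA
	 * tag allows: collapse it down to HN_TX_DATA_SEGCNT_MAX
	 * segments and retry the load once.
	 */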
	if (error == EFBIG) {
		struct mbuf *m_new;

		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		if (m_new == NULL)
			return (ENOBUFS);
		*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	}
	if (!error) {
		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
		    BUS_DMASYNC_PREWRITE);
		txd->flags |= HN_TXD_FLAG_DMAMAP;
	}
	return (error);
}
static int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return (0);

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("recursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
	txr->hn_txdesc_avail++;
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	atomic_add_int(&txr->hn_txdesc_avail, 1);
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif

	return (1);
}
static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	txd = SLIST_FIRST(&txr->hn_txlist);
	if (txd != NULL) {
		KASSERT(txr->hn_txdesc_avail > 0,
		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
		txr->hn_txdesc_avail--;
		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
	}
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

	if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
		KASSERT(txd->m == NULL && txd->refs == 0 &&
		    STAILQ_EMPTY(&txd->agg_list) &&
		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
		    txd->chim_size == 0 &&
		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
		txd->flags &= ~HN_TXD_FLAG_ONLIST;
		txd->refs = 1;
	}
	return (txd);
}
static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

	/* 0->1 transition will never work */
	KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
	atomic_add_int(&txd->refs, 1);
}

static __inline void
hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
{

	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("recursive aggregation on aggregating txdesc"));

	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("already aggregated"));
	KASSERT(STAILQ_EMPTY(&txd->agg_list),
	    ("recursive aggregation on to-be-aggregated txdesc"));

	txd->flags |= HN_TXD_FLAG_ONAGG;
	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
}
static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
	bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
		pending = true;
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	if (!buf_ring_full(txr->hn_txdesc_br))
		pending = true;
#endif
	return (pending);
}
static __inline void
hn_txeof(struct hn_tx_ring *txr)
{
	txr->hn_has_txeof = 0;
	txr->hn_txeof(txr);
}
static void
hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
	struct hn_txdesc *txd = sndc->hn_cbarg;
	struct hn_tx_ring *txr;

	txr = txd->txr;
	KASSERT(txr->hn_chan == chan,
	    ("channel mismatch, on chan%u, should be chan%u",
	     vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan)));

	txr->hn_has_txeof = 1;
	hn_txdesc_put(txr, txd);

	++txr->hn_txdone_cnt;
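	/*
	 * Run txeof early once every HN_EARLY_TXEOF_THRESH completions
	 * while the ring is marked oactive, so that transmission can
	 * be resumed without waiting for a full channel rollup.
	 */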
	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
		txr->hn_txdone_cnt = 0;
		if (txr->hn_oactive)
			hn_txeof(txr);
	}
}
static void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
	struct lro_ctrl *lro = &rxr->hn_lro;
	struct lro_entry *queued;

	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
		SLIST_REMOVE_HEAD(&lro->lro_active, next);
		tcp_lro_flush(lro, queued);
	}
#endif

	/*
	 * NOTE:
	 * 'txr' could be NULL, if multiple channels and
	 * ifnet.if_start method are enabled.
	 */
	if (txr == NULL || !txr->hn_has_txeof)
		return;

	txr->hn_txdone_cnt = 0;
	hn_txeof(txr);
}
static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
	    ("invalid RNDIS packet msg offset %u", ofs));
	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}
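
/*
 * NOTE:
 * On the wire, offsets inside an RNDIS packet message are counted
 * from the rm_dataoffset field (i.e. past rm_type and rm_len), while
 * this driver builds them from the beginning of the message; the
 * conversion above is applied just before handing the message to
 * the host.
 */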
static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
{
	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
	struct rndis_pktinfo *pi;

	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

	/*
	 * Per-packet-info does not move; it only grows.
	 *
	 * NOTE:
	 * rm_pktinfooffset in this phase counts from the beginning
	 * of rndis_packet_msg.
	 */
	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
	    pkt->rm_pktinfolen);
	pkt->rm_pktinfolen += pi_size;

	pi->rm_size = pi_size;
	pi->rm_type = pi_type;
	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

	/* Data immediately follow per-packet-info. */
	pkt->rm_dataoffset += pi_size;

	/* Update RNDIS packet msg length */
	pkt->rm_len += pi_size;

	return (pi->rm_data);
}
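
/*
 * Resulting message layout, with offsets still counted from the
 * message start at this phase:
 *
 *	rndis_packet_msg | pktinfo 0 | ... | pktinfo N | packet data
 *
 * Each appended per-packet-info grows rm_pktinfolen and pushes
 * rm_dataoffset and rm_len forward by its aligned size.
 */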
static __inline int
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;
	struct mbuf *m;
	int error, pkts;

	txd = txr->hn_agg_txd;
	KASSERT(txd != NULL, ("no aggregate txdesc"));

	/*
	 * Since hn_txpkt() will reset this temporary stat, save
	 * it now, so that oerrors can be updated properly, if
	 * hn_txpkt() ever fails.
	 */
	pkts = txr->hn_stat_pkts;

	/*
	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
	 * failure, save it for later freeing, if hn_txpkt() ever
	 * fails.
	 */
	m = txd->m;
	error = hn_txpkt(ifp, txr, txd);
	if (__predict_false(error)) {
		/* txd is freed, but m is not. */
		m_freem(m);

		txr->hn_flush_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
	}

	/* Reset all aggregation states. */
	txr->hn_agg_txd = NULL;
	txr->hn_agg_szleft = 0;
	txr->hn_agg_pktleft = 0;
	txr->hn_agg_prevpkt = NULL;

	return (error);
}
static void *
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    int pktsize)
{
	void *chim;

	if (txr->hn_agg_txd != NULL) {
		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
			int olen;

			/*
			 * Update the previous RNDIS packet's total length,
			 * it can be increased due to the mandatory alignment
			 * padding for this RNDIS packet.  And update the
			 * aggregating txdesc's chimney sending buffer size
			 * accordingly.
			 *
			 * NOTE:
			 * Zero-out the padding, as required by the RNDIS spec.
			 */
			olen = pkt->rm_len;
			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
			agg_txd->chim_size += pkt->rm_len - olen;

			/* Link this txdesc to the parent. */
			hn_txdesc_agg(agg_txd, txd);

			chim = (uint8_t *)pkt + pkt->rm_len;
			/* Save the current packet for later fixup. */
			txr->hn_agg_prevpkt = chim;

			txr->hn_agg_pktleft--;
			txr->hn_agg_szleft -= pktsize;
			if (txr->hn_agg_szleft <=
			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
				/*
				 * Probably can't aggregate more packets,
				 * flush this aggregating txdesc proactively.
				 */
				txr->hn_agg_pktleft = 0;
			}
			/* Done! */
			return (chim);
		}
		hn_flush_txagg(ifp, txr);
	}
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	txr->hn_tx_chimney_tried++;
	txd->chim_index = hn_chim_alloc(txr->hn_sc);
	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
		return (NULL);
	txr->hn_tx_chimney++;

	chim = txr->hn_sc->hn_chim +
	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);

	if (txr->hn_agg_pktmax > 1 &&
	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
		txr->hn_agg_txd = txd;
		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
		txr->hn_agg_prevpkt = chim;
	}
	return (chim);
}
/*
 * NOTE:
 * If this function fails, then both txd and m_head0 will be freed.
 */
static int
hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head0)
{
	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
	int error, nsegs, i;
	struct mbuf *m_head = *m_head0;
	struct rndis_packet_msg *pkt;
	uint32_t *pi_data;
	void *chim = NULL;
	int pkt_hlen, pkt_size;

	pkt = txd->rndis_pkt;
	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
	if (pkt_size < txr->hn_chim_size) {
		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
		if (chim != NULL)
			pkt = chim;
	} else {
		if (txr->hn_agg_txd != NULL)
			hn_flush_txagg(ifp, txr);
	}

	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
	pkt->rm_dataoffset = sizeof(*pkt);
	pkt->rm_datalen = m_head->m_pkthdr.len;
	pkt->rm_oobdataoffset = 0;
	pkt->rm_oobdatalen = 0;
	pkt->rm_oobdataelements = 0;
	pkt->rm_pktinfooffset = sizeof(*pkt);
	pkt->rm_pktinfolen = 0;
	pkt->rm_vchandle = 0;
	pkt->rm_reserved = 0;

	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
		/*
		 * Set the hash value for this packet, so that the host could
		 * dispatch the TX done event for this packet back to this TX
		 * ring's channel.
		 */
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
		*pi_data = txr->hn_tx_idx;
	}

	if (m_head->m_flags & M_VLANTAG) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
		*pi_data = NDIS_VLAN_INFO_MAKE(
		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
	}

	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
#ifdef INET
		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET6
		{
			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#endif	/* INET6 || INET */
	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
		if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
			*pi_data = NDIS_TXCSUM_INFO_IPV6;
		} else {
			*pi_data = NDIS_TXCSUM_INFO_IPV4;
			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
		}

		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
		else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP6_UDP))
			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
	}

	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
	/* Convert RNDIS packet message offsets */
	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);

	/*
	 * Fast path: Chimney sending.
	 */
	if (chim != NULL) {
		struct hn_txdesc *tgt_txd = txd;

		if (txr->hn_agg_txd != NULL)
			tgt_txd = txr->hn_agg_txd;

		KASSERT(pkt == chim,
		    ("RNDIS pkt not in chimney sending buffer"));
		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
		    ("chimney sending buffer is not used"));
		tgt_txd->chim_size += pkt->rm_len;

		m_copydata(m_head, 0, m_head->m_pkthdr.len,
		    ((uint8_t *)chim) + pkt_hlen);

		txr->hn_gpa_cnt = 0;
		txr->hn_sendpkt = hn_txpkt_chim;
		goto done;
	}

	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
	    ("chimney buffer is used"));
	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));

	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
	if (__predict_false(error)) {
		int freed;

		/*
		 * This mbuf is not linked w/ the txd yet, so free it now.
		 */
		m_freem(m_head);
		*m_head0 = NULL;

		freed = hn_txdesc_put(txr, txd);
		KASSERT(freed,
		    ("fail to free txd upon txdma error"));

		txr->hn_txdma_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
		return (error);
	}
	*m_head0 = m_head;

	/* +1 RNDIS packet message */
	txr->hn_gpa_cnt = nsegs + 1;

	/* send packet with page buffer */
	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
	txr->hn_gpa[0].gpa_len = pkt_hlen;

	/*
	 * Fill the page buffers with mbuf info after the page
	 * buffer for RNDIS packet message.
	 */
	for (i = 0; i < nsegs; ++i) {
		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];

		gpa->gpa_page = atop(segs[i].ds_addr);
		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
		gpa->gpa_len = segs[i].ds_len;
	}

	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
	txd->chim_size = 0;
	txr->hn_sendpkt = hn_txpkt_sglist;
done:
	txd->m = m_head;

	/* Set the completion routine */
	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);

	/* Update temporary stats for later use. */
	txr->hn_stat_pkts++;
	txr->hn_stat_size += m_head->m_pkthdr.len;
	if (m_head->m_flags & M_MCAST)
		txr->hn_stat_mcasts++;

	return (0);
}
/*
 * NOTE:
 * If this function fails, then txd will be freed, but the mbuf
 * associated w/ the txd will _not_ be freed.
 */
static int
hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	int error, send_failed = 0;

again:
	/*
	 * Make sure that this txd and any aggregated txds are not freed
	 * before ETHER_BPF_MTAP.
	 */
	hn_txdesc_hold(txd);
	error = txr->hn_sendpkt(txr, txd);
	if (!error) {
		if (bpf_peers_present(ifp->if_bpf)) {
			const struct hn_txdesc *tmp_txd;

			ETHER_BPF_MTAP(ifp, txd->m);
			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
				ETHER_BPF_MTAP(ifp, tmp_txd->m);
		}

		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
#ifdef HN_IFSTART_SUPPORT
		if (!hn_use_if_start)
#endif
		{
			if_inc_counter(ifp, IFCOUNTER_OBYTES,
			    txr->hn_stat_size);
			if (txr->hn_stat_mcasts != 0) {
				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
				    txr->hn_stat_mcasts);
			}
		}
		txr->hn_pkts += txr->hn_stat_pkts;
	}
	hn_txdesc_put(txr, txd);

	if (__predict_false(error)) {
		int freed;

		/*
		 * This should "really rarely" happen.
		 *
		 * XXX Too many RX to be acked or too many sideband
		 * commands to run?  Ask netvsc_channel_rollup()
		 * to kick start later.
		 */
		txr->hn_has_txeof = 1;
		if (!send_failed) {
			txr->hn_send_failed++;
			send_failed = 1;
			/*
			 * Try sending again after setting hn_has_txeof;
			 * in case that we missed the last
			 * netvsc_channel_rollup().
			 */
			goto again;
		}
		if_printf(ifp, "send failed\n");

		/*
		 * Caller will perform further processing on the
		 * associated mbuf, so don't free it in hn_txdesc_put();
		 * only unload it from the DMA map in hn_txdesc_put(),
		 * if it was loaded.
		 */
		txd->m = NULL;
		freed = hn_txdesc_put(txr, txd);
		KASSERT(freed,
		    ("fail to free txd upon send error"));

		txr->hn_send_failed++;
	}

	/* Reset temporary stats, after this sending is done. */
	txr->hn_stat_size = 0;
	txr->hn_stat_pkts = 0;
	txr->hn_stat_mcasts = 0;

	return (error);
}
/*
 * Append the specified data to the indicated mbuf chain.
 * Extend the mbuf chain if the new data does not fit in
 * existing space.
 *
 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
 * There should be an equivalent in the kernel mbuf code,
 * but there does not appear to be one yet.
 *
 * Differs from m_append() in that additional mbufs are
 * allocated with cluster size MJUMPAGESIZE, and filled
 * accordingly.
 *
 * Return 1 if able to complete the job; otherwise 0.
 */
static int
hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
{
	struct mbuf *m, *n;
	int remainder, space;

	for (m = m0; m->m_next != NULL; m = m->m_next)
		;
	remainder = len;
	space = M_TRAILINGSPACE(m);
	if (space > 0) {
		/*
		 * Copy into available space.
		 */
		if (space > remainder)
			space = remainder;
		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
		m->m_len += space;
		cp += space;
		remainder -= space;
	}
	while (remainder > 0) {
		/*
		 * Allocate a new mbuf; could check space
		 * and allocate a cluster instead.
		 */
		n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
		if (n == NULL)
			break;
		n->m_len = min(MJUMPAGESIZE, remainder);
		bcopy(cp, mtod(n, caddr_t), n->m_len);
		cp += n->m_len;
		remainder -= n->m_len;
		m->m_next = n;
		m = n;
	}
	if (m0->m_flags & M_PKTHDR)
		m0->m_pkthdr.len += len - remainder;

	return (remainder == 0);
}
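
/*
 * hv_m_append() is used by hn_rxpkt() below to copy host RX data
 * into an mbuf chain backed by MJUMPAGESIZE clusters.
 */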
#if defined(INET) || defined(INET6)
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
	if (hn_lro_mbufq_depth) {
		tcp_lro_queue_mbuf(lc, m);
		return (0);
	}
#endif
	return (tcp_lro_rx(lc, m, 0));
}
#endif
2057 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2058 const struct hn_rxinfo *info)
2060 struct ifnet *ifp = rxr->hn_ifp;
2062 int size, do_lro = 0, do_csum = 1;
2063 int hash_type = M_HASHTYPE_OPAQUE;
2065 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
2069 * Bail out if packet contains more data than configured MTU.
2071 if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
2073 } else if (dlen <= MHLEN) {
2074 m_new = m_gethdr(M_NOWAIT, MT_DATA);
2075 if (m_new == NULL) {
2076 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2079 memcpy(mtod(m_new, void *), data, dlen);
2080 m_new->m_pkthdr.len = m_new->m_len = dlen;
2081 rxr->hn_small_pkts++;
2084 * Get an mbuf with a cluster. For packets 2K or less,
2085 * get a standard 2K cluster. For anything larger, get a
2086 * 4K cluster. Any buffers larger than 4K can cause problems
2087 * if looped around to the Hyper-V TX channel, so avoid them.
2090 if (dlen > MCLBYTES) {
2092 size = MJUMPAGESIZE;
2095 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2096 if (m_new == NULL) {
2097 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2101 hv_m_append(m_new, dlen, data);
2103 m_new->m_pkthdr.rcvif = ifp;
2105 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2108 /* receive side checksum offload */
2109 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2110 /* IP csum offload */
2111 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2112 m_new->m_pkthdr.csum_flags |=
2113 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2117 /* TCP/UDP csum offload */
2118 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2119 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2120 m_new->m_pkthdr.csum_flags |=
2121 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2122 m_new->m_pkthdr.csum_data = 0xffff;
2123 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2131 * As of this write (Oct 28th, 2016), host side will turn
2132 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2133 * the do_lro setting here is actually _not_ accurate. We
2134 * depend on the RSS hash type check to reset do_lro.
2136 if ((info->csum_info &
2137 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2138 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2141 const struct ether_header *eh;
2146 if (m_new->m_len < hoff)
2148 eh = mtod(m_new, struct ether_header *);
2149 etype = ntohs(eh->ether_type);
2150 if (etype == ETHERTYPE_VLAN) {
2151 const struct ether_vlan_header *evl;
2153 hoff = sizeof(*evl);
2154 if (m_new->m_len < hoff)
2156 evl = mtod(m_new, struct ether_vlan_header *);
2157 etype = ntohs(evl->evl_proto);
2160 if (etype == ETHERTYPE_IP) {
2163 pr = hn_check_iplen(m_new, hoff);
2164 if (pr == IPPROTO_TCP) {
2166 (rxr->hn_trust_hcsum &
2167 HN_TRUST_HCSUM_TCP)) {
2168 rxr->hn_csum_trusted++;
2169 m_new->m_pkthdr.csum_flags |=
2170 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2171 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2172 m_new->m_pkthdr.csum_data = 0xffff;
2175 } else if (pr == IPPROTO_UDP) {
2177 (rxr->hn_trust_hcsum &
2178 HN_TRUST_HCSUM_UDP)) {
2179 rxr->hn_csum_trusted++;
2180 m_new->m_pkthdr.csum_flags |=
2181 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2182 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2183 m_new->m_pkthdr.csum_data = 0xffff;
2185 } else if (pr != IPPROTO_DONE && do_csum &&
2186 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2187 rxr->hn_csum_trusted++;
2188 m_new->m_pkthdr.csum_flags |=
2189 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2194 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2195 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2196 NDIS_VLAN_INFO_ID(info->vlan_info),
2197 NDIS_VLAN_INFO_PRI(info->vlan_info),
2198 NDIS_VLAN_INFO_CFI(info->vlan_info));
2199 m_new->m_flags |= M_VLANTAG;
2202 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2204 m_new->m_pkthdr.flowid = info->hash_value;
2205 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2206 NDIS_HASH_FUNCTION_TOEPLITZ) {
2207 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2211			 * do_lro is reset if the hash type is not TCP
2212			 * related.  See the comment in the csum_flags setup above.
2216 case NDIS_HASH_IPV4:
2217 hash_type = M_HASHTYPE_RSS_IPV4;
2221 case NDIS_HASH_TCP_IPV4:
2222 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2225 case NDIS_HASH_IPV6:
2226 hash_type = M_HASHTYPE_RSS_IPV6;
2230 case NDIS_HASH_IPV6_EX:
2231 hash_type = M_HASHTYPE_RSS_IPV6_EX;
2235 case NDIS_HASH_TCP_IPV6:
2236 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2239 case NDIS_HASH_TCP_IPV6_EX:
2240 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2245 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2247 M_HASHTYPE_SET(m_new, hash_type);
2250 * Note: Moved RX completion back to hv_nv_on_receive() so all
2251 * messages (not just data messages) will trigger a response.
2257 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2258 #if defined(INET) || defined(INET6)
2259 struct lro_ctrl *lro = &rxr->hn_lro;
2262 rxr->hn_lro_tried++;
2263 if (hn_lro_rx(lro, m_new) == 0) {
2271 /* We're not holding the lock here, so don't release it */
2272 (*ifp->if_input)(ifp, m_new);
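/*
 * The LRO attempt above is gated on IFCAP_LRO, so it can be disabled
 * administratively at runtime, e.g. "ifconfig hn0 -lro" (assuming
 * unit 0); packets then go straight to if_input().
 */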
2278 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2280 struct hn_softc *sc = ifp->if_softc;
2281 struct ifreq *ifr = (struct ifreq *)data;
2282 int mask, error = 0;
2286 if (ifr->ifr_mtu > HN_MTU_MAX) {
2293 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2298 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2299 /* Can't change MTU */
2305 if (ifp->if_mtu == ifr->ifr_mtu) {
2311		 * Suspend this interface before the synthetic parts are detached.
2317		 * Detach the synthetic parts, i.e. NVS and RNDIS.
2319 hn_synth_detach(sc);
2322 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2323 * with the new MTU setting.
2325 error = hn_synth_attach(sc, ifr->ifr_mtu);
2332 * Commit the requested MTU, after the synthetic parts
2333 * have been successfully attached.
2335 ifp->if_mtu = ifr->ifr_mtu;
2338 * Make sure that various parameters based on MTU are
2339 * still valid, after the MTU change.
2341 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2342 hn_set_chim_size(sc, sc->hn_chim_szmax);
2343 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2344 #if __FreeBSD_version >= 1100099
2345 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2346 HN_LRO_LENLIM_MIN(ifp))
2347 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2351 * All done! Resume the interface now.
2361 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2366 if (ifp->if_flags & IFF_UP) {
2367 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2368 hn_set_rxfilter(sc);
2372 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2375 sc->hn_if_flags = ifp->if_flags;
2382 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2384 if (mask & IFCAP_TXCSUM) {
2385 ifp->if_capenable ^= IFCAP_TXCSUM;
2386 if (ifp->if_capenable & IFCAP_TXCSUM)
2387 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2389 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2391 if (mask & IFCAP_TXCSUM_IPV6) {
2392 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2393 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2394 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2396 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2399 /* TODO: flip RNDIS offload parameters for RXCSUM. */
2400 if (mask & IFCAP_RXCSUM)
2401 ifp->if_capenable ^= IFCAP_RXCSUM;
2403		/* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2404 if (mask & IFCAP_RXCSUM_IPV6)
2405 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2408 if (mask & IFCAP_LRO)
2409 ifp->if_capenable ^= IFCAP_LRO;
2411 if (mask & IFCAP_TSO4) {
2412 ifp->if_capenable ^= IFCAP_TSO4;
2413 if (ifp->if_capenable & IFCAP_TSO4)
2414 ifp->if_hwassist |= CSUM_IP_TSO;
2416 ifp->if_hwassist &= ~CSUM_IP_TSO;
2418 if (mask & IFCAP_TSO6) {
2419 ifp->if_capenable ^= IFCAP_TSO6;
2420 if (ifp->if_capenable & IFCAP_TSO6)
2421 ifp->if_hwassist |= CSUM_IP6_TSO;
2423 ifp->if_hwassist &= ~CSUM_IP6_TSO;
2434		 * Multicast list updates hold a mutex, while RNDIS RX
2435		 * filter setting sleeps.  We work around this by always
2436		 * enabling ALLMULTI.  ALLMULTI would effectively always
2437		 * be on, even if we supported SIOCADDMULTI/SIOCDELMULTI,
2438		 * since we don't support multicast address list configuration
2443 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2447 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2448 hn_set_rxfilter(sc);
2456 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2460 error = ether_ioctl(ifp, cmd, data);
2467 hn_stop(struct hn_softc *sc)
2469 struct ifnet *ifp = sc->hn_ifp;
2474 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2475 ("synthetic parts were not attached"));
2477 /* Clear RUNNING bit _before_ hn_suspend_data() */
2478 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2479 hn_suspend_data(sc);
2481 /* Clear OACTIVE bit. */
2482 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2483 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2484 sc->hn_tx_ring[i].hn_oactive = 0;
2488 hn_init_locked(struct hn_softc *sc)
2490 struct ifnet *ifp = sc->hn_ifp;
2495 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2498 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2501 /* Configure RX filter */
2502 hn_set_rxfilter(sc);
2504 /* Clear OACTIVE bit. */
2505 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2506 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2507 sc->hn_tx_ring[i].hn_oactive = 0;
2509 /* Clear TX 'suspended' bit. */
2510 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2512 /* Everything is ready; unleash! */
2513 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2519 struct hn_softc *sc = xsc;
2526 #if __FreeBSD_version >= 1100099
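/*
 * The sysctl handlers below share one read-modify-write shape: load
 * the current value from ring 0, pass it to sysctl_handle_*(), return
 * early on error or when req->newptr == NULL (a read-only request),
 * and otherwise validate the new value and propagate it to every ring
 * in use.
 */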
2529 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2531 struct hn_softc *sc = arg1;
2532 unsigned int lenlim;
2535 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2536 error = sysctl_handle_int(oidp, &lenlim, 0, req);
2537 if (error || req->newptr == NULL)
2541 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2542 lenlim > TCP_LRO_LENGTH_MAX) {
2546 hn_set_lro_lenlim(sc, lenlim);
2553 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2555 struct hn_softc *sc = arg1;
2556 int ackcnt, error, i;
2559	 * lro_ackcnt_lim is the append count limit;
2560	 * +1 turns it into the aggregation limit.
2562 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2563 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2564 if (error || req->newptr == NULL)
2567 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2571	 * Convert the aggregation limit back to the append count limit.
2576 for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
2577 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2585 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2587 struct hn_softc *sc = arg1;
2592 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2595 error = sysctl_handle_int(oidp, &on, 0, req);
2596 if (error || req->newptr == NULL)
2600 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2601 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2604 rxr->hn_trust_hcsum |= hcsum;
2606 rxr->hn_trust_hcsum &= ~hcsum;
2613 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2615 struct hn_softc *sc = arg1;
2616 int chim_size, error;
2618 chim_size = sc->hn_tx_ring[0].hn_chim_size;
2619 error = sysctl_handle_int(oidp, &chim_size, 0, req);
2620 if (error || req->newptr == NULL)
2623 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2627 hn_set_chim_size(sc, chim_size);
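/*
 * "chim" is the Hyper-V chimney send buffer: packets no larger than
 * hn_chim_size are copied into a slot of the preallocated,
 * host-visible TXBUF instead of being mapped page-by-page for the
 * host.  hn_chim_szmax is the ceiling negotiated with the host; the
 * handler above only allows tuning the cutoff below that ceiling.
 */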
2632 #if __FreeBSD_version < 1100095
2634 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2636 struct hn_softc *sc = arg1;
2637 int ofs = arg2, i, error;
2638 struct hn_rx_ring *rxr;
2642 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2643 rxr = &sc->hn_rx_ring[i];
2644 stat += *((int *)((uint8_t *)rxr + ofs));
2647 error = sysctl_handle_64(oidp, &stat, 0, req);
2648 if (error || req->newptr == NULL)
2651 /* Zero out this stat. */
2652 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2653 rxr = &sc->hn_rx_ring[i];
2654 *((int *)((uint8_t *)rxr + ofs)) = 0;
2660 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2662 struct hn_softc *sc = arg1;
2663 int ofs = arg2, i, error;
2664 struct hn_rx_ring *rxr;
2668 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2669 rxr = &sc->hn_rx_ring[i];
2670 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2673 error = sysctl_handle_64(oidp, &stat, 0, req);
2674 if (error || req->newptr == NULL)
2677 /* Zero out this stat. */
2678 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2679 rxr = &sc->hn_rx_ring[i];
2680 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2688 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2690 struct hn_softc *sc = arg1;
2691 int ofs = arg2, i, error;
2692 struct hn_rx_ring *rxr;
2696 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2697 rxr = &sc->hn_rx_ring[i];
2698 stat += *((u_long *)((uint8_t *)rxr + ofs));
2701 error = sysctl_handle_long(oidp, &stat, 0, req);
2702 if (error || req->newptr == NULL)
2705 /* Zero out this stat. */
2706 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2707 rxr = &sc->hn_rx_ring[i];
2708 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
2714 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2716 struct hn_softc *sc = arg1;
2717 int ofs = arg2, i, error;
2718 struct hn_tx_ring *txr;
2722 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2723 txr = &sc->hn_tx_ring[i];
2724 stat += *((u_long *)((uint8_t *)txr + ofs));
2727 error = sysctl_handle_long(oidp, &stat, 0, req);
2728 if (error || req->newptr == NULL)
2731 /* Zero out this stat. */
2732 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2733 txr = &sc->hn_tx_ring[i];
2734 *((u_long *)((uint8_t *)txr + ofs)) = 0;
2740 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2742 struct hn_softc *sc = arg1;
2743 int ofs = arg2, i, error, conf;
2744 struct hn_tx_ring *txr;
2746 txr = &sc->hn_tx_ring[0];
2747 conf = *((int *)((uint8_t *)txr + ofs));
2749 error = sysctl_handle_int(oidp, &conf, 0, req);
2750 if (error || req->newptr == NULL)
2754 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2755 txr = &sc->hn_tx_ring[i];
2756 *((int *)((uint8_t *)txr + ofs)) = conf;
2764 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2766 struct hn_softc *sc = arg1;
2769 size = sc->hn_agg_size;
2770 error = sysctl_handle_int(oidp, &size, 0, req);
2771 if (error || req->newptr == NULL)
2775 sc->hn_agg_size = size;
2783 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
2785 struct hn_softc *sc = arg1;
2788 pkts = sc->hn_agg_pkts;
2789 error = sysctl_handle_int(oidp, &pkts, 0, req);
2790 if (error || req->newptr == NULL)
2794 sc->hn_agg_pkts = pkts;
2802 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
2804 struct hn_softc *sc = arg1;
2807 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
2808 return (sysctl_handle_int(oidp, &pkts, 0, req));
2812 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
2814 struct hn_softc *sc = arg1;
2817 align = sc->hn_tx_ring[0].hn_agg_align;
2818 return (sysctl_handle_int(oidp, &align, 0, req));
2822 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2824 struct hn_softc *sc = arg1;
2827 snprintf(verstr, sizeof(verstr), "%u.%u",
2828 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2829 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2830 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2834 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2836 struct hn_softc *sc = arg1;
2843 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2844 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2848 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2850 struct hn_softc *sc = arg1;
2851 char assist_str[128];
2855 hwassist = sc->hn_ifp->if_hwassist;
2857 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2858 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2862 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2864 struct hn_softc *sc = arg1;
2865 char filter_str[128];
2869 filter = sc->hn_rx_filter;
2871 snprintf(filter_str, sizeof(filter_str), "%b", filter,
2873 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
2877 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2879 struct hn_softc *sc = arg1;
2884 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2885 if (error || req->newptr == NULL)
2888 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2891 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
2893 if (sc->hn_rx_ring_inuse > 1) {
2894 error = hn_rss_reconfig(sc);
2896 /* Not RSS capable, at least for now; just save the RSS key. */
2905 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
2907 struct hn_softc *sc = arg1;
2912 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2913 if (error || req->newptr == NULL)
2917	 * Don't allow RSS indirect table changes if this interface is
2918	 * not currently RSS capable.
2920 if (sc->hn_rx_ring_inuse == 1) {
2925 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2928 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
2930 hn_rss_ind_fixup(sc, sc->hn_rx_ring_inuse);
2931 error = hn_rss_reconfig(sc);
2938 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
2940 struct hn_softc *sc = arg1;
2945 hash = sc->hn_rss_hash;
2947 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
2948 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
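/*
 * hn_check_iplen() vets an IPv4 packet before host checksum results
 * are trusted or LRO is attempted: the IP and TCP/UDP headers must be
 * contiguous in the first mbuf, the stated lengths must be consistent
 * with the mbuf chain, and fragments are rejected.  It returns the IP
 * protocol on success, IPPROTO_DONE otherwise.
 */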
2952 hn_check_iplen(const struct mbuf *m, int hoff)
2954 const struct ip *ip;
2955 int len, iphlen, iplen;
2956 const struct tcphdr *th;
2957 int thoff; /* TCP data offset */
2959 len = hoff + sizeof(struct ip);
2961 /* The packet must be at least the size of an IP header. */
2962 if (m->m_pkthdr.len < len)
2963 return IPPROTO_DONE;
2965 /* The fixed IP header must reside completely in the first mbuf. */
2967 return IPPROTO_DONE;
2969 ip = mtodo(m, hoff);
2971 /* Bound check the packet's stated IP header length. */
2972 iphlen = ip->ip_hl << 2;
2973 if (iphlen < sizeof(struct ip)) /* minimum header length */
2974 return IPPROTO_DONE;
2976	 * The full IP header must reside completely in the first mbuf.
2977 if (m->m_len < hoff + iphlen)
2978 return IPPROTO_DONE;
2980 iplen = ntohs(ip->ip_len);
2983	 * Check that the amount of data in the buffers is at
2984	 * least as much as the IP header would have us expect.
2986 if (m->m_pkthdr.len < hoff + iplen)
2987 return IPPROTO_DONE;
2990 * Ignore IP fragments.
2992 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
2993 return IPPROTO_DONE;
2996 * The TCP/IP or UDP/IP header must be entirely contained within
2997 * the first fragment of a packet.
3001 if (iplen < iphlen + sizeof(struct tcphdr))
3002 return IPPROTO_DONE;
3003 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3004 return IPPROTO_DONE;
3005 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3006 thoff = th->th_off << 2;
3007 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3008 return IPPROTO_DONE;
3009 if (m->m_len < hoff + iphlen + thoff)
3010 return IPPROTO_DONE;
3013 if (iplen < iphlen + sizeof(struct udphdr))
3014 return IPPROTO_DONE;
3015 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3016 return IPPROTO_DONE;
3020 return IPPROTO_DONE;
3027 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3029 struct sysctl_oid_list *child;
3030 struct sysctl_ctx_list *ctx;
3031 device_t dev = sc->hn_dev;
3032 #if defined(INET) || defined(INET6)
3033 #if __FreeBSD_version >= 1100095
3040 * Create RXBUF for reception.
3043 * - It is shared by all channels.
3044	 * - A large enough buffer is allocated; certain versions of NVS
3045	 *   may further limit the usable space.
3047 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3048 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3049 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3050 if (sc->hn_rxbuf == NULL) {
3051 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3055 sc->hn_rx_ring_cnt = ring_cnt;
3056 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3058 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3059 M_DEVBUF, M_WAITOK | M_ZERO);
3061 #if defined(INET) || defined(INET6)
3062 #if __FreeBSD_version >= 1100095
3063 lroent_cnt = hn_lro_entry_count;
3064 if (lroent_cnt < TCP_LRO_ENTRIES)
3065 lroent_cnt = TCP_LRO_ENTRIES;
3067 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3069 #endif /* INET || INET6 */
3071 ctx = device_get_sysctl_ctx(dev);
3072 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3074 /* Create dev.hn.UNIT.rx sysctl tree */
3075 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3076 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3078 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3079 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3081 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3082 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3083 &rxr->hn_br_dma, BUS_DMA_WAITOK);
3084 if (rxr->hn_br == NULL) {
3085 device_printf(dev, "allocate bufring failed\n");
3089 if (hn_trust_hosttcp)
3090 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3091 if (hn_trust_hostudp)
3092 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3093 if (hn_trust_hostip)
3094 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3095 rxr->hn_ifp = sc->hn_ifp;
3096 if (i < sc->hn_tx_ring_cnt)
3097 rxr->hn_txr = &sc->hn_tx_ring[i];
3098 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3099 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3101 rxr->hn_rxbuf = sc->hn_rxbuf;
3106 #if defined(INET) || defined(INET6)
3107 #if __FreeBSD_version >= 1100095
3108 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3109 hn_lro_mbufq_depth);
3111 tcp_lro_init(&rxr->hn_lro);
3112 rxr->hn_lro.ifp = sc->hn_ifp;
3114 #if __FreeBSD_version >= 1100099
3115 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3116 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3118 #endif /* INET || INET6 */
3120 if (sc->hn_rx_sysctl_tree != NULL) {
3124 * Create per RX ring sysctl tree:
3125 * dev.hn.UNIT.rx.RINGID
3127 snprintf(name, sizeof(name), "%d", i);
3128 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3129 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3130 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3132 if (rxr->hn_rx_sysctl_tree != NULL) {
3133 SYSCTL_ADD_ULONG(ctx,
3134 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3135 OID_AUTO, "packets", CTLFLAG_RW,
3136 &rxr->hn_pkts, "# of packets received");
3137 SYSCTL_ADD_ULONG(ctx,
3138 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3139 OID_AUTO, "rss_pkts", CTLFLAG_RW,
3141 "# of packets w/ RSS info received");
3143 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3144 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3145 &rxr->hn_pktbuf_len, 0,
3146 "Temporary channel packet buffer length");
3151 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3152 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3153 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3154 #if __FreeBSD_version < 1100095
3155 hn_rx_stat_int_sysctl,
3157 hn_rx_stat_u64_sysctl,
3159 "LU", "LRO queued");
3160 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3161 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3162 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3163 #if __FreeBSD_version < 1100095
3164 hn_rx_stat_int_sysctl,
3166 hn_rx_stat_u64_sysctl,
3168 "LU", "LRO flushed");
3169 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3170 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3171 __offsetof(struct hn_rx_ring, hn_lro_tried),
3172 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3173 #if __FreeBSD_version >= 1100099
3174 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3175 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3176 hn_lro_lenlim_sysctl, "IU",
3177 "Max # of data bytes to be aggregated by LRO");
3178 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3179 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3180 hn_lro_ackcnt_sysctl, "I",
3181 "Max # of ACKs to be aggregated by LRO");
3183 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3184 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3185 hn_trust_hcsum_sysctl, "I",
3186 "Trust tcp segement verification on host side, "
3187 "when csum info is missing");
3188 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3189 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3190 hn_trust_hcsum_sysctl, "I",
3191 "Trust udp datagram verification on host side, "
3192 "when csum info is missing");
3193 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3194 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3195 hn_trust_hcsum_sysctl, "I",
3196 "Trust ip packet verification on host side, "
3197 "when csum info is missing");
3198 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3199 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3200 __offsetof(struct hn_rx_ring, hn_csum_ip),
3201 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3202 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3203 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3204 __offsetof(struct hn_rx_ring, hn_csum_tcp),
3205 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3206 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3207 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3208 __offsetof(struct hn_rx_ring, hn_csum_udp),
3209 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3210 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3211 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3212 __offsetof(struct hn_rx_ring, hn_csum_trusted),
3213 hn_rx_stat_ulong_sysctl, "LU",
3214 "# of packets that we trust host's csum verification");
3215 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3216 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3217 __offsetof(struct hn_rx_ring, hn_small_pkts),
3218 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3219 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3220 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3221 __offsetof(struct hn_rx_ring, hn_ack_failed),
3222 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3223 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3224 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3225 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3226 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3232 hn_destroy_rx_data(struct hn_softc *sc)
3236 if (sc->hn_rxbuf != NULL) {
3237 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3238 sc->hn_rxbuf = NULL;
3241 if (sc->hn_rx_ring_cnt == 0)
3244 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3245 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3247 if (rxr->hn_br == NULL)
3249 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3252 #if defined(INET) || defined(INET6)
3253 tcp_lro_free(&rxr->hn_lro);
3255 free(rxr->hn_pktbuf, M_DEVBUF);
3257 free(sc->hn_rx_ring, M_DEVBUF);
3258 sc->hn_rx_ring = NULL;
3260 sc->hn_rx_ring_cnt = 0;
3261 sc->hn_rx_ring_inuse = 0;
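/*
 * Each TX descriptor created by hn_tx_ring_create() below owns a
 * DMA-loaded RNDIS packet message plus a separate dmamap for the mbuf
 * data, and idles on either a buf_ring or an SLIST, depending on
 * HN_USE_TXDESC_BUFRING.
 */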
3265 hn_tx_ring_create(struct hn_softc *sc, int id)
3267 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3268 device_t dev = sc->hn_dev;
3269 bus_dma_tag_t parent_dtag;
3273 txr->hn_tx_idx = id;
3275 #ifndef HN_USE_TXDESC_BUFRING
3276 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3278 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3280 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3281 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3282 M_DEVBUF, M_WAITOK | M_ZERO);
3283 #ifndef HN_USE_TXDESC_BUFRING
3284 SLIST_INIT(&txr->hn_txlist);
3286 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3287 M_WAITOK, &txr->hn_tx_lock);
3290 txr->hn_tx_taskq = sc->hn_tx_taskq;
3292 #ifdef HN_IFSTART_SUPPORT
3293 if (hn_use_if_start) {
3294 txr->hn_txeof = hn_start_txeof;
3295 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3296 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3302 txr->hn_txeof = hn_xmit_txeof;
3303 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3304 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3306 br_depth = hn_get_txswq_depth(txr);
3307 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3308 M_WAITOK, &txr->hn_tx_lock);
3311 txr->hn_direct_tx_size = hn_direct_tx_size;
3314 * Always schedule transmission instead of trying to do direct
3315 * transmission. This one gives the best performance so far.
3317 txr->hn_sched_tx = 1;
3319 parent_dtag = bus_get_dma_tag(dev);
3321 /* DMA tag for RNDIS packet messages. */
3322 error = bus_dma_tag_create(parent_dtag, /* parent */
3323 HN_RNDIS_PKT_ALIGN, /* alignment */
3324 HN_RNDIS_PKT_BOUNDARY, /* boundary */
3325 BUS_SPACE_MAXADDR, /* lowaddr */
3326 BUS_SPACE_MAXADDR, /* highaddr */
3327 NULL, NULL, /* filter, filterarg */
3328 HN_RNDIS_PKT_LEN, /* maxsize */
3330 HN_RNDIS_PKT_LEN, /* maxsegsize */
3332 NULL, /* lockfunc */
3333 NULL, /* lockfuncarg */
3334 &txr->hn_tx_rndis_dtag);
3336 device_printf(dev, "failed to create rndis dmatag\n");
3340 /* DMA tag for data. */
3341 error = bus_dma_tag_create(parent_dtag, /* parent */
3343 HN_TX_DATA_BOUNDARY, /* boundary */
3344 BUS_SPACE_MAXADDR, /* lowaddr */
3345 BUS_SPACE_MAXADDR, /* highaddr */
3346 NULL, NULL, /* filter, filterarg */
3347 HN_TX_DATA_MAXSIZE, /* maxsize */
3348 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
3349 HN_TX_DATA_SEGSIZE, /* maxsegsize */
3351 NULL, /* lockfunc */
3352 NULL, /* lockfuncarg */
3353 &txr->hn_tx_data_dtag);
3355 device_printf(dev, "failed to create data dmatag\n");
3359 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3360 struct hn_txdesc *txd = &txr->hn_txdesc[i];
3363 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3364 STAILQ_INIT(&txd->agg_list);
3367 * Allocate and load RNDIS packet message.
3369 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3370 (void **)&txd->rndis_pkt,
3371 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3372 &txd->rndis_pkt_dmap);
3375 "failed to allocate rndis_packet_msg, %d\n", i);
3379 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3380 txd->rndis_pkt_dmap,
3381 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3382 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3386 "failed to load rndis_packet_msg, %d\n", i);
3387 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3388 txd->rndis_pkt, txd->rndis_pkt_dmap);
3392 /* DMA map for TX data. */
3393 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3397 "failed to allocate tx data dmamap\n");
3398 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3399 txd->rndis_pkt_dmap);
3400 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3401 txd->rndis_pkt, txd->rndis_pkt_dmap);
3405 /* All set, put it to list */
3406 txd->flags |= HN_TXD_FLAG_ONLIST;
3407 #ifndef HN_USE_TXDESC_BUFRING
3408 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3410 buf_ring_enqueue(txr->hn_txdesc_br, txd);
3413 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3415 if (sc->hn_tx_sysctl_tree != NULL) {
3416 struct sysctl_oid_list *child;
3417 struct sysctl_ctx_list *ctx;
3421 * Create per TX ring sysctl tree:
3422 * dev.hn.UNIT.tx.RINGID
3424 ctx = device_get_sysctl_ctx(dev);
3425 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3427 snprintf(name, sizeof(name), "%d", id);
3428 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3429 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3431 if (txr->hn_tx_sysctl_tree != NULL) {
3432 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3434 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3435 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3436 "# of available TX descs");
3437 #ifdef HN_IFSTART_SUPPORT
3438 if (!hn_use_if_start)
3441 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3442 CTLFLAG_RD, &txr->hn_oactive, 0,
3445 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3446 CTLFLAG_RW, &txr->hn_pkts,
3447 "# of packets transmitted");
3448 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3449 CTLFLAG_RW, &txr->hn_sends, "# of sends");
3457 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3459 struct hn_tx_ring *txr = txd->txr;
3461 KASSERT(txd->m == NULL, ("still has mbuf installed"));
3462 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3464 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3465 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3466 txd->rndis_pkt_dmap);
3467 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3471 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3473 struct hn_txdesc *txd;
3475 if (txr->hn_txdesc == NULL)
3478 #ifndef HN_USE_TXDESC_BUFRING
3479 while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) {
3480 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
3481 hn_txdesc_dmamap_destroy(txd);
3484 mtx_lock(&txr->hn_tx_lock);
3485 while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL)
3486 hn_txdesc_dmamap_destroy(txd);
3487 mtx_unlock(&txr->hn_tx_lock);
3490 if (txr->hn_tx_data_dtag != NULL)
3491 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3492 if (txr->hn_tx_rndis_dtag != NULL)
3493 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3495 #ifdef HN_USE_TXDESC_BUFRING
3496 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3499 free(txr->hn_txdesc, M_DEVBUF);
3500 txr->hn_txdesc = NULL;
3502 if (txr->hn_mbuf_br != NULL)
3503 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3505 #ifndef HN_USE_TXDESC_BUFRING
3506 mtx_destroy(&txr->hn_txlist_spin);
3508 mtx_destroy(&txr->hn_tx_lock);
3512 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3514 struct sysctl_oid_list *child;
3515 struct sysctl_ctx_list *ctx;
3519 * Create TXBUF for chimney sending.
3521 * NOTE: It is shared by all channels.
3523 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3524 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3525 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3526 if (sc->hn_chim == NULL) {
3527 device_printf(sc->hn_dev, "allocate txbuf failed\n");
3531 sc->hn_tx_ring_cnt = ring_cnt;
3532 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3534 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3535 M_DEVBUF, M_WAITOK | M_ZERO);
3537 ctx = device_get_sysctl_ctx(sc->hn_dev);
3538 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3540 /* Create dev.hn.UNIT.tx sysctl tree */
3541 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3542 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3544 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3547 error = hn_tx_ring_create(sc, i);
3552 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3553 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3554 __offsetof(struct hn_tx_ring, hn_no_txdescs),
3555 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3556 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3557 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3558 __offsetof(struct hn_tx_ring, hn_send_failed),
3559	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
3560 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3561 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3562 __offsetof(struct hn_tx_ring, hn_txdma_failed),
3563	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
3564 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3565 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3566 __offsetof(struct hn_tx_ring, hn_flush_failed),
3567 hn_tx_stat_ulong_sysctl, "LU",
3568 "# of packet transmission aggregation flush failure");
3569 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3570 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3571 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3572 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3573 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3574 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3575 __offsetof(struct hn_tx_ring, hn_tx_chimney),
3576 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3577 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3578 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3579 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3580 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3581 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3582 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3583 "# of total TX descs");
3584 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3585 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3586 "Chimney send packet size upper boundary");
3587 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3588 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3589 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3590 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3591 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3592 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3593 hn_tx_conf_int_sysctl, "I",
3594 "Size of the packet for direct transmission");
3595 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3596 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3597 __offsetof(struct hn_tx_ring, hn_sched_tx),
3598 hn_tx_conf_int_sysctl, "I",
3599 "Always schedule transmission "
3600 "instead of doing direct transmission");
3601 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3602 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3603 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3604 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3605 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3606 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3607 "Applied packet transmission aggregation size");
3608 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3609 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3610 hn_txagg_pktmax_sysctl, "I",
3611 "Applied packet transmission aggregation packets");
3612 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3613 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3614 hn_txagg_align_sysctl, "I",
3615 "Applied packet transmission aggregation alignment");
3621 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3625 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3626 sc->hn_tx_ring[i].hn_chim_size = chim_size;
3630 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3632 struct ifnet *ifp = sc->hn_ifp;
3635 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3638 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3639 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3640 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3642 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3643 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3644 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3646 if (tso_maxlen < tso_minlen)
3647 tso_maxlen = tso_minlen;
3648 else if (tso_maxlen > IP_MAXPACKET)
3649 tso_maxlen = IP_MAXPACKET;
3650 if (tso_maxlen > sc->hn_ndis_tso_szmax)
3651 tso_maxlen = sc->hn_ndis_tso_szmax;
3652 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3654 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
3658 hn_fixup_tx_data(struct hn_softc *sc)
3660 uint64_t csum_assist;
3663 hn_set_chim_size(sc, sc->hn_chim_szmax);
3664 if (hn_tx_chimney_size > 0 &&
3665 hn_tx_chimney_size < sc->hn_chim_szmax)
3666 hn_set_chim_size(sc, hn_tx_chimney_size);
3669 if (sc->hn_caps & HN_CAP_IPCS)
3670 csum_assist |= CSUM_IP;
3671 if (sc->hn_caps & HN_CAP_TCP4CS)
3672 csum_assist |= CSUM_IP_TCP;
3673 if (sc->hn_caps & HN_CAP_UDP4CS)
3674 csum_assist |= CSUM_IP_UDP;
3676 if (sc->hn_caps & HN_CAP_TCP6CS)
3677 csum_assist |= CSUM_IP6_TCP;
3678 if (sc->hn_caps & HN_CAP_UDP6CS)
3679 csum_assist |= CSUM_IP6_UDP;
3681 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3682 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3684 if (sc->hn_caps & HN_CAP_HASHVAL) {
3686 * Support HASHVAL pktinfo on TX path.
3689 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3690 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3691 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3696 hn_destroy_tx_data(struct hn_softc *sc)
3700 if (sc->hn_chim != NULL) {
3701 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3705 if (sc->hn_tx_ring_cnt == 0)
3708 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3709 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3711 free(sc->hn_tx_ring, M_DEVBUF);
3712 sc->hn_tx_ring = NULL;
3714 sc->hn_tx_ring_cnt = 0;
3715 sc->hn_tx_ring_inuse = 0;
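/*
 * Two transmit models follow: the legacy if_start path
 * (HN_IFSTART_SUPPORT), which funnels everything through the single
 * if_snd queue of TX ring 0, and the default if_transmit path, which
 * spreads mbufs across per-ring drbr queues.
 */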
3718 #ifdef HN_IFSTART_SUPPORT
3721 hn_start_taskfunc(void *xtxr, int pending __unused)
3723 struct hn_tx_ring *txr = xtxr;
3725 mtx_lock(&txr->hn_tx_lock);
3726 hn_start_locked(txr, 0);
3727 mtx_unlock(&txr->hn_tx_lock);
3731 hn_start_locked(struct hn_tx_ring *txr, int len)
3733 struct hn_softc *sc = txr->hn_sc;
3734 struct ifnet *ifp = sc->hn_ifp;
3737 KASSERT(hn_use_if_start,
3738 ("hn_start_locked is called, when if_start is disabled"));
3739 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3740 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3741 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3743 if (__predict_false(txr->hn_suspended))
3746 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
3750 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
3751 struct hn_txdesc *txd;
3752 struct mbuf *m_head;
3755 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
3759 if (len > 0 && m_head->m_pkthdr.len > len) {
3761			 * This send could be time consuming; let callers
3762			 * dispatch this packet (and any follow-up packets)
3763			 * to the TX taskqueue.
3765 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3770 #if defined(INET6) || defined(INET)
3771 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3772 m_head = hn_tso_fixup(m_head);
3773 if (__predict_false(m_head == NULL)) {
3774 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3780 txd = hn_txdesc_get(txr);
3782 txr->hn_no_txdescs++;
3783 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3784 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3788 error = hn_encap(ifp, txr, txd, &m_head);
3790 /* Both txd and m_head are freed */
3791 KASSERT(txr->hn_agg_txd == NULL,
3792 ("encap failed w/ pending aggregating txdesc"));
3796 if (txr->hn_agg_pktleft == 0) {
3797 if (txr->hn_agg_txd != NULL) {
3798 KASSERT(m_head == NULL,
3799 ("pending mbuf for aggregating txdesc"));
3800 error = hn_flush_txagg(ifp, txr);
3801 if (__predict_false(error)) {
3802 atomic_set_int(&ifp->if_drv_flags,
3807 KASSERT(m_head != NULL, ("mbuf was freed"));
3808 error = hn_txpkt(ifp, txr, txd);
3809 if (__predict_false(error)) {
3810 /* txd is freed, but m_head is not */
3811 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3812 atomic_set_int(&ifp->if_drv_flags,
3820 KASSERT(txr->hn_agg_txd != NULL,
3821 ("no aggregating txdesc"));
3822 KASSERT(m_head == NULL,
3823 ("pending mbuf for aggregating txdesc"));
3828	/* Flush pending aggregated transmission. */
3829 if (txr->hn_agg_txd != NULL)
3830 hn_flush_txagg(ifp, txr);
3835 hn_start(struct ifnet *ifp)
3837 struct hn_softc *sc = ifp->if_softc;
3838 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
3840 if (txr->hn_sched_tx)
3843 if (mtx_trylock(&txr->hn_tx_lock)) {
3846 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3847 mtx_unlock(&txr->hn_tx_lock);
3852 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
3856 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
3858 struct hn_tx_ring *txr = xtxr;
3860 mtx_lock(&txr->hn_tx_lock);
3861 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
3862 hn_start_locked(txr, 0);
3863 mtx_unlock(&txr->hn_tx_lock);
3867 hn_start_txeof(struct hn_tx_ring *txr)
3869 struct hn_softc *sc = txr->hn_sc;
3870 struct ifnet *ifp = sc->hn_ifp;
3872 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3874 if (txr->hn_sched_tx)
3877 if (mtx_trylock(&txr->hn_tx_lock)) {
3880 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3881 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3882 mtx_unlock(&txr->hn_tx_lock);
3884 taskqueue_enqueue(txr->hn_tx_taskq,
3890		 * Release OACTIVE early, in the hope that others
3891		 * can catch up.  The task will clear the flag again
3892		 * with the hn_tx_lock held, to avoid possible races.
3895 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3896 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3900 #endif /* HN_IFSTART_SUPPORT */
3903 hn_xmit(struct hn_tx_ring *txr, int len)
3905 struct hn_softc *sc = txr->hn_sc;
3906 struct ifnet *ifp = sc->hn_ifp;
3907 struct mbuf *m_head;
3910 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3911 #ifdef HN_IFSTART_SUPPORT
3912 KASSERT(hn_use_if_start == 0,
3913 ("hn_xmit is called, when if_start is enabled"));
3915 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3917 if (__predict_false(txr->hn_suspended))
3920 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
3923 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
3924 struct hn_txdesc *txd;
3927 if (len > 0 && m_head->m_pkthdr.len > len) {
3929			 * This send could be time consuming; let callers
3930			 * dispatch this packet (and any follow-up packets)
3931			 * to the TX taskqueue.
3933 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3938 txd = hn_txdesc_get(txr);
3940 txr->hn_no_txdescs++;
3941 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3942 txr->hn_oactive = 1;
3946 error = hn_encap(ifp, txr, txd, &m_head);
3948 /* Both txd and m_head are freed; discard */
3949 KASSERT(txr->hn_agg_txd == NULL,
3950 ("encap failed w/ pending aggregating txdesc"));
3951 drbr_advance(ifp, txr->hn_mbuf_br);
3955 if (txr->hn_agg_pktleft == 0) {
3956 if (txr->hn_agg_txd != NULL) {
3957 KASSERT(m_head == NULL,
3958 ("pending mbuf for aggregating txdesc"));
3959 error = hn_flush_txagg(ifp, txr);
3960 if (__predict_false(error)) {
3961 txr->hn_oactive = 1;
3965 KASSERT(m_head != NULL, ("mbuf was freed"));
3966 error = hn_txpkt(ifp, txr, txd);
3967 if (__predict_false(error)) {
3968 /* txd is freed, but m_head is not */
3969 drbr_putback(ifp, txr->hn_mbuf_br,
3971 txr->hn_oactive = 1;
3978 KASSERT(txr->hn_agg_txd != NULL,
3979 ("no aggregating txdesc"));
3980 KASSERT(m_head == NULL,
3981 ("pending mbuf for aggregating txdesc"));
3986 drbr_advance(ifp, txr->hn_mbuf_br);
3989	/* Flush pending aggregated transmission. */
3990 if (txr->hn_agg_txd != NULL)
3991 hn_flush_txagg(ifp, txr);
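/*
 * hn_transmit() pins a flow to one ring: with a valid flowid the ring
 * index is simply flowid % hn_tx_ring_inuse, matching the RSS
 * indirection programmed for the RX side.
 */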
3996 hn_transmit(struct ifnet *ifp, struct mbuf *m)
3998 struct hn_softc *sc = ifp->if_softc;
3999 struct hn_tx_ring *txr;
4002 #if defined(INET6) || defined(INET)
4004 * Perform TSO packet header fixup now, since the TSO
4005 * packet header should be cache-hot.
4007 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4008 m = hn_tso_fixup(m);
4009 if (__predict_false(m == NULL)) {
4010 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4017 * Select the TX ring based on flowid
4019 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
4020 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4021 txr = &sc->hn_tx_ring[idx];
4023 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4025 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4029 if (txr->hn_oactive)
4032 if (txr->hn_sched_tx)
4035 if (mtx_trylock(&txr->hn_tx_lock)) {
4038 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4039 mtx_unlock(&txr->hn_tx_lock);
4044 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4049 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4053 mtx_lock(&txr->hn_tx_lock);
4054 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4056 mtx_unlock(&txr->hn_tx_lock);
4060 hn_xmit_qflush(struct ifnet *ifp)
4062 struct hn_softc *sc = ifp->if_softc;
4065 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4066 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4071 hn_xmit_txeof(struct hn_tx_ring *txr)
4074 if (txr->hn_sched_tx)
4077 if (mtx_trylock(&txr->hn_tx_lock)) {
4080 txr->hn_oactive = 0;
4081 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4082 mtx_unlock(&txr->hn_tx_lock);
4084 taskqueue_enqueue(txr->hn_tx_taskq,
4090		 * Release oactive early, in the hope that others
4091		 * can catch up.  The task will clear oactive again
4092		 * with the hn_tx_lock held, to avoid possible races.
4095 txr->hn_oactive = 0;
4096 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4101 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4103 struct hn_tx_ring *txr = xtxr;
4105 mtx_lock(&txr->hn_tx_lock);
4107 mtx_unlock(&txr->hn_tx_lock);
4111 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4113 struct hn_tx_ring *txr = xtxr;
4115 mtx_lock(&txr->hn_tx_lock);
4116 txr->hn_oactive = 0;
4118 mtx_unlock(&txr->hn_tx_lock);
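/*
 * Channel-to-ring wiring: a channel's sub-index selects the RX ring
 * it services (and the TX ring of the same index, when one exists),
 * and that ring's bufring memory is split into a TX half and an RX
 * half before being handed to vmbus_chan_open_br().
 */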
4122 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4124 struct vmbus_chan_br cbr;
4125 struct hn_rx_ring *rxr;
4126 struct hn_tx_ring *txr = NULL;
4129 idx = vmbus_chan_subidx(chan);
4132 * Link this channel to RX/TX ring.
4134 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4135 ("invalid channel index %d, should > 0 && < %d",
4136 idx, sc->hn_rx_ring_inuse));
4137 rxr = &sc->hn_rx_ring[idx];
4138 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4139 ("RX ring %d already attached", idx));
4140 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4143 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4144 idx, vmbus_chan_id(chan));
4147 if (idx < sc->hn_tx_ring_inuse) {
4148 txr = &sc->hn_tx_ring[idx];
4149 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4150 ("TX ring %d already attached", idx));
4151 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4153 txr->hn_chan = chan;
4155 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4156 idx, vmbus_chan_id(chan));
4160 /* Bind this channel to a proper CPU. */
4161 vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus);
4166 cbr.cbr = rxr->hn_br;
4167 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4168 cbr.cbr_txsz = HN_TXBR_SIZE;
4169 cbr.cbr_rxsz = HN_RXBR_SIZE;
4170 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4172 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4173 vmbus_chan_id(chan), error);
4174 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4176 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4182 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4184 struct hn_rx_ring *rxr;
4187 idx = vmbus_chan_subidx(chan);
4190 * Link this channel to RX/TX ring.
4192 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4193 ("invalid channel index %d, should > 0 && < %d",
4194 idx, sc->hn_rx_ring_inuse));
4195 rxr = &sc->hn_rx_ring[idx];
4196 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4197 ("RX ring %d is not attached", idx));
4198 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4200 if (idx < sc->hn_tx_ring_inuse) {
4201 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4203 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4204 ("TX ring %d is not attached attached", idx));
4205 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4209 * Close this channel.
4212 * Channel closing does _not_ destroy the target channel.
4214 vmbus_chan_close(chan);
4218 hn_attach_subchans(struct hn_softc *sc)
4220 struct vmbus_channel **subchans;
4221 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4224 if (subchan_cnt == 0)
4227 /* Attach the sub-channels. */
4228 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4229 for (i = 0; i < subchan_cnt; ++i) {
4230 error = hn_chan_attach(sc, subchans[i]);
4234 vmbus_subchan_rel(subchans, subchan_cnt);
4237 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4240 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4248 hn_detach_allchans(struct hn_softc *sc)
4250 struct vmbus_channel **subchans;
4251 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4254 if (subchan_cnt == 0)
4257 /* Detach the sub-channels. */
4258 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4259 for (i = 0; i < subchan_cnt; ++i)
4260 hn_chan_detach(sc, subchans[i]);
4261 vmbus_subchan_rel(subchans, subchan_cnt);
4265	 * Detach the primary channel, _after_ all sub-channels are detached.
4268 hn_chan_detach(sc, sc->hn_prichan);
4270 /* Wait for sub-channels to be destroyed, if any. */
4271 vmbus_subchan_drain(sc->hn_prichan);
4274 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4275 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4276 HN_RX_FLAG_ATTACHED) == 0,
4277 ("%dth RX ring is still attached", i));
4279 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4280 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4281 HN_TX_FLAG_ATTACHED) == 0,
4282 ("%dth TX ring is still attached", i));
4288 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4290 struct vmbus_channel **subchans;
4291 int nchan, rxr_cnt, error;
4293 nchan = *nsubch + 1;
4296 * Multiple RX/TX rings are not requested.
4303 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4306 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4308 /* No RSS; this is benign. */
4313 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4317 if (nchan > rxr_cnt)
4320 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4326 * Allocate sub-channels from NVS.
4328 *nsubch = nchan - 1;
4329 error = hn_nvs_alloc_subchans(sc, nsubch);
4330 if (error || *nsubch == 0) {
4331 /* Failed to allocate sub-channels. */
4337 * Wait for all sub-channels to become ready before moving on.
4339 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4340 vmbus_subchan_rel(subchans, *nsubch);
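/*
 * hn_synth_attach() order of operations: open the primary channel,
 * attach NVS, attach RNDIS, allocate and attach sub-channels, push
 * the RSS key and indirect table (multi-channel only), and finally
 * shrink the in-use ring counts to what NVS actually granted.
 */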
4345 hn_synth_attach(struct hn_softc *sc, int mtu)
4347 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4348 int error, nsubch, nchan, i;
4351 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4352 ("synthetic parts were attached"));
4354 /* Save capabilities for later verification. */
4355 old_caps = sc->hn_caps;
4358 /* Clear RSS stuffs. */
4359 sc->hn_rss_ind_size = 0;
4360 sc->hn_rss_hash = 0;
4363 * Attach the primary channel _before_ attaching NVS and RNDIS.
4365 error = hn_chan_attach(sc, sc->hn_prichan);
4372 error = hn_nvs_attach(sc, mtu);
4377 * Attach RNDIS _after_ NVS is attached.
4379 error = hn_rndis_attach(sc, mtu);
4384 * Make sure capabilities are not changed.
4386 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4387 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4388 old_caps, sc->hn_caps);
4389 /* Restore old capabilities and abort. */
4390 sc->hn_caps = old_caps;
4395 * Allocate sub-channels for multi-TX/RX rings.
4398 * The # of RX rings that can be used is equivalent to the # of
4399 * channels to be requested.
4401 nsubch = sc->hn_rx_ring_cnt - 1;
4402 error = hn_synth_alloc_subchans(sc, &nsubch);
4408 /* Only the primary channel can be used; done */
4413 * Configure RSS key and indirect table _after_ all sub-channels
4417 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4419 * RSS key is not set yet; set it to the default RSS key.
4422 if_printf(sc->hn_ifp, "setup default RSS key\n");
4423 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4424 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4427 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4429		 * RSS indirect table is not set yet; set it up in round-robin fashion.
4433 if_printf(sc->hn_ifp, "setup default RSS indirect "
4436 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
4437 rss->rss_ind[i] = i % nchan;
4438 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4441		 * The # of usable channels may have changed, so we have to
4442		 * make sure that all entries in the RSS indirect table are valid.
4445 hn_rss_ind_fixup(sc, nchan);
4448 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4451 * Failed to configure RSS key or indirect table; only
4452 * the primary channel can be used.
4458 * Set the # of TX/RX rings that could be used according to
4459 * the # of channels that NVS offered.
4461 hn_set_ring_inuse(sc, nchan);
4464 * Attach the sub-channels, if any.
4466 error = hn_attach_subchans(sc);
4471 * Fixup transmission aggregation setup.
4475 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4481 * The interface must have been suspended through hn_suspend() before
4482 * this function gets called.
4485 hn_synth_detach(struct hn_softc *sc)
4489 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4490 ("synthetic parts were not attached"));
4492 /* Detach the RNDIS first. */
4493 hn_rndis_detach(sc);
4498 /* Detach all of the channels. */
4499 hn_detach_allchans(sc);
4501 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4505 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4507 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4508 ("invalid ring count %d", ring_cnt));
4510 if (sc->hn_tx_ring_cnt > ring_cnt)
4511 sc->hn_tx_ring_inuse = ring_cnt;
4513 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4514 sc->hn_rx_ring_inuse = ring_cnt;
4517 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4518 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
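/*
 * hn_chan_drain() below waits until both the RX and TX sides of the
 * channel's bufring are empty, then drains the channel's interrupt
 * task, so no callback can still be running against the ring once it
 * returns.
 */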
4523 hn_chan_drain(struct vmbus_channel *chan)
4526 while (!vmbus_chan_rx_empty(chan) || !vmbus_chan_tx_empty(chan))
4528 vmbus_chan_intr_drain(chan);
4532 hn_suspend_data(struct hn_softc *sc)
4534 struct vmbus_channel **subch = NULL;
4542 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4543 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4545 mtx_lock(&txr->hn_tx_lock);
4546 txr->hn_suspended = 1;
4547 mtx_unlock(&txr->hn_tx_lock);
4548		/* No one is able to send more packets now. */
4550 /* Wait for all pending sends to finish. */
4551 while (hn_tx_ring_pending(txr))
4552 pause("hnwtx", 1 /* 1 tick */);
4554 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
4555 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
4559 * Disable RX by clearing RX filter.
4561 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
4562 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);
4565 * Give RNDIS enough time to flush all pending data packets.
4567 pause("waitrx", (200 * hz) / 1000);
4570 * Drain RX/TX bufrings and interrupts.
4572 nsubch = sc->hn_rx_ring_inuse - 1;
4574 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4576 if (subch != NULL) {
4577 for (i = 0; i < nsubch; ++i)
4578 hn_chan_drain(subch[i]);
4580 hn_chan_drain(sc->hn_prichan);
4583 vmbus_subchan_rel(subch, nsubch);
4587 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
4590 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
4594 hn_suspend_mgmt(struct hn_softc *sc)
4601	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
4602 * through hn_mgmt_taskq.
4604 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
4605 vmbus_chan_run_task(sc->hn_prichan, &task);
4608 * Make sure that all pending management tasks are completed.
4610 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
4611 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
4612 taskqueue_drain_all(sc->hn_mgmt_taskq0);
4616 hn_suspend(struct hn_softc *sc)
4619 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4620 hn_suspend_data(sc);
4621 hn_suspend_mgmt(sc);
4625 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
4629 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
4630 ("invalid TX ring count %d", tx_ring_cnt));
4632 for (i = 0; i < tx_ring_cnt; ++i) {
4633 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4635 mtx_lock(&txr->hn_tx_lock);
4636 txr->hn_suspended = 0;
4637 mtx_unlock(&txr->hn_tx_lock);
4642 hn_resume_data(struct hn_softc *sc)
4651 hn_set_rxfilter(sc);
4654 * Make sure to clear suspend status on "all" TX rings,
4655 * since hn_tx_ring_inuse can be changed after
4656 * hn_suspend_data().
4658 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
4660 #ifdef HN_IFSTART_SUPPORT
4661 if (!hn_use_if_start)
4665	 * Flush unused drbrs, since hn_tx_ring_inuse may have changed.
4668 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
4669 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4675 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4676 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4679		 * Use the txeof task, so that any pending oactive can be cleared.
4682 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4687 hn_resume_mgmt(struct hn_softc *sc)
4690 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
4693 * Kick off network change detection, if it was pending.
4694 * If no network change was pending, start link status
4695 * checks, which is more lightweight than network change
4698 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
4699 hn_change_network(sc);
4701 hn_update_link_status(sc);
4705 hn_resume(struct hn_softc *sc)
4708 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
static void
hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
{
        const struct rndis_status_msg *msg;
        int ofs;

        if (dlen < sizeof(*msg)) {
                if_printf(sc->hn_ifp, "invalid RNDIS status\n");
                return;
        }
        msg = data;

        switch (msg->rm_status) {
        case RNDIS_STATUS_MEDIA_CONNECT:
        case RNDIS_STATUS_MEDIA_DISCONNECT:
                hn_update_link_status(sc);
                break;

        case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
                /* Not really useful; ignore. */
                break;

        case RNDIS_STATUS_NETWORK_CHANGE:
                ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
                if (dlen < ofs + msg->rm_stbuflen ||
                    msg->rm_stbuflen < sizeof(uint32_t)) {
                        if_printf(sc->hn_ifp, "network changed\n");
                } else {
                        uint32_t change;

                        memcpy(&change, ((const uint8_t *)msg) + ofs,
                            sizeof(change));
                        if_printf(sc->hn_ifp, "network changed, change %u\n",
                            change);
                }
                hn_change_network(sc);
                break;

        default:
                if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
                    msg->rm_status);
                break;
        }
}
static int
hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
{
        const struct rndis_pktinfo *pi = info_data;
        uint32_t mask = 0;

        while (info_dlen != 0) {
                const void *data;
                uint32_t dlen;

                if (__predict_false(info_dlen < sizeof(*pi)))
                        return (EINVAL);
                if (__predict_false(info_dlen < pi->rm_size))
                        return (EINVAL);
                info_dlen -= pi->rm_size;

                if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
                        return (EINVAL);
                if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
                        return (EINVAL);
                dlen = pi->rm_size - pi->rm_pktinfooffset;
                data = pi->rm_data;

                switch (pi->rm_type) {
                case NDIS_PKTINFO_TYPE_VLAN:
                        if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
                                return (EINVAL);
                        info->vlan_info = *((const uint32_t *)data);
                        mask |= HN_RXINFO_VLAN;
                        break;

                case NDIS_PKTINFO_TYPE_CSUM:
                        if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
                                return (EINVAL);
                        info->csum_info = *((const uint32_t *)data);
                        mask |= HN_RXINFO_CSUM;
                        break;

                case HN_NDIS_PKTINFO_TYPE_HASHVAL:
                        if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
                                return (EINVAL);
                        info->hash_value = *((const uint32_t *)data);
                        mask |= HN_RXINFO_HASHVAL;
                        break;

                case HN_NDIS_PKTINFO_TYPE_HASHINF:
                        if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
                                return (EINVAL);
                        info->hash_info = *((const uint32_t *)data);
                        mask |= HN_RXINFO_HASHINF;
                        break;

                default:
                        goto next;
                }

                if (mask == HN_RXINFO_ALL) {
                        /* All found; done */
                        break;
                }
next:
                pi = (const struct rndis_pktinfo *)
                    ((const uint8_t *)pi + pi->rm_size);
        }

        /*
         * Final fixup.
         * - If there is no hash value, invalidate the hash info.
         */
        if ((mask & HN_RXINFO_HASHVAL) == 0)
                info->hash_info = HN_NDIS_HASH_INFO_INVALID;
        return (0);
}
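
/*
 * Returns true if [off, off + len) overlaps
 * [check_off, check_off + check_len).  Illustrative values: off 10,
 * len 4 does not overlap check_off 14, check_len 2 (10 + 4 <= 14),
 * while off 10, len 5 does.
 */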
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{
        if (off < check_off) {
                if (__predict_true(off + len <= check_off))
                        return (false);
        } else if (off > check_off) {
                if (__predict_true(check_off + check_len <= off))
                        return (false);
        }
        return (true);
}
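
/*
 * An RNDIS packet message carries up to three host-supplied regions:
 * the packet data (one Ethernet frame), optional OOB data, and
 * optional per-packet-info records.  The rm_*offset fields are
 * relative; RNDIS_PACKET_MSG_OFFSET_ABS() converts them to offsets
 * from the start of the message.  Since the host is not trusted,
 * every offset/length pair is range- and overlap-checked below before
 * anything is dereferenced.
 */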
static void
hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
{
        const struct rndis_packet_msg *pkt;
        struct hn_rxinfo info;
        int data_off, pktinfo_off, data_len, pktinfo_len;

        /*
         * Check length.
         */
        if (__predict_false(dlen < sizeof(*pkt))) {
                if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
                return;
        }
        pkt = data;

        if (__predict_false(dlen < pkt->rm_len)) {
                if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
                    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
                return;
        }
        if (__predict_false(pkt->rm_len <
            pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
                if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
                    "msglen %u, data %u, oob %u, pktinfo %u\n",
                    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
                    pkt->rm_pktinfolen);
                return;
        }
        if (__predict_false(pkt->rm_datalen == 0)) {
                if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
                return;
        }

        /*
         * Check offsets.
         */
#define IS_OFFSET_INVALID(ofs)                  \
        ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
         ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

        /* XXX Hyper-V does not meet data offset alignment requirement */
        if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
                if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
                    "data offset %u\n", pkt->rm_dataoffset);
                return;
        }
        if (__predict_false(pkt->rm_oobdataoffset > 0 &&
            IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
                if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
                    "oob offset %u\n", pkt->rm_oobdataoffset);
                return;
        }
        if (__predict_true(pkt->rm_pktinfooffset > 0) &&
            __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
                if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
                    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
                return;
        }

#undef IS_OFFSET_INVALID

        data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
        data_len = pkt->rm_datalen;
        pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
        pktinfo_len = pkt->rm_pktinfolen;

        /*
         * Check OOB coverage.
         */
        if (__predict_false(pkt->rm_oobdatalen != 0)) {
                int oob_off, oob_len;

                if_printf(rxr->hn_ifp, "got oobdata\n");
                oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
                oob_len = pkt->rm_oobdatalen;

                if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
                        if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
                            "oob overflow, msglen %u, oob abs %d len %d\n",
                            pkt->rm_len, oob_off, oob_len);
                        return;
                }

                /* Check against data. */
                if (hn_rndis_check_overlap(oob_off, oob_len,
                    data_off, data_len)) {
                        if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
                            "oob overlaps data, oob abs %d len %d, "
                            "data abs %d len %d\n",
                            oob_off, oob_len, data_off, data_len);
                        return;
                }

                /* Check against pktinfo. */
                if (pktinfo_len != 0 &&
                    hn_rndis_check_overlap(oob_off, oob_len,
                    pktinfo_off, pktinfo_len)) {
                        if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
                            "oob overlaps pktinfo, oob abs %d len %d, "
                            "pktinfo abs %d len %d\n",
                            oob_off, oob_len, pktinfo_off, pktinfo_len);
                        return;
                }
        }

        /*
         * Check per-packet-info coverage and find useful per-packet-info.
         */
        info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
        info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
        info.hash_info = HN_NDIS_HASH_INFO_INVALID;
        if (__predict_true(pktinfo_len != 0)) {
                bool overlap;
                int error;

                if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
                        if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
                            "pktinfo overflow, msglen %u, "
                            "pktinfo abs %d len %d\n",
                            pkt->rm_len, pktinfo_off, pktinfo_len);
                        return;
                }

                /* Check packet info coverage. */
                overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
                    data_off, data_len);
                if (__predict_false(overlap)) {
                        if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
                            "pktinfo overlaps data, pktinfo abs %d len %d, "
                            "data abs %d len %d\n",
                            pktinfo_off, pktinfo_len, data_off, data_len);
                        return;
                }

                /* Find useful per-packet-info. */
                error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
                    pktinfo_len, &info);
                if (__predict_false(error)) {
                        if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
                            "pktinfo\n");
                        return;
                }
        }

        if (__predict_false(data_off + data_len > pkt->rm_len)) {
                if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
                    "data overflow, msglen %u, data abs %d len %d\n",
                    pkt->rm_len, data_off, data_len);
                return;
        }
        hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
}
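
/*
 * Dispatch an inbound RNDIS message by type: packet messages take the
 * hot path straight into hn_rndis_rx_data(), status indications go to
 * hn_rndis_rx_status(), and everything else is treated as a control
 * message completion.
 */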
static __inline void
hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
{
        const struct rndis_msghdr *hdr;

        if (__predict_false(dlen < sizeof(*hdr))) {
                if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
                return;
        }
        hdr = data;

        if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
                /* Hot data path. */
                hn_rndis_rx_data(rxr, data, dlen);
                /* Done! */
                return;
        }

        if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
                hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
        else
                hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
}
static void
hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
{
        const struct hn_nvs_hdr *hdr;

        if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
                if_printf(sc->hn_ifp, "invalid nvs notify\n");
                return;
        }
        hdr = VMBUS_CHANPKT_CONST_DATA(pkt);

        if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
                /* Useless; ignore */
                return;
        }
        if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
}
static void
hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt)
{
        struct hn_nvs_sendctx *sndc;

        sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
        sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
            VMBUS_CHANPKT_DATALEN(pkt));
        /*
         * NOTE:
         * 'sndc' CAN NOT be accessed anymore, since it can be freed by
         * its callback.
         */
}
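
/*
 * RXBUF channel packets do not carry RNDIS data inline; the data sits
 * in the RX buffer shared with the hypervisor, and the channel packet
 * only lists (offset, length) ranges into that buffer.  After all
 * ranges have been processed, the RXBUF must be acked so the host can
 * recycle it.
 */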
static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
        const struct vmbus_chanpkt_rxbuf *pkt;
        const struct hn_nvs_hdr *nvs_hdr;
        int count, i, hlen;

        if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
                if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
                return;
        }
        nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

        /* Make sure that this is a RNDIS message. */
        if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
                if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
                    nvs_hdr->nvs_type);
                return;
        }

        hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
        if (__predict_false(hlen < sizeof(*pkt))) {
                if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
                return;
        }
        pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

        if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
                if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
                    pkt->cp_rxbuf_id);
                return;
        }

        count = pkt->cp_rxbuf_cnt;
        if (__predict_false(hlen <
            __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
                if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
                return;
        }

        /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
        for (i = 0; i < count; ++i) {
                int ofs, len;

                ofs = pkt->cp_rxbuf[i].rb_ofs;
                len = pkt->cp_rxbuf[i].rb_len;
                if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
                        if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
                            "ofs %d, len %d\n", i, ofs, len);
                        continue;
                }
                hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
        }

        /*
         * Ack the consumed RXBUF associated w/ this channel packet,
         * so that this RXBUF can be recycled by the hypervisor.
         */
        hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}
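
/*
 * Ack a consumed RXBUF as a completion on the channel it arrived on.
 * EAGAIN from vmbus_chan_send() means the bufring is transiently
 * full; the send is retried a bounded number of times before the
 * RXBUF section is given up as leaked.
 */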
static void
hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    uint64_t tid)
{
        struct hn_nvs_rndis_ack ack;
        int retries, error;

        ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
        ack.nvs_status = HN_NVS_STATUS_OK;

        retries = 0;
again:
        error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
            VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
        if (__predict_false(error == EAGAIN)) {
                /*
                 * NOTE:
                 * This should _not_ happen in real world, since the
                 * consumption of the TX bufring from the TX path is
                 * controlled.
                 */
                if (rxr->hn_ack_failed == 0)
                        if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
                rxr->hn_ack_failed++;
                retries++;
                if (retries < 10) {
                        DELAY(100);
                        goto again;
                }
                /* RXBUF leaks! */
                if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
        }
}
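
/*
 * Per-channel VMBUS callback: drain all pending channel packets,
 * growing the preallocated packet buffer geometrically on ENOBUFS so
 * that repeated expansion stays cheap, then roll up the RX/TX rings
 * for deferred LRO flushing and transmit completion processing.
 */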
static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
        struct hn_rx_ring *rxr = xrxr;
        struct hn_softc *sc = rxr->hn_ifp->if_softc;

        for (;;) {
                struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
                int error, pktlen;

                pktlen = rxr->hn_pktbuf_len;
                error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
                if (__predict_false(error == ENOBUFS)) {
                        void *nbuf;
                        int nlen;

                        /*
                         * Expand channel packet buffer.
                         *
                         * XXX
                         * Use M_WAITOK here, since allocation failure
                         * is fatal.
                         */
                        nlen = rxr->hn_pktbuf_len * 2;
                        while (nlen < pktlen)
                                nlen *= 2;
                        nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

                        if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
                            rxr->hn_pktbuf_len, nlen);

                        free(rxr->hn_pktbuf, M_DEVBUF);
                        rxr->hn_pktbuf = nbuf;
                        rxr->hn_pktbuf_len = nlen;
                        /* Retry! */
                        continue;
                } else if (__predict_false(error == EAGAIN)) {
                        /* No more channel packets; done! */
                        break;
                }
                KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

                switch (pkt->cph_type) {
                case VMBUS_CHANPKT_TYPE_COMP:
                        hn_nvs_handle_comp(sc, chan, pkt);
                        break;
                case VMBUS_CHANPKT_TYPE_RXBUF:
                        hn_nvs_handle_rxbuf(rxr, chan, pkt);
                        break;
                case VMBUS_CHANPKT_TYPE_INBAND:
                        hn_nvs_handle_notify(sc, pkt);
                        break;
                default:
                        if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
                            pkt->cph_type);
                        break;
                }
        }
        hn_chan_rollup(rxr, rxr->hn_txr);
}
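
/*
 * The shared TX taskqueue is created from SYSINIT rather than device
 * attach, so that a single queue can serve all hn(4) instances when
 * hn_share_tx_taskq is set; hn_bind_tx_taskq optionally pins the
 * queue's thread to one CPU through a one-shot cpuset task.
 */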
static void
hn_tx_taskq_create(void *arg __unused)
{

        if (vm_guest != VM_GUEST_HV)
                return;

        if (!hn_share_tx_taskq)
                return;

        hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
            taskqueue_thread_enqueue, &hn_tx_taskq);
        taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
        if (hn_bind_tx_taskq >= 0) {
                int cpu = hn_bind_tx_taskq;
                struct task cpuset_task;
                cpuset_t cpu_set;

                if (cpu > mp_ncpus - 1)
                        cpu = mp_ncpus - 1;
                CPU_SETOF(cpu, &cpu_set);
                TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, &cpu_set);
                taskqueue_enqueue(hn_tx_taskq, &cpuset_task);
                taskqueue_drain(hn_tx_taskq, &cpuset_task);
        }
}
SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_create, NULL);

static void
hn_tx_taskq_destroy(void *arg __unused)
{

        if (hn_tx_taskq != NULL)
                taskqueue_free(hn_tx_taskq);
}
SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_destroy, NULL);