/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/ethernet.h>
#include <net/if_arp.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"
#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
    (sizeof(struct rndis_packet_msg) +				\
     HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +		\
     HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
     HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
     HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
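/*
 * HN_RNDIS_PKT_LEN is the worst-case size of the RNDIS packet message
 * prepended to each TX packet: the fixed header plus the four optional
 * per-packet-info elements this driver may attach (hash value, VLAN,
 * LSO2, and TX checksum).  Reserving the worst case up front keeps
 * hn_rndis_pktinfo_append() from ever overflowing the preallocated
 * header buffer.
 */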
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)		\
    sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)			\
do {					\
    while (sx_try_xlock(&(sc)->hn_lock) == 0) \
        DELAY(1000);			\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
    ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
    ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
    roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
        HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
    roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
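/*
 * HN_PKTSIZE() is the chimney/aggregation footprint of an mbuf chain:
 * payload length plus the RNDIS packet message, rounded up to the
 * host-specified alignment.  For example, a 1514-byte frame with a
 * 32-byte aggregation alignment would occupy
 * roundup2(1514 + HN_RNDIS_PKT_LEN, 32) bytes of the chimney buffer.
 */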
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
    SLIST_ENTRY(hn_txdesc) link;
#endif
    STAILQ_ENTRY(hn_txdesc) agg_link;

    /* Aggregated txdescs, in sending order. */
    STAILQ_HEAD(, hn_txdesc) agg_list;

    /* The oldest packet, if transmission aggregation happens. */
    struct hn_tx_ring *txr;
    uint32_t flags;		/* HN_TXD_FLAG_ */
    struct hn_nvs_sendctx send_ctx;

    bus_dmamap_t data_dmap;

    bus_addr_t rndis_pkt_paddr;
    struct rndis_packet_msg *rndis_pkt;
    bus_dmamap_t rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST	0x0001
#define HN_TXD_FLAG_DMAMAP	0x0002
#define HN_TXD_FLAG_ONAGG	0x0004
#define HN_RXINFO_VLAN		0x0001
#define HN_RXINFO_CSUM		0x0002
#define HN_RXINFO_HASHINF	0x0004
#define HN_RXINFO_HASHVAL	0x0008
#define HN_RXINFO_ALL		\
    (HN_RXINFO_VLAN |		\
     HN_RXINFO_CSUM |		\
     HN_RXINFO_HASHINF |	\
     HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
static int hn_probe(device_t);
static int hn_attach(device_t);
static int hn_detach(device_t);
static int hn_shutdown(device_t);
static void hn_chan_callback(struct vmbus_channel *,
    void *);

static void hn_init(void *);
static int hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void hn_start(struct ifnet *);
#endif
static int hn_transmit(struct ifnet *, struct mbuf *);
static void hn_xmit_qflush(struct ifnet *);
static int hn_ifmedia_upd(struct ifnet *);
static void hn_ifmedia_sts(struct ifnet *,
    struct ifmediareq *);

static int hn_rndis_rxinfo(const void *, int,
    struct hn_rxinfo *);
static void hn_rndis_rx_data(struct hn_rx_ring *,
    const void *, int);
static void hn_rndis_rx_status(struct hn_softc *,
    const void *, int);

static void hn_nvs_handle_notify(struct hn_softc *,
    const struct vmbus_chanpkt_hdr *);
static void hn_nvs_handle_comp(struct hn_softc *,
    struct vmbus_channel *,
    const struct vmbus_chanpkt_hdr *);
static void hn_nvs_handle_rxbuf(struct hn_rx_ring *,
    struct vmbus_channel *,
    const struct vmbus_chanpkt_hdr *);
static void hn_nvs_ack_rxbuf(struct hn_rx_ring *,
    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);

static void hn_stop(struct hn_softc *);
static void hn_init_locked(struct hn_softc *);
static int hn_chan_attach(struct hn_softc *,
    struct vmbus_channel *);
static void hn_chan_detach(struct hn_softc *,
    struct vmbus_channel *);
static int hn_attach_subchans(struct hn_softc *);
static void hn_detach_allchans(struct hn_softc *);
static void hn_chan_rollup(struct hn_rx_ring *,
    struct hn_tx_ring *);
static void hn_set_ring_inuse(struct hn_softc *, int);
static int hn_synth_attach(struct hn_softc *, int);
static void hn_synth_detach(struct hn_softc *);
static int hn_synth_alloc_subchans(struct hn_softc *,
    int *);
static bool hn_synth_attachable(const struct hn_softc *);
static void hn_suspend(struct hn_softc *);
static void hn_suspend_data(struct hn_softc *);
static void hn_suspend_mgmt(struct hn_softc *);
static void hn_resume(struct hn_softc *);
static void hn_resume_data(struct hn_softc *);
static void hn_resume_mgmt(struct hn_softc *);
static void hn_suspend_mgmt_taskfunc(void *, int);
static void hn_chan_drain(struct hn_softc *,
    struct vmbus_channel *);

static void hn_update_link_status(struct hn_softc *);
static void hn_change_network(struct hn_softc *);
static void hn_link_taskfunc(void *, int);
static void hn_netchg_init_taskfunc(void *, int);
static void hn_netchg_status_taskfunc(void *, int);
static void hn_link_status(struct hn_softc *);

static int hn_create_rx_data(struct hn_softc *, int);
static void hn_destroy_rx_data(struct hn_softc *);
static int hn_check_iplen(const struct mbuf *, int);
static int hn_set_rxfilter(struct hn_softc *);
static int hn_rss_reconfig(struct hn_softc *);
static void hn_rss_ind_fixup(struct hn_softc *);
static int hn_rxpkt(struct hn_rx_ring *, const void *,
    int, const struct hn_rxinfo *);

static int hn_tx_ring_create(struct hn_softc *, int);
static void hn_tx_ring_destroy(struct hn_tx_ring *);
static int hn_create_tx_data(struct hn_softc *, int);
static void hn_fixup_tx_data(struct hn_softc *);
static void hn_destroy_tx_data(struct hn_softc *);
static void hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void hn_txdesc_gc(struct hn_tx_ring *,
    struct hn_txdesc *);
static int hn_encap(struct ifnet *, struct hn_tx_ring *,
    struct hn_txdesc *, struct mbuf **);
static int hn_txpkt(struct ifnet *, struct hn_tx_ring *,
    struct hn_txdesc *);
static void hn_set_chim_size(struct hn_softc *, int);
static void hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool hn_tx_ring_pending(struct hn_tx_ring *);
static void hn_tx_ring_qflush(struct hn_tx_ring *);
static void hn_resume_tx(struct hn_softc *, int);
static void hn_set_txagg(struct hn_softc *);
static void *hn_try_txagg(struct ifnet *,
    struct hn_tx_ring *, struct hn_txdesc *,
    int);
static int hn_get_txswq_depth(const struct hn_tx_ring *);
static void hn_txpkt_done(struct hn_nvs_sendctx *,
    struct hn_softc *, struct vmbus_channel *,
    const void *, int);
static int hn_txpkt_sglist(struct hn_tx_ring *,
    struct hn_txdesc *);
static int hn_txpkt_chim(struct hn_tx_ring *,
    struct hn_txdesc *);
static int hn_xmit(struct hn_tx_ring *, int);
static void hn_xmit_taskfunc(void *, int);
static void hn_xmit_txeof(struct hn_tx_ring *);
static void hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int hn_start_locked(struct hn_tx_ring *, int);
static void hn_start_taskfunc(void *, int);
static void hn_start_txeof(struct hn_tx_ring *);
static void hn_start_txeof_taskfunc(void *, int);
#endif
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust tcp segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust udp datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packets for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

/* Use shared TX taskqueue */
static int hn_share_tx_taskq = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN,
    &hn_share_tx_taskq, 0, "Enable shared TX taskqueue");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

/* Bind TX taskqueue to the target CPU */
static int hn_bind_tx_taskq = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
    &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
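/*
 * A usage sketch (not from the original source): the CTLFLAG_RDTUN oids
 * above are boot-time tunables, set from loader.conf(5), e.g.
 *
 *	hw.hn.chan_cnt=4
 *	hw.hn.tx_agg_size=0	(disable TX aggregation by size)
 *
 * They are read-only once the system is up; sysctl(8) can inspect but
 * not change them.
 */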
static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue *hn_tx_taskq;	/* shared TX taskqueue */
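/*
 * Default Toeplitz RSS key.  This appears to be the standard 40-byte
 * key published in Microsoft's RSS specification, which many other
 * RSS-capable drivers also use as their default.
 */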
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
    0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
    0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
    0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
    0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
    0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static device_method_t hn_methods[] = {
    /* Device interface */
    DEVMETHOD(device_probe, hn_probe),
    DEVMETHOD(device_attach, hn_attach),
    DEVMETHOD(device_detach, hn_detach),
    DEVMETHOD(device_shutdown, hn_shutdown),
    DEVMETHOD_END
};

static driver_t hn_driver = {
    "hn",
    hn_methods,
    sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
    int i;

    for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
        sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

    KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
        txd->chim_size == 0, ("invalid rndis sglist txd"));
    return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
        &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
    struct hn_nvs_rndis rndis;

    KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
        txd->chim_size > 0, ("invalid rndis chim txd"));

    rndis.nvs_type = HN_NVS_TYPE_RNDIS;
    rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
    rndis.nvs_chim_idx = txd->chim_index;
    rndis.nvs_chim_sz = txd->chim_size;

    return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
        &rndis, sizeof(rndis), &txd->send_ctx));
}

static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
    int i, bmap_cnt = sc->hn_chim_bmap_cnt;
    u_long *bmap = sc->hn_chim_bmap;
    uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
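    /*
     * Lock-free scan of the chimney bitmap: find a clear bit with
     * ffsl(~bmap[i]) and claim it with atomic_testandset_long().
     * If another ring wins the race for that bit, simply keep
     * scanning.
     */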
    for (i = 0; i < bmap_cnt; ++i) {
        int idx;

        idx = ffsl(~bmap[i]);
        if (idx == 0)
            continue;

        --idx; /* ffsl is 1-based */
        KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
            ("invalid i %d and idx %d", i, idx));

        if (atomic_testandset_long(&bmap[i], idx))
            continue;

        ret = i * LONG_BIT + idx;
        break;
    }
    return (ret);
}

static void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
    u_long mask;
    uint32_t idx;

    idx = chim_idx / LONG_BIT;
    KASSERT(idx < sc->hn_chim_bmap_cnt,
        ("invalid chimney index 0x%x", chim_idx));

    mask = 1UL << (chim_idx % LONG_BIT);
    KASSERT(sc->hn_chim_bmap[idx] & mask,
        ("index bitmap 0x%lx, chimney index %u, "
         "bitmap idx %d, bitmask 0x%lx",
         sc->hn_chim_bmap[idx], chim_idx, idx, mask));

    atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
#if defined(INET6) || defined(INET)
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
    struct ether_vlan_header *evl;
    struct tcphdr *th;
    int ehlen;

    KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
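/*
 * PULLUP_HDR makes sure the first 'len' bytes are contiguous in the
 * leading mbuf, reloading 'm' via m_pullup() when they are not.  Note
 * that m_pullup() frees the whole chain and returns NULL on failure,
 * which is how a failed fixup ends up freeing m_head.
 */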
#define PULLUP_HDR(m, len)			\
do {						\
    if (__predict_false((m)->m_len < (len))) { \
        (m) = m_pullup((m), (len));		\
        if ((m) == NULL)			\
            return (NULL);			\
    }						\
} while (0)

    PULLUP_HDR(m_head, sizeof(*evl));
    evl = mtod(m_head, struct ether_vlan_header *);
    if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
        ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
    else
        ehlen = ETHER_HDR_LEN;

#ifdef INET
    if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
        struct ip *ip;
        int iphlen;

        PULLUP_HDR(m_head, ehlen + sizeof(*ip));
        ip = mtodo(m_head, ehlen);
        iphlen = ip->ip_hl << 2;

        PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
        th = mtodo(m_head, ehlen + iphlen);

        ip->ip_len = 0;
        ip->ip_sum = 0;
        th->th_sum = in_pseudo(ip->ip_src.s_addr,
            ip->ip_dst.s_addr, htons(IPPROTO_TCP));
    }
#endif
#if defined(INET6) && defined(INET)
    else
#endif
#ifdef INET6
    {
        struct ip6_hdr *ip6;

        PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
        ip6 = mtodo(m_head, ehlen);
        if (ip6->ip6_nxt != IPPROTO_TCP) {
            m_freem(m_head);
            return (NULL);
        }

        PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
        th = mtodo(m_head, ehlen + sizeof(*ip6));

        ip6->ip6_plen = 0;
        th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
    }
#endif
    return (m_head);
}
#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc)
{
    struct ifnet *ifp = sc->hn_ifp;
    uint32_t filter;
    int error = 0;

    HN_LOCK_ASSERT(sc);

    if (ifp->if_flags & IFF_PROMISC) {
        filter = NDIS_PACKET_TYPE_PROMISCUOUS;
    } else {
        filter = NDIS_PACKET_TYPE_DIRECTED;
        if (ifp->if_flags & IFF_BROADCAST)
            filter |= NDIS_PACKET_TYPE_BROADCAST;
        /* TODO: support multicast list */
        if ((ifp->if_flags & IFF_ALLMULTI) ||
            !TAILQ_EMPTY(&ifp->if_multiaddrs))
            filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
    }

    if (sc->hn_rx_filter != filter) {
        error = hn_rndis_set_rxfilter(sc, filter);
        if (!error)
            sc->hn_rx_filter = filter;
    }
    return (error);
}
static void
hn_set_txagg(struct hn_softc *sc)
{
    uint32_t size, pkts;
    int i;

    /*
     * Setup aggregation size.
     */
    if (sc->hn_agg_size < 0)
        size = UINT32_MAX;
    else
        size = sc->hn_agg_size;

    if (sc->hn_rndis_agg_size < size)
        size = sc->hn_rndis_agg_size;

    /* NOTE: We only aggregate packets using chimney sending buffers. */
    if (size > (uint32_t)sc->hn_chim_szmax)
        size = sc->hn_chim_szmax;

    if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
        /* Disable. */
        size = 0;
        pkts = 0;
        goto done;
    }

    /* NOTE: Type of the per TX ring setting is 'int'. */
    if (size > INT_MAX)
        size = INT_MAX;

    /*
     * Setup aggregation packet count.
     */
    if (sc->hn_agg_pkts < 0)
        pkts = UINT32_MAX;
    else
        pkts = sc->hn_agg_pkts;

    if (sc->hn_rndis_agg_pkts < pkts)
        pkts = sc->hn_rndis_agg_pkts;

    /* NOTE: Type of the per TX ring setting is 'short'. */
    if (pkts > SHRT_MAX)
        pkts = SHRT_MAX;

done:
    /* NOTE: Type of the per TX ring setting is 'short'. */
    if (sc->hn_rndis_agg_align > SHRT_MAX) {
        /* Disable. */
        size = 0;
        pkts = 0;
    }

    if (bootverbose) {
        if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
            size, pkts, sc->hn_rndis_agg_align);
    }

    for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
        struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

        mtx_lock(&txr->hn_tx_lock);
        txr->hn_agg_szmax = size;
        txr->hn_agg_pktmax = pkts;
        txr->hn_agg_align = sc->hn_rndis_agg_align;
        mtx_unlock(&txr->hn_tx_lock);
    }
}
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

    KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
    if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
        return txr->hn_txdesc_cnt;
    return hn_tx_swq_depth;
}
static int
hn_rss_reconfig(struct hn_softc *sc)
{
    int error;

    HN_LOCK_ASSERT(sc);

    if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
        return (ENXIO);

    /*
     * Disable RSS first.
     *
     * NOTE:
     * Direct reconfiguration by setting the UNCHG flags does
     * _not_ work properly.
     */
    if (bootverbose)
        if_printf(sc->hn_ifp, "disable RSS\n");
    error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
    if (error) {
        if_printf(sc->hn_ifp, "RSS disable failed\n");
        return (error);
    }

    /*
     * Reenable the RSS w/ the updated RSS key or indirect
     * table.
     */
    if (bootverbose)
        if_printf(sc->hn_ifp, "reconfig RSS\n");
    error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
    if (error) {
        if_printf(sc->hn_ifp, "RSS reconfig failed\n");
        return (error);
    }
    return (0);
}
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
    struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
    int i, nchan;

    nchan = sc->hn_rx_ring_inuse;
    KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

    /*
     * Check indirect table to make sure that all channels in it
     * can be used.
     */
    for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
        if (rss->rss_ind[i] >= nchan) {
            if_printf(sc->hn_ifp,
                "RSS indirect table %d fixup: %u -> %d\n",
                i, rss->rss_ind[i], nchan - 1);
            rss->rss_ind[i] = nchan - 1;
        }
    }
}
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

    return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
    struct hn_softc *sc = ifp->if_softc;

    ifmr->ifm_status = IFM_AVALID;
    ifmr->ifm_active = IFM_ETHER;

    if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
        ifmr->ifm_active |= IFM_NONE;
        return;
    }
    ifmr->ifm_status |= IFM_ACTIVE;
    ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
    .hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
    0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};
static int
hn_probe(device_t dev)
{

    if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
        &g_net_vsc_device_type) == 0) {
        device_set_desc(dev, "Hyper-V Network Interface");
        return BUS_PROBE_DEFAULT;
    }
    return ENXIO;
}

static void
hn_cpuset_setthread_task(void *xmask, int pending __unused)
{
    cpuset_t *mask = xmask;
    int error;

    error = cpuset_setthread(curthread->td_tid, mask);
    if (error) {
        panic("curthread=%ju: can't pin; error=%d",
            (uintmax_t)curthread->td_tid, error);
    }
}
static int
hn_attach(device_t dev)
{
    struct hn_softc *sc = device_get_softc(dev);
    struct sysctl_oid_list *child;
    struct sysctl_ctx_list *ctx;
    uint8_t eaddr[ETHER_ADDR_LEN];
    struct ifnet *ifp = NULL;
    int error, ring_cnt, tx_ring_cnt;

    sc->hn_dev = dev;
    sc->hn_prichan = vmbus_get_channel(dev);
    HN_LOCK_INIT(sc);

    /*
     * Initialize these tunables once.
     */
    sc->hn_agg_size = hn_tx_agg_size;
    sc->hn_agg_pkts = hn_tx_agg_pkts;

    /*
     * Setup taskqueue for transmission.
     */
    if (hn_tx_taskq == NULL) {
        sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
            taskqueue_thread_enqueue, &sc->hn_tx_taskq);
        taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx",
            device_get_nameunit(dev));
        if (hn_bind_tx_taskq >= 0) {
            int cpu = hn_bind_tx_taskq;
            struct task cpuset_task;
            cpuset_t cpu_set;

            if (cpu > mp_ncpus - 1)
                cpu = mp_ncpus - 1;
            CPU_SETOF(cpu, &cpu_set);
            TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task,
                &cpu_set);
            taskqueue_enqueue(sc->hn_tx_taskq, &cpuset_task);
            taskqueue_drain(sc->hn_tx_taskq, &cpuset_task);
        }
    } else {
        sc->hn_tx_taskq = hn_tx_taskq;
    }
    /*
     * Setup taskqueue for management tasks, e.g. link status.
     */
    sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
        taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
    taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
        device_get_nameunit(dev));
    TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
    TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
    TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
        hn_netchg_status_taskfunc, sc);
    /*
     * Allocate ifnet and setup its name earlier, so that if_printf
     * can be used by functions, which will be called after
     * ether_ifattach().
     */
    ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
    ifp->if_softc = sc;
    if_initname(ifp, device_get_name(dev), device_get_unit(dev));

    /*
     * Initialize ifmedia earlier so that it can be unconditionally
     * destroyed, if an error happens later on.
     */
    ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

    /*
     * Figure out the # of RX rings (ring_cnt) and the # of TX rings
     * to use (tx_ring_cnt).
     *
     * NOTE:
     * The # of RX rings to use is the same as the # of channels to use.
     */
    ring_cnt = hn_chan_cnt;
    if (ring_cnt <= 0) {
        ring_cnt = mp_ncpus;
        if (ring_cnt > HN_RING_CNT_DEF_MAX)
            ring_cnt = HN_RING_CNT_DEF_MAX;
    } else if (ring_cnt > mp_ncpus) {
        ring_cnt = mp_ncpus;
    }

    tx_ring_cnt = hn_tx_ring_cnt;
    if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
        tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
    if (hn_use_if_start) {
        /* ifnet.if_start only needs one TX ring. */
        tx_ring_cnt = 1;
    }
#endif

    /*
     * Set the leader CPU for channels.
     */
    sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

    /*
     * Create enough TX/RX rings, even if only a limited number of
     * channels can be allocated.
     */
    error = hn_create_tx_data(sc, tx_ring_cnt);
    if (error)
        goto failed;
    error = hn_create_rx_data(sc, ring_cnt);
    if (error)
        goto failed;

    /*
     * Create transaction context for NVS and RNDIS transactions.
     */
    sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
        HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
    if (sc->hn_xact == NULL) {
        error = ENXIO;
        goto failed;
    }

    /*
     * Install orphan handler for the revocation of this device's
     * primary channel.
     *
     * NOTE:
     * The processing order is critical here:
     * Install the orphan handler, _before_ testing whether this
     * device's primary channel has been revoked or not.
     */
    vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
    if (vmbus_chan_is_revoked(sc->hn_prichan)) {
        error = ENXIO;
        goto failed;
    }

    /*
     * Attach the synthetic parts, i.e. NVS and RNDIS.
     */
    error = hn_synth_attach(sc, ETHERMTU);
    if (error)
        goto failed;

    error = hn_rndis_get_eaddr(sc, eaddr);
    if (error)
        goto failed;

#if __FreeBSD_version >= 1100099
    if (sc->hn_rx_ring_inuse > 1) {
        /*
         * Reduce TCP segment aggregation limit for multiple
         * RX rings to increase ACK timeliness.
         */
        hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
    }
#endif

    /*
     * Fixup TX stuff after the synthetic parts are attached.
     */
    hn_fixup_tx_data(sc);
    ctx = device_get_sysctl_ctx(dev);
    child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
    SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
        &sc->hn_nvs_ver, 0, "NVS version");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
        CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
        hn_ndis_version_sysctl, "A", "NDIS version");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
        CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
        hn_caps_sysctl, "A", "capabilities");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
        CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
        hn_hwassist_sysctl, "A", "hwassist");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
        CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
        hn_rxfilter_sysctl, "A", "rxfilter");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
        CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
        hn_rss_hash_sysctl, "A", "RSS hash");
    SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
        CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
        CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
        hn_rss_key_sysctl, "IU", "RSS key");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
        CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
        hn_rss_ind_sysctl, "IU", "RSS indirect table");
    SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
        CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
        "RNDIS offered packet transmission aggregation size limit");
    SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
        CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
        "RNDIS offered packet transmission aggregation count limit");
    SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
        CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
        "RNDIS packet transmission aggregation alignment");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
        CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
        hn_txagg_size_sysctl, "I",
        "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
        CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
        hn_txagg_pkts_sysctl, "I",
        "Packet transmission aggregation packets, "
        "0 -- disable, -1 -- auto");
    /*
     * Setup the ifmedia, which has been initialized earlier.
     */
    ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
    ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
    /* XXX ifmedia_set really should do this for us */
    sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

    /*
     * Setup the ifnet for this interface.
     */
#ifdef __LP64__
    ifp->if_baudrate = IF_Gbps(10);
#else
    /* if_baudrate is 32 bits on 32-bit systems. */
    ifp->if_baudrate = IF_Gbps(1);
#endif
    ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
    ifp->if_ioctl = hn_ioctl;
    ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
    if (hn_use_if_start) {
        int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

        ifp->if_start = hn_start;
        IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
        ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
        IFQ_SET_READY(&ifp->if_snd);
    } else
#endif
    {
        ifp->if_transmit = hn_transmit;
        ifp->if_qflush = hn_xmit_qflush;
    }

    ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
#ifdef foo
    /* We can't distinguish IPv6 packets from IPv4 packets on RX path. */
    ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif

    if (sc->hn_caps & HN_CAP_VLAN) {
        /* XXX not sure about VLAN_MTU. */
        ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
    }

    ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
    if (ifp->if_hwassist & HN_CSUM_IP_MASK)
        ifp->if_capabilities |= IFCAP_TXCSUM;
    if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
        ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
    if (sc->hn_caps & HN_CAP_TSO4) {
        ifp->if_capabilities |= IFCAP_TSO4;
        ifp->if_hwassist |= CSUM_IP_TSO;
    }
    if (sc->hn_caps & HN_CAP_TSO6) {
        ifp->if_capabilities |= IFCAP_TSO6;
        ifp->if_hwassist |= CSUM_IP6_TSO;
    }

    /* Enable all available capabilities by default. */
    ifp->if_capenable = ifp->if_capabilities;

    /*
     * Disable IPv6 TSO and TXCSUM by default; they can still be
     * enabled through SIOCSIFCAP.
     */
    ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
    ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

    if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
        hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
        ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
        ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
    }

    ether_ifattach(ifp, eaddr);

    if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
        if_printf(ifp, "TSO segcnt %u segsz %u\n",
            ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
    }

    /* Inform the upper layer about the long frame support. */
    ifp->if_hdrlen = sizeof(struct ether_vlan_header);

    /*
     * Kick off link status check.
     */
    sc->hn_link_flags = 0;
    sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
    hn_update_link_status(sc);

    return (0);
failed:
    if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
        hn_synth_detach(sc);
    hn_detach(dev);
    return (error);
}
static int
hn_detach(device_t dev)
{
    struct hn_softc *sc = device_get_softc(dev);
    struct ifnet *ifp = sc->hn_ifp;

    if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
        /*
         * In case that the vmbus missed the orphan handler
         * installation.
         */
        vmbus_xact_ctx_orphan(sc->hn_xact);
    }

    if (device_is_attached(dev)) {
        HN_LOCK(sc);
        if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
            if (ifp->if_drv_flags & IFF_DRV_RUNNING)
                hn_stop(sc);
            /*
             * NOTE:
             * hn_stop() only suspends data, so management
             * tasks have to be suspended manually here.
             */
            hn_suspend_mgmt(sc);
            hn_synth_detach(sc);
        }
        HN_UNLOCK(sc);
        ether_ifdetach(ifp);
    }

    ifmedia_removeall(&sc->hn_media);
    hn_destroy_rx_data(sc);
    hn_destroy_tx_data(sc);

    if (sc->hn_tx_taskq != hn_tx_taskq)
        taskqueue_free(sc->hn_tx_taskq);
    taskqueue_free(sc->hn_mgmt_taskq0);

    if (sc->hn_xact != NULL) {
        /*
         * Uninstall the orphan handler _before_ the xact is
         * destroyed.
         */
        vmbus_chan_unset_orphan(sc->hn_prichan);
        vmbus_xact_ctx_destroy(sc->hn_xact);
    }

    if_free(ifp);

    HN_LOCK_DESTROY(sc);
    return (0);
}
static int
hn_shutdown(device_t dev)
{

    return (0);
}

static void
hn_link_status(struct hn_softc *sc)
{
    uint32_t link_status;
    int error;

    error = hn_rndis_get_linkstatus(sc, &link_status);
    if (error) {
        /* XXX what to do? */
        return;
    }

    if (link_status == NDIS_MEDIA_STATE_CONNECTED)
        sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
    else
        sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
    if_link_state_change(sc->hn_ifp,
        (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
        LINK_STATE_UP : LINK_STATE_DOWN);
}
static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
    struct hn_softc *sc = xsc;

    if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
        return;
    hn_link_status(sc);
}

static void
hn_netchg_init_taskfunc(void *xsc, int pending __unused)
{
    struct hn_softc *sc = xsc;

    /* Prevent any link status checks from running. */
    sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;

    /*
     * Fake up a [link down --> link up] state change; a 5 second
     * delay is used, which closely simulates the miibus reaction
     * to a link down event.
     */
    sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
    if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
    taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
        &sc->hn_netchg_status, 5 * hz);
}

static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
    struct hn_softc *sc = xsc;

    /* Re-allow link status checks. */
    sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
    hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

    if (sc->hn_mgmt_taskq != NULL)
        taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

    if (sc->hn_mgmt_taskq != NULL)
        taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}
static int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
    struct mbuf *m = *m_head;
    int error;

    KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

    error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
        m, segs, nsegs, BUS_DMA_NOWAIT);
    if (error == EFBIG) {
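        /*
         * The chain has too many segments for the DMA tag;
         * collapse it and retry the DMA load once.
         */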
        struct mbuf *m_new;

        m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
        if (m_new == NULL)
            return ENOBUFS;
        *m_head = m = m_new;
        txr->hn_tx_collapsed++;

        error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
            txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
    }
    if (!error) {
        bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
            BUS_DMASYNC_PREWRITE);
        txd->flags |= HN_TXD_FLAG_DMAMAP;
    }
    return error;
}
static int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

    KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
        ("put an onlist txd %#x", txd->flags));
    KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
        ("put an onagg txd %#x", txd->flags));

    KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
    if (atomic_fetchadd_int(&txd->refs, -1) != 1)
        return 0;

    if (!STAILQ_EMPTY(&txd->agg_list)) {
        struct hn_txdesc *tmp_txd;
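        /*
         * This txd carries aggregated txdescs; put them back too.
         * Aggregated txdescs never nest, never own a DMA map, and
         * never consume a chimney buffer of their own, as the
         * KASSERTs below verify.
         */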
        while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
            int freed;

            KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
                ("recursive aggregation on aggregated txdesc"));
            KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
                ("not aggregated txdesc"));
            KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
                ("aggregated txdesc uses dmamap"));
            KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
                ("aggregated txdesc consumes "
                 "chimney sending buffer"));
            KASSERT(tmp_txd->chim_size == 0,
                ("aggregated txdesc has non-zero "
                 "chimney sending size"));

            STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
            tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
            freed = hn_txdesc_put(txr, tmp_txd);
            KASSERT(freed, ("failed to free aggregated txdesc"));
        }
    }
    if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
        KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
            ("chim txd uses dmamap"));
        hn_chim_free(txr->hn_sc, txd->chim_index);
        txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
        txd->chim_size = 0;
    } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
        bus_dmamap_sync(txr->hn_tx_data_dtag,
            txd->data_dmap, BUS_DMASYNC_POSTWRITE);
        bus_dmamap_unload(txr->hn_tx_data_dtag,
            txd->data_dmap);
        txd->flags &= ~HN_TXD_FLAG_DMAMAP;
    }

    if (txd->m != NULL) {
        m_freem(txd->m);
        txd->m = NULL;
    }

    txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
    mtx_lock_spin(&txr->hn_txlist_spin);
    KASSERT(txr->hn_txdesc_avail >= 0 &&
        txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
        ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
    txr->hn_txdesc_avail++;
    SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
    mtx_unlock_spin(&txr->hn_txlist_spin);
#else
    atomic_add_int(&txr->hn_txdesc_avail, 1);
    buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif

    return 1;
}
static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
    struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
    mtx_lock_spin(&txr->hn_txlist_spin);
    txd = SLIST_FIRST(&txr->hn_txlist);
    if (txd != NULL) {
        KASSERT(txr->hn_txdesc_avail > 0,
            ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
        txr->hn_txdesc_avail--;
        SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
    }
    mtx_unlock_spin(&txr->hn_txlist_spin);
#else
    txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

    if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
        atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
        KASSERT(txd->m == NULL && txd->refs == 0 &&
            STAILQ_EMPTY(&txd->agg_list) &&
            txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
            txd->chim_size == 0 &&
            (txd->flags & HN_TXD_FLAG_ONLIST) &&
            (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
            (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
        txd->flags &= ~HN_TXD_FLAG_ONLIST;
        txd->refs = 1;
    }
    return txd;
}
static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

    /* 0->1 transition will never work */
    KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
    atomic_add_int(&txd->refs, 1);
}

static __inline void
hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
{

    KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
        ("recursive aggregation on aggregating txdesc"));

    KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
        ("already aggregated"));
    KASSERT(STAILQ_EMPTY(&txd->agg_list),
        ("recursive aggregation on to-be-aggregated txdesc"));

    txd->flags |= HN_TXD_FLAG_ONAGG;
    STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
}

static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
    bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
    mtx_lock_spin(&txr->hn_txlist_spin);
    if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
        pending = true;
    mtx_unlock_spin(&txr->hn_txlist_spin);
#else
    if (!buf_ring_full(txr->hn_txdesc_br))
        pending = true;
#endif
    return (pending);
}

static __inline void
hn_txeof(struct hn_tx_ring *txr)
{
    txr->hn_has_txeof = 0;
    txr->hn_txeof(txr);
}
static void
hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
    struct hn_txdesc *txd = sndc->hn_cbarg;
    struct hn_tx_ring *txr;

    txr = txd->txr;
    KASSERT(txr->hn_chan == chan,
        ("channel mismatch, on chan%u, should be chan%u",
         vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));

    txr->hn_has_txeof = 1;
    hn_txdesc_put(txr, txd);

    ++txr->hn_txdone_cnt;
    if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
        txr->hn_txdone_cnt = 0;
        if (txr->hn_oactive)
            hn_txeof(txr);
    }
}

static void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
    struct lro_ctrl *lro = &rxr->hn_lro;
    struct lro_entry *queued;

    while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
        SLIST_REMOVE_HEAD(&lro->lro_active, next);
        tcp_lro_flush(lro, queued);
    }
#endif

    /*
     * NOTE:
     * 'txr' could be NULL, if multiple channels and the
     * ifnet.if_start method are enabled.
     */
    if (txr == NULL || !txr->hn_has_txeof)
        return;

    txr->hn_txdone_cnt = 0;
    hn_txeof(txr);
}
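/*
 * NDIS counts the offsets inside an RNDIS packet message from the end
 * of the fixed 8-byte header (rm_type and rm_len), i.e. from where the
 * rm_dataoffset field sits, while this driver composes them from the
 * start of the message; convert between the two conventions here.
 */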
static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

    KASSERT(ofs >= sizeof(struct rndis_packet_msg),
        ("invalid RNDIS packet msg offset %u", ofs));
    return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}

static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
{
    const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
    struct rndis_pktinfo *pi;

    KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
        ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

    /*
     * Per-packet-info does not move; it only grows.
     *
     * NOTE:
     * rm_pktinfooffset in this phase counts from the beginning
     * of rndis_packet_msg.
     */
    KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
        ("%u pktinfo overflows RNDIS packet msg", pi_type));
    pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
        pkt->rm_pktinfolen);
    pkt->rm_pktinfolen += pi_size;

    pi->rm_size = pi_size;
    pi->rm_type = pi_type;
    pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

    /* Data immediately follow per-packet-info. */
    pkt->rm_dataoffset += pi_size;

    /* Update RNDIS packet msg length */
    pkt->rm_len += pi_size;

    return (pi->rm_data);
}
static __inline int
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
{
    struct hn_txdesc *txd;
    struct mbuf *m;
    int error, pkts;

    txd = txr->hn_agg_txd;
    KASSERT(txd != NULL, ("no aggregate txdesc"));

    /*
     * Since hn_txpkt() will reset this temporary stat, save
     * it now, so that oerrors can be updated properly, if
     * hn_txpkt() ever fails.
     */
    pkts = txr->hn_stat_pkts;

    /*
     * Since txd's mbuf will _not_ be freed upon hn_txpkt()
     * failure, save it for later freeing, if hn_txpkt() ever
     * fails.
     */
    m = txd->m;
    error = hn_txpkt(ifp, txr, txd);
    if (__predict_false(error)) {
        /* txd is freed, but m is not. */
        m_freem(m);

        txr->hn_flush_failed++;
        if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
    }

    /* Reset all aggregation states. */
    txr->hn_agg_txd = NULL;
    txr->hn_agg_szleft = 0;
    txr->hn_agg_pktleft = 0;
    txr->hn_agg_prevpkt = NULL;

    return (error);
}
static void *
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    int pktsize)
{
    void *chim;

    if (txr->hn_agg_txd != NULL) {
        if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
            struct hn_txdesc *agg_txd = txr->hn_agg_txd;
            struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
            int olen;

            /*
             * Update the previous RNDIS packet's total length;
             * it can be increased due to the mandatory alignment
             * padding for this RNDIS packet.  And update the
             * aggregating txdesc's chimney sending buffer size
             * accordingly.
             *
             * XXX
             * Zero-out the padding, as required by the RNDIS
             * spec.
             */
            olen = pkt->rm_len;
            pkt->rm_len = roundup2(olen, txr->hn_agg_align);
            agg_txd->chim_size += pkt->rm_len - olen;

            /* Link this txdesc to the parent. */
            hn_txdesc_agg(agg_txd, txd);

            chim = (uint8_t *)pkt + pkt->rm_len;
            /* Save the current packet for later fixup. */
            txr->hn_agg_prevpkt = chim;

            txr->hn_agg_pktleft--;
            txr->hn_agg_szleft -= pktsize;
            if (txr->hn_agg_szleft <=
                HN_PKTSIZE_MIN(txr->hn_agg_align)) {
                /*
                 * Probably can't aggregate more packets,
                 * flush this aggregating txdesc proactively.
                 */
                txr->hn_agg_pktleft = 0;
            }
            return (chim);
        }
        hn_flush_txagg(ifp, txr);
    }
    KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

    txr->hn_tx_chimney_tried++;
    txd->chim_index = hn_chim_alloc(txr->hn_sc);
    if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
        return (NULL);
    txr->hn_tx_chimney++;

    chim = txr->hn_sc->hn_chim +
        (txd->chim_index * txr->hn_sc->hn_chim_szmax);

    if (txr->hn_agg_pktmax > 1 &&
        txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
        txr->hn_agg_txd = txd;
        txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
        txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
        txr->hn_agg_prevpkt = chim;
    }
    return (chim);
}
/*
 * NOTE:
 * If this function fails, then both txd and m_head0 will be freed.
 */
static int
hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head0)
{
    bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
    int error, nsegs, i;
    struct mbuf *m_head = *m_head0;
    struct rndis_packet_msg *pkt;
    uint32_t *pi_data;
    void *chim = NULL;
    int pkt_hlen, pkt_size;

    pkt = txd->rndis_pkt;
    pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
    if (pkt_size < txr->hn_chim_size) {
        chim = hn_try_txagg(ifp, txr, txd, pkt_size);
        if (chim != NULL)
            pkt = chim;
    } else {
        if (txr->hn_agg_txd != NULL)
            hn_flush_txagg(ifp, txr);
    }

    pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
    pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
    pkt->rm_dataoffset = sizeof(*pkt);
    pkt->rm_datalen = m_head->m_pkthdr.len;
    pkt->rm_oobdataoffset = 0;
    pkt->rm_oobdatalen = 0;
    pkt->rm_oobdataelements = 0;
    pkt->rm_pktinfooffset = sizeof(*pkt);
    pkt->rm_pktinfolen = 0;
    pkt->rm_vchandle = 0;
    pkt->rm_reserved = 0;

    if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
        /*
         * Set the hash value for this packet, so that the host could
         * dispatch the TX done event for this packet back to this TX
         * ring's channel.
         */
        pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
            HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
        *pi_data = txr->hn_tx_idx;
    }

    if (m_head->m_flags & M_VLANTAG) {
        pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
            NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
        *pi_data = NDIS_VLAN_INFO_MAKE(
            EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
            EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
            EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
    }

    if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
        pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
            NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
#ifdef INET
        if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
            *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
                m_head->m_pkthdr.tso_segsz);
        }
#endif
#if defined(INET6) && defined(INET)
        else
#endif
#ifdef INET6
        {
            *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
                m_head->m_pkthdr.tso_segsz);
        }
#endif
#endif	/* INET6 || INET */
    } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
        pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
            NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
        if (m_head->m_pkthdr.csum_flags &
            (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
            *pi_data = NDIS_TXCSUM_INFO_IPV6;
        } else {
            *pi_data = NDIS_TXCSUM_INFO_IPV4;
            if (m_head->m_pkthdr.csum_flags & CSUM_IP)
                *pi_data |= NDIS_TXCSUM_INFO_IPCS;
        }

        if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
            *pi_data |= NDIS_TXCSUM_INFO_TCPCS;
        else if (m_head->m_pkthdr.csum_flags &
            (CSUM_IP_UDP | CSUM_IP6_UDP))
            *pi_data |= NDIS_TXCSUM_INFO_UDPCS;
    }

    pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
    /* Convert RNDIS packet message offsets */
    pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
    pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
    /*
     * Fast path: Chimney sending.
     */
    if (chim != NULL) {
        struct hn_txdesc *tgt_txd = txd;

        if (txr->hn_agg_txd != NULL) {
            tgt_txd = txr->hn_agg_txd;
        }

        KASSERT(pkt == chim,
            ("RNDIS pkt not in chimney sending buffer"));
        KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
            ("chimney sending buffer is not used"));
        tgt_txd->chim_size += pkt->rm_len;

        m_copydata(m_head, 0, m_head->m_pkthdr.len,
            ((uint8_t *)chim) + pkt_hlen);

        txr->hn_gpa_cnt = 0;
        txr->hn_sendpkt = hn_txpkt_chim;
        goto done;
    }

    KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
    KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
        ("chimney buffer is used"));
    KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));

    error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
    if (__predict_false(error)) {
        int freed;

        /*
         * This mbuf is not linked w/ the txd yet, so free it now.
         */
        m_freem(m_head);
        *m_head0 = NULL;

        freed = hn_txdesc_put(txr, txd);
        KASSERT(freed != 0,
            ("fail to free txd upon txdma error"));

        txr->hn_txdma_failed++;
        if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
        return error;
    }
    *m_head0 = m_head;

    /* +1 RNDIS packet message */
    txr->hn_gpa_cnt = nsegs + 1;

    /* send packet with page buffer */
    txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
    txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
    txr->hn_gpa[0].gpa_len = pkt_hlen;

    /*
     * Fill the page buffers with mbuf info after the page
     * buffer for the RNDIS packet message.
     */
    for (i = 0; i < nsegs; ++i) {
        struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];

        gpa->gpa_page = atop(segs[i].ds_addr);
        gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
        gpa->gpa_len = segs[i].ds_len;
    }

    txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
    txd->chim_size = 0;
    txr->hn_sendpkt = hn_txpkt_sglist;
done:
    /* Set the completion routine */
    hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);

    /* Update temporary stats for later use. */
    txr->hn_stat_pkts++;
    txr->hn_stat_size += m_head->m_pkthdr.len;
    if (m_head->m_flags & M_MCAST)
        txr->hn_stat_mcasts++;

    return 0;
}
/*
 * If this function fails, then txd will be freed, but the mbuf
 * associated w/ the txd will _not_ be freed.
 */
static int
hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
    int error, send_failed = 0;

again:
    /*
     * Make sure that this txd and any aggregated txds are not freed
     * before ETHER_BPF_MTAP.
     */
    hn_txdesc_hold(txd);
    error = txr->hn_sendpkt(txr, txd);
    if (!error) {
        if (bpf_peers_present(ifp->if_bpf)) {
            const struct hn_txdesc *tmp_txd;

            ETHER_BPF_MTAP(ifp, txd->m);
            STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
                ETHER_BPF_MTAP(ifp, tmp_txd->m);
        }

        if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
#ifdef HN_IFSTART_SUPPORT
        if (!hn_use_if_start)
#endif
        {
            if_inc_counter(ifp, IFCOUNTER_OBYTES,
                txr->hn_stat_size);
            if (txr->hn_stat_mcasts != 0) {
                if_inc_counter(ifp, IFCOUNTER_OMCASTS,
                    txr->hn_stat_mcasts);
            }
        }
        txr->hn_pkts += txr->hn_stat_pkts;
    }
    hn_txdesc_put(txr, txd);

    if (__predict_false(error)) {
        int freed;

        /*
         * This should "really rarely" happen.
         *
         * XXX Too many RX to be acked or too many sideband
         * commands to run?  Ask netvsc_channel_rollup()
         * to kick start later.
         */
        txr->hn_has_txeof = 1;
        if (!send_failed) {
            txr->hn_send_failed++;
            send_failed = 1;
            /*
             * Try sending again after setting hn_has_txeof,
             * in case that we missed the last
             * netvsc_channel_rollup().
             */
            goto again;
        }
        if_printf(ifp, "send failed\n");

        /*
         * Caller will perform further processing on the
         * associated mbuf, so don't free it in hn_txdesc_put();
         * only unload it from the DMA map in hn_txdesc_put(),
         * if it was loaded.
         */
        txd->m = NULL;
        freed = hn_txdesc_put(txr, txd);
        KASSERT(freed != 0,
            ("fail to free txd upon send error"));

        txr->hn_send_failed++;
    }

    /* Reset temporary stats, after this sending is done. */
    txr->hn_stat_size = 0;
    txr->hn_stat_pkts = 0;
    txr->hn_stat_mcasts = 0;

    return (error);
}
/*
 * Append the specified data to the indicated mbuf chain.
 * Extend the mbuf chain if the new data does not fit in
 * existing space.
 *
 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
 * There should be an equivalent in the kernel mbuf code,
 * but there does not appear to be one yet.
 *
 * Differs from m_append() in that additional mbufs are
 * allocated with cluster size MJUMPAGESIZE, and filled
 * as much as possible before a new cluster is allocated.
 *
 * Return 1 if able to complete the job; otherwise 0.
 */
static int
hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
{
    struct mbuf *m, *n;
    int remainder, space;

    for (m = m0; m->m_next != NULL; m = m->m_next)
        ;
    remainder = len;
    space = M_TRAILINGSPACE(m);
    if (space > 0) {
        /*
         * Copy into available space.
         */
        if (space > remainder)
            space = remainder;
        bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
        m->m_len += space;
        cp += space;
        remainder -= space;
    }
    while (remainder > 0) {
        /*
         * Allocate a new mbuf; could check space
         * and allocate a cluster instead.
         */
        n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
        if (n == NULL)
            break;
        n->m_len = min(MJUMPAGESIZE, remainder);
        bcopy(cp, mtod(n, caddr_t), n->m_len);
        cp += n->m_len;
        remainder -= n->m_len;
        m->m_next = n;
        m = n;
    }
    if (m0->m_flags & M_PKTHDR)
        m0->m_pkthdr.len += len - remainder;

    return (remainder == 0);
}
#if defined(INET) || defined(INET6)
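/*
 * On kernels with tcp_lro_queue_mbuf() (1100095 and later), queue the
 * mbuf for sorted LRO when hw.hn.lro_mbufq_depth is non-zero; otherwise
 * feed it straight into the classic tcp_lro_rx() path.
 */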
static int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
    if (hn_lro_mbufq_depth) {
        tcp_lro_queue_mbuf(lc, m);
        return 0;
    }
#endif
    return tcp_lro_rx(lc, m, 0);
}
#endif
static int
hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
    const struct hn_rxinfo *info)
{
    struct ifnet *ifp = rxr->hn_ifp;
    struct mbuf *m_new;
    int size, do_lro = 0, do_csum = 1;
    int hash_type = M_HASHTYPE_OPAQUE;

    if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
        return (0);

    /*
     * Bail out if packet contains more data than configured MTU.
     */
    if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
        return (0);
    } else if (dlen <= MHLEN) {
        m_new = m_gethdr(M_NOWAIT, MT_DATA);
        if (m_new == NULL) {
            if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
            return (ENOBUFS);
        }
        memcpy(mtod(m_new, void *), data, dlen);
        m_new->m_pkthdr.len = m_new->m_len = dlen;
        rxr->hn_small_pkts++;
    } else {
        /*
         * Get an mbuf with a cluster.  For packets 2K or less,
         * get a standard 2K cluster.  For anything larger, get a
         * 4K cluster.  Any buffers larger than 4K can cause problems
         * if looped around to the Hyper-V TX channel, so avoid them.
         */
        size = MCLBYTES;
        if (dlen > MCLBYTES) {
            /* 4096 */
            size = MJUMPAGESIZE;
        }

        m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
        if (m_new == NULL) {
            if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
            return (ENOBUFS);
        }

        hv_m_append(m_new, dlen, data);
    }
    m_new->m_pkthdr.rcvif = ifp;

    if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
        do_csum = 0;

    /* receive side checksum offload */
    if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
        /* IP csum offload */
        if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
            m_new->m_pkthdr.csum_flags |=
                (CSUM_IP_CHECKED | CSUM_IP_VALID);
            rxr->hn_csum_ip++;
        }

        /* TCP/UDP csum offload */
        if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
            NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
            m_new->m_pkthdr.csum_flags |=
                (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
            m_new->m_pkthdr.csum_data = 0xffff;
            if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
                rxr->hn_csum_tcp++;
            else
                rxr->hn_csum_udp++;
        }

        /*
         * XXX
         * As of this writing (Oct 28th, 2016), the host side will
         * turn on only TCPCS_OK and IPCS_OK even for UDP datagrams,
         * so the do_lro setting here is actually _not_ accurate.
         * We depend on the RSS hash type check to reset do_lro.
         */
        if ((info->csum_info &
            (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
            (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
            do_lro = 1;
    } else {
2180 const struct ether_header *eh;
2185 if (m_new->m_len < hoff)
2187 eh = mtod(m_new, struct ether_header *);
2188 etype = ntohs(eh->ether_type);
2189 if (etype == ETHERTYPE_VLAN) {
2190 const struct ether_vlan_header *evl;
2192 hoff = sizeof(*evl);
2193 if (m_new->m_len < hoff)
2195 evl = mtod(m_new, struct ether_vlan_header *);
2196 etype = ntohs(evl->evl_proto);
2199 if (etype == ETHERTYPE_IP) {
2202 pr = hn_check_iplen(m_new, hoff);
2203 if (pr == IPPROTO_TCP) {
2205 (rxr->hn_trust_hcsum &
2206 HN_TRUST_HCSUM_TCP)) {
2207 rxr->hn_csum_trusted++;
2208 m_new->m_pkthdr.csum_flags |=
2209 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2210 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2211 m_new->m_pkthdr.csum_data = 0xffff;
2214 } else if (pr == IPPROTO_UDP) {
2216 (rxr->hn_trust_hcsum &
2217 HN_TRUST_HCSUM_UDP)) {
2218 rxr->hn_csum_trusted++;
2219 m_new->m_pkthdr.csum_flags |=
2220 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2221 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2222 m_new->m_pkthdr.csum_data = 0xffff;
2224 } else if (pr != IPPROTO_DONE && do_csum &&
2225 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2226 rxr->hn_csum_trusted++;
2227 m_new->m_pkthdr.csum_flags |=
2228 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2233 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2234 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2235 NDIS_VLAN_INFO_ID(info->vlan_info),
2236 NDIS_VLAN_INFO_PRI(info->vlan_info),
2237 NDIS_VLAN_INFO_CFI(info->vlan_info));
2238 m_new->m_flags |= M_VLANTAG;
2241 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2243 m_new->m_pkthdr.flowid = info->hash_value;
2244 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2245 NDIS_HASH_FUNCTION_TOEPLITZ) {
2246 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2250			 * do_lro is reset, if the hash types are not TCP
2251 * related. See the comment in the above csum_flags
2255 case NDIS_HASH_IPV4:
2256 hash_type = M_HASHTYPE_RSS_IPV4;
2260 case NDIS_HASH_TCP_IPV4:
2261 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2264 case NDIS_HASH_IPV6:
2265 hash_type = M_HASHTYPE_RSS_IPV6;
2269 case NDIS_HASH_IPV6_EX:
2270 hash_type = M_HASHTYPE_RSS_IPV6_EX;
2274 case NDIS_HASH_TCP_IPV6:
2275 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2278 case NDIS_HASH_TCP_IPV6_EX:
2279 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2284 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2286 M_HASHTYPE_SET(m_new, hash_type);
2289 * Note: Moved RX completion back to hv_nv_on_receive() so all
2290 * messages (not just data messages) will trigger a response.
2296 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2297 #if defined(INET) || defined(INET6)
2298 struct lro_ctrl *lro = &rxr->hn_lro;
2301 rxr->hn_lro_tried++;
2302 if (hn_lro_rx(lro, m_new) == 0) {
2310 /* We're not holding the lock here, so don't release it */
2311 (*ifp->if_input)(ifp, m_new);
2317 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2319 struct hn_softc *sc = ifp->if_softc;
2320 struct ifreq *ifr = (struct ifreq *)data;
2321 int mask, error = 0;
2325 if (ifr->ifr_mtu > HN_MTU_MAX) {
2332 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2337 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2338 /* Can't change MTU */
2344 if (ifp->if_mtu == ifr->ifr_mtu) {
2350 * Suspend this interface before the synthetic parts
2356		 * Detach the synthetic parts, i.e. NVS and RNDIS.
2358 hn_synth_detach(sc);
2361 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2362 * with the new MTU setting.
2364 error = hn_synth_attach(sc, ifr->ifr_mtu);
2371 * Commit the requested MTU, after the synthetic parts
2372 * have been successfully attached.
2374 ifp->if_mtu = ifr->ifr_mtu;
2377 * Make sure that various parameters based on MTU are
2378 * still valid, after the MTU change.
2380 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2381 hn_set_chim_size(sc, sc->hn_chim_szmax);
2382 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2383 #if __FreeBSD_version >= 1100099
2384 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2385 HN_LRO_LENLIM_MIN(ifp))
2386 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2390 * All done! Resume the interface now.
2400 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2405 if (ifp->if_flags & IFF_UP) {
2406 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2408				 * Caller might hold a mutex, e.g.
2409 * bpf; use busy-wait for the RNDIS
2413 hn_set_rxfilter(sc);
2419 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2422 sc->hn_if_flags = ifp->if_flags;
2429 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2431 if (mask & IFCAP_TXCSUM) {
2432 ifp->if_capenable ^= IFCAP_TXCSUM;
2433 if (ifp->if_capenable & IFCAP_TXCSUM)
2434 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2436 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2438 if (mask & IFCAP_TXCSUM_IPV6) {
2439 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2440 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2441 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2443 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2446 /* TODO: flip RNDIS offload parameters for RXCSUM. */
2447 if (mask & IFCAP_RXCSUM)
2448 ifp->if_capenable ^= IFCAP_RXCSUM;
2450 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
2451 if (mask & IFCAP_RXCSUM_IPV6)
2452 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2455 if (mask & IFCAP_LRO)
2456 ifp->if_capenable ^= IFCAP_LRO;
2458 if (mask & IFCAP_TSO4) {
2459 ifp->if_capenable ^= IFCAP_TSO4;
2460 if (ifp->if_capenable & IFCAP_TSO4)
2461 ifp->if_hwassist |= CSUM_IP_TSO;
2463 ifp->if_hwassist &= ~CSUM_IP_TSO;
2465 if (mask & IFCAP_TSO6) {
2466 ifp->if_capenable ^= IFCAP_TSO6;
2467 if (ifp->if_capenable & IFCAP_TSO6)
2468 ifp->if_hwassist |= CSUM_IP6_TSO;
2470 ifp->if_hwassist &= ~CSUM_IP6_TSO;
2480 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2484 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2486			 * Multicast handling uses a mutex; use busy-wait for
2490 hn_set_rxfilter(sc);
2499 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2503 error = ether_ioctl(ifp, cmd, data);
2510 hn_stop(struct hn_softc *sc)
2512 struct ifnet *ifp = sc->hn_ifp;
2517 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2518 ("synthetic parts were not attached"));
2520 /* Clear RUNNING bit _before_ hn_suspend_data() */
2521 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2522 hn_suspend_data(sc);
2524 /* Clear OACTIVE bit. */
2525 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2526 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2527 sc->hn_tx_ring[i].hn_oactive = 0;
2531 hn_init_locked(struct hn_softc *sc)
2533 struct ifnet *ifp = sc->hn_ifp;
2538 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2541 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2544 /* Configure RX filter */
2545 hn_set_rxfilter(sc);
2547 /* Clear OACTIVE bit. */
2548 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2549 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2550 sc->hn_tx_ring[i].hn_oactive = 0;
2552 /* Clear TX 'suspended' bit. */
2553 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2555 /* Everything is ready; unleash! */
2556 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2562 struct hn_softc *sc = xsc;
2569 #if __FreeBSD_version >= 1100099
2572 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2574 struct hn_softc *sc = arg1;
2575 unsigned int lenlim;
2578 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2579 error = sysctl_handle_int(oidp, &lenlim, 0, req);
2580 if (error || req->newptr == NULL)
2584 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2585 lenlim > TCP_LRO_LENGTH_MAX) {
2589 hn_set_lro_lenlim(sc, lenlim);
2596 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2598 struct hn_softc *sc = arg1;
2599 int ackcnt, error, i;
2602	 * lro_ackcnt_lim is the append count limit;
2603	 * add 1 to turn it into the aggregation limit.
2605 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2606 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2607 if (error || req->newptr == NULL)
2610 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2614 * Convert aggregation limit back to append
2619 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2620 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
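/*
 * Worked example (illustrative): writing 3 through this sysctl requests
 * "aggregate at most 3 ACKs".  Since lro_ackcnt_lim stores the append
 * count (one less than the aggregation limit), it ends up as 2, and
 * reading the sysctl back reports lro_ackcnt_lim + 1 = 3 again.
 */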
2628 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2630 struct hn_softc *sc = arg1;
2635 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2638 error = sysctl_handle_int(oidp, &on, 0, req);
2639 if (error || req->newptr == NULL)
2643 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2644 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2647 rxr->hn_trust_hcsum |= hcsum;
2649 rxr->hn_trust_hcsum &= ~hcsum;
2656 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2658 struct hn_softc *sc = arg1;
2659 int chim_size, error;
2661 chim_size = sc->hn_tx_ring[0].hn_chim_size;
2662 error = sysctl_handle_int(oidp, &chim_size, 0, req);
2663 if (error || req->newptr == NULL)
2666 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2670 hn_set_chim_size(sc, chim_size);
2675 #if __FreeBSD_version < 1100095
2677 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2679 struct hn_softc *sc = arg1;
2680 int ofs = arg2, i, error;
2681 struct hn_rx_ring *rxr;
2685 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2686 rxr = &sc->hn_rx_ring[i];
2687 stat += *((int *)((uint8_t *)rxr + ofs));
2690 error = sysctl_handle_64(oidp, &stat, 0, req);
2691 if (error || req->newptr == NULL)
2694 /* Zero out this stat. */
2695 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2696 rxr = &sc->hn_rx_ring[i];
2697 *((int *)((uint8_t *)rxr + ofs)) = 0;
2703 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2705 struct hn_softc *sc = arg1;
2706 int ofs = arg2, i, error;
2707 struct hn_rx_ring *rxr;
2711 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2712 rxr = &sc->hn_rx_ring[i];
2713 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2716 error = sysctl_handle_64(oidp, &stat, 0, req);
2717 if (error || req->newptr == NULL)
2720 /* Zero out this stat. */
2721 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2722 rxr = &sc->hn_rx_ring[i];
2723 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2731 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2733 struct hn_softc *sc = arg1;
2734 int ofs = arg2, i, error;
2735 struct hn_rx_ring *rxr;
2739 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2740 rxr = &sc->hn_rx_ring[i];
2741 stat += *((u_long *)((uint8_t *)rxr + ofs));
2744 error = sysctl_handle_long(oidp, &stat, 0, req);
2745 if (error || req->newptr == NULL)
2748 /* Zero out this stat. */
2749 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2750 rxr = &sc->hn_rx_ring[i];
2751 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
2757 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2759 struct hn_softc *sc = arg1;
2760 int ofs = arg2, i, error;
2761 struct hn_tx_ring *txr;
2765 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2766 txr = &sc->hn_tx_ring[i];
2767 stat += *((u_long *)((uint8_t *)txr + ofs));
2770 error = sysctl_handle_long(oidp, &stat, 0, req);
2771 if (error || req->newptr == NULL)
2774 /* Zero out this stat. */
2775 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2776 txr = &sc->hn_tx_ring[i];
2777 *((u_long *)((uint8_t *)txr + ofs)) = 0;
2783 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2785 struct hn_softc *sc = arg1;
2786 int ofs = arg2, i, error, conf;
2787 struct hn_tx_ring *txr;
2789 txr = &sc->hn_tx_ring[0];
2790 conf = *((int *)((uint8_t *)txr + ofs));
2792 error = sysctl_handle_int(oidp, &conf, 0, req);
2793 if (error || req->newptr == NULL)
2797 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2798 txr = &sc->hn_tx_ring[i];
2799 *((int *)((uint8_t *)txr + ofs)) = conf;
2807 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2809 struct hn_softc *sc = arg1;
2812 size = sc->hn_agg_size;
2813 error = sysctl_handle_int(oidp, &size, 0, req);
2814 if (error || req->newptr == NULL)
2818 sc->hn_agg_size = size;
2826 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
2828 struct hn_softc *sc = arg1;
2831 pkts = sc->hn_agg_pkts;
2832 error = sysctl_handle_int(oidp, &pkts, 0, req);
2833 if (error || req->newptr == NULL)
2837 sc->hn_agg_pkts = pkts;
2845 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
2847 struct hn_softc *sc = arg1;
2850 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
2851 return (sysctl_handle_int(oidp, &pkts, 0, req));
2855 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
2857 struct hn_softc *sc = arg1;
2860 align = sc->hn_tx_ring[0].hn_agg_align;
2861 return (sysctl_handle_int(oidp, &align, 0, req));
2865 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2867 struct hn_softc *sc = arg1;
2870 snprintf(verstr, sizeof(verstr), "%u.%u",
2871 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2872 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2873 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2877 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2879 struct hn_softc *sc = arg1;
2886 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2887 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2891 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2893 struct hn_softc *sc = arg1;
2894 char assist_str[128];
2898 hwassist = sc->hn_ifp->if_hwassist;
2900 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2901 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2905 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2907 struct hn_softc *sc = arg1;
2908 char filter_str[128];
2912 filter = sc->hn_rx_filter;
2914 snprintf(filter_str, sizeof(filter_str), "%b", filter,
2916 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
2920 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2922 struct hn_softc *sc = arg1;
2927 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2928 if (error || req->newptr == NULL)
2931 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2934 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
2936 if (sc->hn_rx_ring_inuse > 1) {
2937 error = hn_rss_reconfig(sc);
2939 /* Not RSS capable, at least for now; just save the RSS key. */
2948 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
2950 struct hn_softc *sc = arg1;
2955 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2956 if (error || req->newptr == NULL)
2960 * Don't allow RSS indirect table change, if this interface is not
2961 * RSS capable currently.
2963 if (sc->hn_rx_ring_inuse == 1) {
2968 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2971 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
2973 hn_rss_ind_fixup(sc);
2974 error = hn_rss_reconfig(sc);
2981 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
2983 struct hn_softc *sc = arg1;
2988 hash = sc->hn_rss_hash;
2990 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
2991 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
2995 hn_check_iplen(const struct mbuf *m, int hoff)
2997 const struct ip *ip;
2998 int len, iphlen, iplen;
2999 const struct tcphdr *th;
3000 int thoff; /* TCP data offset */
3002 len = hoff + sizeof(struct ip);
3004 /* The packet must be at least the size of an IP header. */
3005 if (m->m_pkthdr.len < len)
3006 return IPPROTO_DONE;
3008 /* The fixed IP header must reside completely in the first mbuf. */
3010 return IPPROTO_DONE;
3012 ip = mtodo(m, hoff);
3014 /* Bound check the packet's stated IP header length. */
3015 iphlen = ip->ip_hl << 2;
3016 if (iphlen < sizeof(struct ip)) /* minimum header length */
3017 return IPPROTO_DONE;
3019 /* The full IP header must reside completely in the one mbuf. */
3020 if (m->m_len < hoff + iphlen)
3021 return IPPROTO_DONE;
3023 iplen = ntohs(ip->ip_len);
3026	 * Check that the amount of data in the buffers is
3027	 * at least as much as the IP header would have us expect.
3029 if (m->m_pkthdr.len < hoff + iplen)
3030 return IPPROTO_DONE;
3033 * Ignore IP fragments.
3035 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3036 return IPPROTO_DONE;
3039 * The TCP/IP or UDP/IP header must be entirely contained within
3040 * the first fragment of a packet.
3044 if (iplen < iphlen + sizeof(struct tcphdr))
3045 return IPPROTO_DONE;
3046 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3047 return IPPROTO_DONE;
3048 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3049 thoff = th->th_off << 2;
3050 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3051 return IPPROTO_DONE;
3052 if (m->m_len < hoff + iphlen + thoff)
3053 return IPPROTO_DONE;
3056 if (iplen < iphlen + sizeof(struct udphdr))
3057 return IPPROTO_DONE;
3058 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3059 return IPPROTO_DONE;
3063 return IPPROTO_DONE;
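/*
 * Usage sketch (illustrative): hn_check_iplen() returns the IP protocol
 * (e.g. IPPROTO_TCP or IPPROTO_UDP) only after the full IP and transport
 * headers have been validated to reside in the first mbuf; any failure
 * yields IPPROTO_DONE.  Callers can thus dispatch on the return value
 * without re-checking header bounds:
 *
 *	pr = hn_check_iplen(m, hoff);
 *	if (pr == IPPROTO_TCP || pr == IPPROTO_UDP) {
 *		... headers are safe to inspect in the first mbuf ...
 *	} else {
 *		... IPPROTO_DONE or another protocol; don't trust bounds ...
 *	}
 */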
3070 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3072 struct sysctl_oid_list *child;
3073 struct sysctl_ctx_list *ctx;
3074 device_t dev = sc->hn_dev;
3075 #if defined(INET) || defined(INET6)
3076 #if __FreeBSD_version >= 1100095
3083 * Create RXBUF for reception.
3086 * - It is shared by all channels.
3087	 * - A large enough buffer is allocated; certain versions of NVS
3088 * may further limit the usable space.
3090 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3091 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3092 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3093 if (sc->hn_rxbuf == NULL) {
3094 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3098 sc->hn_rx_ring_cnt = ring_cnt;
3099 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3101 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3102 M_DEVBUF, M_WAITOK | M_ZERO);
3104 #if defined(INET) || defined(INET6)
3105 #if __FreeBSD_version >= 1100095
3106 lroent_cnt = hn_lro_entry_count;
3107 if (lroent_cnt < TCP_LRO_ENTRIES)
3108 lroent_cnt = TCP_LRO_ENTRIES;
3110 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3112 #endif /* INET || INET6 */
3114 ctx = device_get_sysctl_ctx(dev);
3115 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3117 /* Create dev.hn.UNIT.rx sysctl tree */
3118 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3119 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3121 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3122 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3124 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3125 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3126 &rxr->hn_br_dma, BUS_DMA_WAITOK);
3127 if (rxr->hn_br == NULL) {
3128 device_printf(dev, "allocate bufring failed\n");
3132 if (hn_trust_hosttcp)
3133 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3134 if (hn_trust_hostudp)
3135 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3136 if (hn_trust_hostip)
3137 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3138 rxr->hn_ifp = sc->hn_ifp;
3139 if (i < sc->hn_tx_ring_cnt)
3140 rxr->hn_txr = &sc->hn_tx_ring[i];
3141 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3142 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3144 rxr->hn_rxbuf = sc->hn_rxbuf;
3149 #if defined(INET) || defined(INET6)
3150 #if __FreeBSD_version >= 1100095
3151 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3152 hn_lro_mbufq_depth);
3154 tcp_lro_init(&rxr->hn_lro);
3155 rxr->hn_lro.ifp = sc->hn_ifp;
3157 #if __FreeBSD_version >= 1100099
3158 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3159 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3161 #endif /* INET || INET6 */
3163 if (sc->hn_rx_sysctl_tree != NULL) {
3167 * Create per RX ring sysctl tree:
3168 * dev.hn.UNIT.rx.RINGID
3170 snprintf(name, sizeof(name), "%d", i);
3171 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3172 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3173 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3175 if (rxr->hn_rx_sysctl_tree != NULL) {
3176 SYSCTL_ADD_ULONG(ctx,
3177 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3178 OID_AUTO, "packets", CTLFLAG_RW,
3179 &rxr->hn_pkts, "# of packets received");
3180 SYSCTL_ADD_ULONG(ctx,
3181 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3182 OID_AUTO, "rss_pkts", CTLFLAG_RW,
3184 "# of packets w/ RSS info received");
3186 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3187 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3188 &rxr->hn_pktbuf_len, 0,
3189 "Temporary channel packet buffer length");
3194 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3195 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3196 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3197 #if __FreeBSD_version < 1100095
3198 hn_rx_stat_int_sysctl,
3200 hn_rx_stat_u64_sysctl,
3202 "LU", "LRO queued");
3203 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3204 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3205 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3206 #if __FreeBSD_version < 1100095
3207 hn_rx_stat_int_sysctl,
3209 hn_rx_stat_u64_sysctl,
3211 "LU", "LRO flushed");
3212 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3213 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3214 __offsetof(struct hn_rx_ring, hn_lro_tried),
3215 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3216 #if __FreeBSD_version >= 1100099
3217 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3218 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3219 hn_lro_lenlim_sysctl, "IU",
3220 "Max # of data bytes to be aggregated by LRO");
3221 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3222 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3223 hn_lro_ackcnt_sysctl, "I",
3224 "Max # of ACKs to be aggregated by LRO");
3226 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3227 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3228 hn_trust_hcsum_sysctl, "I",
3229 "Trust tcp segement verification on host side, "
3230 "when csum info is missing");
3231 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3232 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3233 hn_trust_hcsum_sysctl, "I",
3234 "Trust udp datagram verification on host side, "
3235 "when csum info is missing");
3236 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3237 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3238 hn_trust_hcsum_sysctl, "I",
3239 "Trust ip packet verification on host side, "
3240 "when csum info is missing");
3241 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3242 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3243 __offsetof(struct hn_rx_ring, hn_csum_ip),
3244 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3245 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3246 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3247 __offsetof(struct hn_rx_ring, hn_csum_tcp),
3248 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3249 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3250 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3251 __offsetof(struct hn_rx_ring, hn_csum_udp),
3252 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3253 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3254 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3255 __offsetof(struct hn_rx_ring, hn_csum_trusted),
3256 hn_rx_stat_ulong_sysctl, "LU",
3257 "# of packets that we trust host's csum verification");
3258 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3259 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3260 __offsetof(struct hn_rx_ring, hn_small_pkts),
3261 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3262 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3263 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3264 __offsetof(struct hn_rx_ring, hn_ack_failed),
3265 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3266 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3267 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3268 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3269 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3275 hn_destroy_rx_data(struct hn_softc *sc)
3279 if (sc->hn_rxbuf != NULL) {
3280 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3281 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3283 device_printf(sc->hn_dev, "RXBUF is referenced\n");
3284 sc->hn_rxbuf = NULL;
3287 if (sc->hn_rx_ring_cnt == 0)
3290 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3291 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3293 if (rxr->hn_br == NULL)
3295 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3296 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3298 device_printf(sc->hn_dev,
3299 "%dth channel bufring is referenced", i);
3303 #if defined(INET) || defined(INET6)
3304 tcp_lro_free(&rxr->hn_lro);
3306 free(rxr->hn_pktbuf, M_DEVBUF);
3308 free(sc->hn_rx_ring, M_DEVBUF);
3309 sc->hn_rx_ring = NULL;
3311 sc->hn_rx_ring_cnt = 0;
3312 sc->hn_rx_ring_inuse = 0;
3316 hn_tx_ring_create(struct hn_softc *sc, int id)
3318 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3319 device_t dev = sc->hn_dev;
3320 bus_dma_tag_t parent_dtag;
3324 txr->hn_tx_idx = id;
3326 #ifndef HN_USE_TXDESC_BUFRING
3327 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3329 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3331 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3332 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3333 M_DEVBUF, M_WAITOK | M_ZERO);
3334 #ifndef HN_USE_TXDESC_BUFRING
3335 SLIST_INIT(&txr->hn_txlist);
3337 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3338 M_WAITOK, &txr->hn_tx_lock);
3341 txr->hn_tx_taskq = sc->hn_tx_taskq;
3343 #ifdef HN_IFSTART_SUPPORT
3344 if (hn_use_if_start) {
3345 txr->hn_txeof = hn_start_txeof;
3346 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3347 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3353 txr->hn_txeof = hn_xmit_txeof;
3354 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3355 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3357 br_depth = hn_get_txswq_depth(txr);
3358 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3359 M_WAITOK, &txr->hn_tx_lock);
3362 txr->hn_direct_tx_size = hn_direct_tx_size;
3365 * Always schedule transmission instead of trying to do direct
3366 * transmission. This one gives the best performance so far.
3368 txr->hn_sched_tx = 1;
3370 parent_dtag = bus_get_dma_tag(dev);
3372 /* DMA tag for RNDIS packet messages. */
3373 error = bus_dma_tag_create(parent_dtag, /* parent */
3374 HN_RNDIS_PKT_ALIGN, /* alignment */
3375 HN_RNDIS_PKT_BOUNDARY, /* boundary */
3376 BUS_SPACE_MAXADDR, /* lowaddr */
3377 BUS_SPACE_MAXADDR, /* highaddr */
3378 NULL, NULL, /* filter, filterarg */
3379 HN_RNDIS_PKT_LEN, /* maxsize */
3381 HN_RNDIS_PKT_LEN, /* maxsegsize */
3383 NULL, /* lockfunc */
3384 NULL, /* lockfuncarg */
3385 &txr->hn_tx_rndis_dtag);
3387 device_printf(dev, "failed to create rndis dmatag\n");
3391 /* DMA tag for data. */
3392 error = bus_dma_tag_create(parent_dtag, /* parent */
3394 HN_TX_DATA_BOUNDARY, /* boundary */
3395 BUS_SPACE_MAXADDR, /* lowaddr */
3396 BUS_SPACE_MAXADDR, /* highaddr */
3397 NULL, NULL, /* filter, filterarg */
3398 HN_TX_DATA_MAXSIZE, /* maxsize */
3399 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
3400 HN_TX_DATA_SEGSIZE, /* maxsegsize */
3402 NULL, /* lockfunc */
3403 NULL, /* lockfuncarg */
3404 &txr->hn_tx_data_dtag);
3406 device_printf(dev, "failed to create data dmatag\n");
3410 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3411 struct hn_txdesc *txd = &txr->hn_txdesc[i];
3414 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3415 STAILQ_INIT(&txd->agg_list);
3418 * Allocate and load RNDIS packet message.
3420 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3421 (void **)&txd->rndis_pkt,
3422 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3423 &txd->rndis_pkt_dmap);
3426 "failed to allocate rndis_packet_msg, %d\n", i);
3430 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3431 txd->rndis_pkt_dmap,
3432 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3433 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3437 "failed to load rndis_packet_msg, %d\n", i);
3438 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3439 txd->rndis_pkt, txd->rndis_pkt_dmap);
3443 /* DMA map for TX data. */
3444 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3448 "failed to allocate tx data dmamap\n");
3449 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3450 txd->rndis_pkt_dmap);
3451 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3452 txd->rndis_pkt, txd->rndis_pkt_dmap);
3456 /* All set, put it to list */
3457 txd->flags |= HN_TXD_FLAG_ONLIST;
3458 #ifndef HN_USE_TXDESC_BUFRING
3459 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3461 buf_ring_enqueue(txr->hn_txdesc_br, txd);
3464 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3466 if (sc->hn_tx_sysctl_tree != NULL) {
3467 struct sysctl_oid_list *child;
3468 struct sysctl_ctx_list *ctx;
3472 * Create per TX ring sysctl tree:
3473 * dev.hn.UNIT.tx.RINGID
3475 ctx = device_get_sysctl_ctx(dev);
3476 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3478 snprintf(name, sizeof(name), "%d", id);
3479 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3480 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3482 if (txr->hn_tx_sysctl_tree != NULL) {
3483 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3485 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3486 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3487 "# of available TX descs");
3488 #ifdef HN_IFSTART_SUPPORT
3489 if (!hn_use_if_start)
3492 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3493 CTLFLAG_RD, &txr->hn_oactive, 0,
3496 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3497 CTLFLAG_RW, &txr->hn_pkts,
3498 "# of packets transmitted");
3499 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3500 CTLFLAG_RW, &txr->hn_sends, "# of sends");
3508 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3510 struct hn_tx_ring *txr = txd->txr;
3512 KASSERT(txd->m == NULL, ("still has mbuf installed"));
3513 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3515 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3516 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3517 txd->rndis_pkt_dmap);
3518 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3522 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3525 KASSERT(txd->refs == 0 || txd->refs == 1,
3526 ("invalid txd refs %d", txd->refs));
3528 /* Aggregated txds will be freed by their aggregating txd. */
3529 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3532 freed = hn_txdesc_put(txr, txd);
3533 KASSERT(freed, ("can't free txdesc"));
3538 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3542 if (txr->hn_txdesc == NULL)
3547 * Because the freeing of aggregated txds will be deferred
3548 * to the aggregating txd, two passes are used here:
3549 * - The first pass GCes any pending txds. This GC is necessary,
3550 * since if the channels are revoked, hypervisor will not
3551 * deliver send-done for all pending txds.
3552	 * - The second pass frees the busdma resources, i.e. after all txds
3555 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3556 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3557 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3558 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3560 if (txr->hn_tx_data_dtag != NULL)
3561 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3562 if (txr->hn_tx_rndis_dtag != NULL)
3563 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3565 #ifdef HN_USE_TXDESC_BUFRING
3566 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3569 free(txr->hn_txdesc, M_DEVBUF);
3570 txr->hn_txdesc = NULL;
3572 if (txr->hn_mbuf_br != NULL)
3573 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3575 #ifndef HN_USE_TXDESC_BUFRING
3576 mtx_destroy(&txr->hn_txlist_spin);
3578 mtx_destroy(&txr->hn_tx_lock);
3582 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3584 struct sysctl_oid_list *child;
3585 struct sysctl_ctx_list *ctx;
3589 * Create TXBUF for chimney sending.
3591 * NOTE: It is shared by all channels.
3593 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3594 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3595 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3596 if (sc->hn_chim == NULL) {
3597 device_printf(sc->hn_dev, "allocate txbuf failed\n");
3601 sc->hn_tx_ring_cnt = ring_cnt;
3602 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3604 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3605 M_DEVBUF, M_WAITOK | M_ZERO);
3607 ctx = device_get_sysctl_ctx(sc->hn_dev);
3608 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3610 /* Create dev.hn.UNIT.tx sysctl tree */
3611 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3612 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3614 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3617 error = hn_tx_ring_create(sc, i);
3622 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3623 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3624 __offsetof(struct hn_tx_ring, hn_no_txdescs),
3625 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3626 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3627 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3628 __offsetof(struct hn_tx_ring, hn_send_failed),
3629 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
3630 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3631 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3632 __offsetof(struct hn_tx_ring, hn_txdma_failed),
3633 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
3634 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3635 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3636 __offsetof(struct hn_tx_ring, hn_flush_failed),
3637 hn_tx_stat_ulong_sysctl, "LU",
3638 "# of packet transmission aggregation flush failure");
3639 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3640 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3641 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3642 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3643 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3644 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3645 __offsetof(struct hn_tx_ring, hn_tx_chimney),
3646 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3647 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3648 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3649 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3650 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3651 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3652 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3653 "# of total TX descs");
3654 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3655 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3656 "Chimney send packet size upper boundary");
3657 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3658 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3659 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3660 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3661 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3662 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3663 hn_tx_conf_int_sysctl, "I",
3664 "Size of the packet for direct transmission");
3665 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3666 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3667 __offsetof(struct hn_tx_ring, hn_sched_tx),
3668 hn_tx_conf_int_sysctl, "I",
3669 "Always schedule transmission "
3670 "instead of doing direct transmission");
3671 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3672 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3673 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3674 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3675 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3676 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3677 "Applied packet transmission aggregation size");
3678 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3679 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3680 hn_txagg_pktmax_sysctl, "I",
3681 "Applied packet transmission aggregation packets");
3682 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3683 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3684 hn_txagg_align_sysctl, "I",
3685 "Applied packet transmission aggregation alignment");
3691 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3695 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3696 sc->hn_tx_ring[i].hn_chim_size = chim_size;
3700 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3702 struct ifnet *ifp = sc->hn_ifp;
3705 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3708 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3709 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3710 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3712 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3713 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3714 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3716 if (tso_maxlen < tso_minlen)
3717 tso_maxlen = tso_minlen;
3718 else if (tso_maxlen > IP_MAXPACKET)
3719 tso_maxlen = IP_MAXPACKET;
3720 if (tso_maxlen > sc->hn_ndis_tso_szmax)
3721 tso_maxlen = sc->hn_ndis_tso_szmax;
3722 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3724 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
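/*
 * Worked example (illustrative numbers): with hn_ndis_tso_sgmin = 2 and
 * mtu = 1500, tso_minlen = 3000.  If the host reports a hypothetical
 * hn_ndis_tso_szmax of 62780 and tso_maxlen comes in as 65535, it passes
 * the tso_minlen check, is within IP_MAXPACKET, and is then clamped to
 * 62780; if_hw_tsomax becomes 62780 - (ETHER_HDR_LEN +
 * ETHER_VLAN_ENCAP_LEN) = 62780 - 18 = 62762.
 */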
3728 hn_fixup_tx_data(struct hn_softc *sc)
3730 uint64_t csum_assist;
3733 hn_set_chim_size(sc, sc->hn_chim_szmax);
3734 if (hn_tx_chimney_size > 0 &&
3735 hn_tx_chimney_size < sc->hn_chim_szmax)
3736 hn_set_chim_size(sc, hn_tx_chimney_size);
3739 if (sc->hn_caps & HN_CAP_IPCS)
3740 csum_assist |= CSUM_IP;
3741 if (sc->hn_caps & HN_CAP_TCP4CS)
3742 csum_assist |= CSUM_IP_TCP;
3743 if (sc->hn_caps & HN_CAP_UDP4CS)
3744 csum_assist |= CSUM_IP_UDP;
3745 if (sc->hn_caps & HN_CAP_TCP6CS)
3746 csum_assist |= CSUM_IP6_TCP;
3747 if (sc->hn_caps & HN_CAP_UDP6CS)
3748 csum_assist |= CSUM_IP6_UDP;
3749 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3750 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3752 if (sc->hn_caps & HN_CAP_HASHVAL) {
3754 * Support HASHVAL pktinfo on TX path.
3757 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3758 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3759 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3764 hn_destroy_tx_data(struct hn_softc *sc)
3768 if (sc->hn_chim != NULL) {
3769 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
3770 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3772 device_printf(sc->hn_dev,
3773 "chimney sending buffer is referenced");
3778 if (sc->hn_tx_ring_cnt == 0)
3781 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3782 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3784 free(sc->hn_tx_ring, M_DEVBUF);
3785 sc->hn_tx_ring = NULL;
3787 sc->hn_tx_ring_cnt = 0;
3788 sc->hn_tx_ring_inuse = 0;
3791 #ifdef HN_IFSTART_SUPPORT
3794 hn_start_taskfunc(void *xtxr, int pending __unused)
3796 struct hn_tx_ring *txr = xtxr;
3798 mtx_lock(&txr->hn_tx_lock);
3799 hn_start_locked(txr, 0);
3800 mtx_unlock(&txr->hn_tx_lock);
3804 hn_start_locked(struct hn_tx_ring *txr, int len)
3806 struct hn_softc *sc = txr->hn_sc;
3807 struct ifnet *ifp = sc->hn_ifp;
3810 KASSERT(hn_use_if_start,
3811 ("hn_start_locked is called, when if_start is disabled"));
3812 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3813 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3814 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3816 if (__predict_false(txr->hn_suspended))
3819 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
3823 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
3824 struct hn_txdesc *txd;
3825 struct mbuf *m_head;
3828 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
3832 if (len > 0 && m_head->m_pkthdr.len > len) {
3834			 * This sending could be time-consuming; let callers
3835			 * dispatch this packet sending (and sending of any
3836			 * follow-up packets) to the tx taskqueue.
3838 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3843 #if defined(INET6) || defined(INET)
3844 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3845 m_head = hn_tso_fixup(m_head);
3846 if (__predict_false(m_head == NULL)) {
3847 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3853 txd = hn_txdesc_get(txr);
3855 txr->hn_no_txdescs++;
3856 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3857 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3861 error = hn_encap(ifp, txr, txd, &m_head);
3863 /* Both txd and m_head are freed */
3864 KASSERT(txr->hn_agg_txd == NULL,
3865 ("encap failed w/ pending aggregating txdesc"));
3869 if (txr->hn_agg_pktleft == 0) {
3870 if (txr->hn_agg_txd != NULL) {
3871 KASSERT(m_head == NULL,
3872 ("pending mbuf for aggregating txdesc"));
3873 error = hn_flush_txagg(ifp, txr);
3874 if (__predict_false(error)) {
3875 atomic_set_int(&ifp->if_drv_flags,
3880 KASSERT(m_head != NULL, ("mbuf was freed"));
3881 error = hn_txpkt(ifp, txr, txd);
3882 if (__predict_false(error)) {
3883 /* txd is freed, but m_head is not */
3884 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3885 atomic_set_int(&ifp->if_drv_flags,
3893 KASSERT(txr->hn_agg_txd != NULL,
3894 ("no aggregating txdesc"));
3895 KASSERT(m_head == NULL,
3896 ("pending mbuf for aggregating txdesc"));
3901	/* Flush pending aggregated transmission. */
3902 if (txr->hn_agg_txd != NULL)
3903 hn_flush_txagg(ifp, txr);
3908 hn_start(struct ifnet *ifp)
3910 struct hn_softc *sc = ifp->if_softc;
3911 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
3913 if (txr->hn_sched_tx)
3916 if (mtx_trylock(&txr->hn_tx_lock)) {
3919 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3920 mtx_unlock(&txr->hn_tx_lock);
3925 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
3929 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
3931 struct hn_tx_ring *txr = xtxr;
3933 mtx_lock(&txr->hn_tx_lock);
3934 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
3935 hn_start_locked(txr, 0);
3936 mtx_unlock(&txr->hn_tx_lock);
3940 hn_start_txeof(struct hn_tx_ring *txr)
3942 struct hn_softc *sc = txr->hn_sc;
3943 struct ifnet *ifp = sc->hn_ifp;
3945 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3947 if (txr->hn_sched_tx)
3950 if (mtx_trylock(&txr->hn_tx_lock)) {
3953 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3954 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3955 mtx_unlock(&txr->hn_tx_lock);
3957 taskqueue_enqueue(txr->hn_tx_taskq,
3963		 * Release the OACTIVE earlier, in the hope that
3964		 * others can catch up. The task will clear the
3965		 * flag again while holding the hn_tx_lock to avoid possible
3968 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3969 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3973 #endif /* HN_IFSTART_SUPPORT */
3976 hn_xmit(struct hn_tx_ring *txr, int len)
3978 struct hn_softc *sc = txr->hn_sc;
3979 struct ifnet *ifp = sc->hn_ifp;
3980 struct mbuf *m_head;
3983 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3984 #ifdef HN_IFSTART_SUPPORT
3985 KASSERT(hn_use_if_start == 0,
3986 ("hn_xmit is called, when if_start is enabled"));
3988 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3990 if (__predict_false(txr->hn_suspended))
3993 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
3996 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
3997 struct hn_txdesc *txd;
4000 if (len > 0 && m_head->m_pkthdr.len > len) {
4002			 * This sending could be time-consuming; let callers
4003			 * dispatch this packet sending (and sending of any
4004			 * follow-up packets) to the tx taskqueue.
4006 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4011 txd = hn_txdesc_get(txr);
4013 txr->hn_no_txdescs++;
4014 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4015 txr->hn_oactive = 1;
4019 error = hn_encap(ifp, txr, txd, &m_head);
4021 /* Both txd and m_head are freed; discard */
4022 KASSERT(txr->hn_agg_txd == NULL,
4023 ("encap failed w/ pending aggregating txdesc"));
4024 drbr_advance(ifp, txr->hn_mbuf_br);
4028 if (txr->hn_agg_pktleft == 0) {
4029 if (txr->hn_agg_txd != NULL) {
4030 KASSERT(m_head == NULL,
4031 ("pending mbuf for aggregating txdesc"));
4032 error = hn_flush_txagg(ifp, txr);
4033 if (__predict_false(error)) {
4034 txr->hn_oactive = 1;
4038 KASSERT(m_head != NULL, ("mbuf was freed"));
4039 error = hn_txpkt(ifp, txr, txd);
4040 if (__predict_false(error)) {
4041 /* txd is freed, but m_head is not */
4042 drbr_putback(ifp, txr->hn_mbuf_br,
4044 txr->hn_oactive = 1;
4051 KASSERT(txr->hn_agg_txd != NULL,
4052 ("no aggregating txdesc"));
4053 KASSERT(m_head == NULL,
4054 ("pending mbuf for aggregating txdesc"));
4059 drbr_advance(ifp, txr->hn_mbuf_br);
4062	/* Flush pending aggregated transmission. */
4063 if (txr->hn_agg_txd != NULL)
4064 hn_flush_txagg(ifp, txr);
4069 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4071 struct hn_softc *sc = ifp->if_softc;
4072 struct hn_tx_ring *txr;
4075 #if defined(INET6) || defined(INET)
4077 * Perform TSO packet header fixup now, since the TSO
4078 * packet header should be cache-hot.
4080 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4081 m = hn_tso_fixup(m);
4082 if (__predict_false(m == NULL)) {
4083 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4090 * Select the TX ring based on flowid
4092 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
4093 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4094 txr = &sc->hn_tx_ring[idx];
4096 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4098 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4102 if (txr->hn_oactive)
4105 if (txr->hn_sched_tx)
4108 if (mtx_trylock(&txr->hn_tx_lock)) {
4111 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4112 mtx_unlock(&txr->hn_tx_lock);
4117 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4122 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4126 mtx_lock(&txr->hn_tx_lock);
4127 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4129 mtx_unlock(&txr->hn_tx_lock);
4133 hn_xmit_qflush(struct ifnet *ifp)
4135 struct hn_softc *sc = ifp->if_softc;
4138 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4139 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4144 hn_xmit_txeof(struct hn_tx_ring *txr)
4147 if (txr->hn_sched_tx)
4150 if (mtx_trylock(&txr->hn_tx_lock)) {
4153 txr->hn_oactive = 0;
4154 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4155 mtx_unlock(&txr->hn_tx_lock);
4157 taskqueue_enqueue(txr->hn_tx_taskq,
4163		 * Release the oactive earlier, in the hope that
4164		 * others can catch up. The task will clear the
4165		 * oactive again while holding the hn_tx_lock to avoid possible
4168 txr->hn_oactive = 0;
4169 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4174 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4176 struct hn_tx_ring *txr = xtxr;
4178 mtx_lock(&txr->hn_tx_lock);
4180 mtx_unlock(&txr->hn_tx_lock);
4184 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4186 struct hn_tx_ring *txr = xtxr;
4188 mtx_lock(&txr->hn_tx_lock);
4189 txr->hn_oactive = 0;
4191 mtx_unlock(&txr->hn_tx_lock);
4195 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4197 struct vmbus_chan_br cbr;
4198 struct hn_rx_ring *rxr;
4199 struct hn_tx_ring *txr = NULL;
4202 idx = vmbus_chan_subidx(chan);
4205 * Link this channel to RX/TX ring.
4207 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4208 ("invalid channel index %d, should > 0 && < %d",
4209 idx, sc->hn_rx_ring_inuse));
4210 rxr = &sc->hn_rx_ring[idx];
4211 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4212 ("RX ring %d already attached", idx));
4213 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4216 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4217 idx, vmbus_chan_id(chan));
4220 if (idx < sc->hn_tx_ring_inuse) {
4221 txr = &sc->hn_tx_ring[idx];
4222 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4223 ("TX ring %d already attached", idx));
4224 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4226 txr->hn_chan = chan;
4228 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4229 idx, vmbus_chan_id(chan));
4233 /* Bind this channel to a proper CPU. */
4234 vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus);
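	/*
	 * Example (illustrative): with sc->hn_cpu = 2 and mp_ncpus = 4,
	 * channels idx = 0, 1, 2, 3 are bound to CPUs 2, 3, 0, 1; the
	 * modulo simply wraps the round-robin CPU assignment around the
	 * available CPUs.
	 */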
4239 cbr.cbr = rxr->hn_br;
4240 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4241 cbr.cbr_txsz = HN_TXBR_SIZE;
4242 cbr.cbr_rxsz = HN_RXBR_SIZE;
4243 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4245 if (error == EISCONN) {
4246 if_printf(sc->hn_ifp, "bufring is connected after "
4247 "chan%u open failure\n", vmbus_chan_id(chan));
4248 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4250 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4251 vmbus_chan_id(chan), error);
4258 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4260 struct hn_rx_ring *rxr;
4263 idx = vmbus_chan_subidx(chan);
4266 * Link this channel to RX/TX ring.
4268 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4269 ("invalid channel index %d, should > 0 && < %d",
4270 idx, sc->hn_rx_ring_inuse));
4271 rxr = &sc->hn_rx_ring[idx];
4272 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4273 ("RX ring %d is not attached", idx));
4274 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4276 if (idx < sc->hn_tx_ring_inuse) {
4277 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4279 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4280 ("TX ring %d is not attached attached", idx));
4281 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4285 * Close this channel.
4288 * Channel closing does _not_ destroy the target channel.
4290 error = vmbus_chan_close_direct(chan);
4291 if (error == EISCONN) {
4292 if_printf(sc->hn_ifp, "chan%u bufring is connected "
4293 "after being closed\n", vmbus_chan_id(chan));
4294 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4296 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4297 vmbus_chan_id(chan), error);
4302 hn_attach_subchans(struct hn_softc *sc)
4304 struct vmbus_channel **subchans;
4305 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4308 KASSERT(subchan_cnt > 0, ("no sub-channels"));
4310 /* Attach the sub-channels. */
4311 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4312 for (i = 0; i < subchan_cnt; ++i) {
4315 error1 = hn_chan_attach(sc, subchans[i]);
4318 /* Move on; all channels will be detached later. */
4321 vmbus_subchan_rel(subchans, subchan_cnt);
4324 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4327 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4335 hn_detach_allchans(struct hn_softc *sc)
4337 struct vmbus_channel **subchans;
4338 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4341 if (subchan_cnt == 0)
4344 /* Detach the sub-channels. */
4345 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4346 for (i = 0; i < subchan_cnt; ++i)
4347 hn_chan_detach(sc, subchans[i]);
4348 vmbus_subchan_rel(subchans, subchan_cnt);
4352 * Detach the primary channel, _after_ all sub-channels
4355 hn_chan_detach(sc, sc->hn_prichan);
4357 /* Wait for sub-channels to be destroyed, if any. */
4358 vmbus_subchan_drain(sc->hn_prichan);
4361 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4362 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4363 HN_RX_FLAG_ATTACHED) == 0,
4364 ("%dth RX ring is still attached", i));
4366 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4367 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4368 HN_TX_FLAG_ATTACHED) == 0,
4369 ("%dth TX ring is still attached", i));
4375 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4377 struct vmbus_channel **subchans;
4378 int nchan, rxr_cnt, error;
4380 nchan = *nsubch + 1;
4383 * Multiple RX/TX rings are not requested.
4390 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4393 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4395 /* No RSS; this is benign. */
4400 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4404 if (nchan > rxr_cnt)
4407 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4413 * Allocate sub-channels from NVS.
4415 *nsubch = nchan - 1;
4416 error = hn_nvs_alloc_subchans(sc, nsubch);
4417 if (error || *nsubch == 0) {
4418 /* Failed to allocate sub-channels. */
4424 * Wait for all sub-channels to become ready before moving on.
4426 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4427 vmbus_subchan_rel(subchans, *nsubch);
4432 hn_synth_attachable(const struct hn_softc *sc)
4436 if (sc->hn_flags & HN_FLAG_ERRORS)
4439 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4440 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4442 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
4449 hn_synth_attach(struct hn_softc *sc, int mtu)
4451 #define ATTACHED_NVS 0x0002
4452 #define ATTACHED_RNDIS 0x0004
4454 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4455 int error, nsubch, nchan, i;
4456 uint32_t old_caps, attached = 0;
4458 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4459 ("synthetic parts were attached"));
4461 if (!hn_synth_attachable(sc))
4464 /* Save capabilities for later verification. */
4465 old_caps = sc->hn_caps;
4468 /* Clear RSS stuffs. */
4469 sc->hn_rss_ind_size = 0;
4470 sc->hn_rss_hash = 0;
4473 * Attach the primary channel _before_ attaching NVS and RNDIS.
4475 error = hn_chan_attach(sc, sc->hn_prichan);
4482 error = hn_nvs_attach(sc, mtu);
4485 attached |= ATTACHED_NVS;
4488 * Attach RNDIS _after_ NVS is attached.
4490 error = hn_rndis_attach(sc, mtu);
4493 attached |= ATTACHED_RNDIS;
4496 * Make sure capabilities are not changed.
4498 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4499 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4500 old_caps, sc->hn_caps);
4506 * Allocate sub-channels for multi-TX/RX rings.
4509 * The # of RX rings that can be used is equivalent to the # of
4510 * channels to be requested.
4512 nsubch = sc->hn_rx_ring_cnt - 1;
4513 error = hn_synth_alloc_subchans(sc, &nsubch);
4516 /* NOTE: _Full_ synthetic parts detach is required now. */
4517 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4520 * Set the # of TX/RX rings that could be used according to
4521 * the # of channels that NVS offered.
4524 hn_set_ring_inuse(sc, nchan);
4526 /* Only the primary channel can be used; done */
4531 * Attach the sub-channels.
4533 * NOTE: hn_set_ring_inuse() _must_ have been called.
4535 error = hn_attach_subchans(sc);
4540 * Configure RSS key and indirect table _after_ all sub-channels
4543 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4545 * RSS key is not set yet; set it to the default RSS key.
4548 if_printf(sc->hn_ifp, "setup default RSS key\n");
4549 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4550 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4553 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4555 * RSS indirect table is not set yet; set it up in round-
4559 if_printf(sc->hn_ifp, "setup default RSS indirect "
4562 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
4563 rss->rss_ind[i] = i % nchan;
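		/*
		 * Example (illustrative): with nchan = 4 the table becomes
		 * 0, 1, 2, 3, 0, 1, 2, 3, ... for all NDIS_HASH_INDCNT
		 * entries, spreading hash values evenly across the usable
		 * channels.
		 */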
4564 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4567		 * # of usable channels may have changed, so we have to
4568 * make sure that all entries in RSS indirect table
4571 * NOTE: hn_set_ring_inuse() _must_ have been called.
4573 hn_rss_ind_fixup(sc);
4576 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4581 * Fixup transmission aggregation setup.
4587 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
4588 hn_synth_detach(sc);
4590 if (attached & ATTACHED_RNDIS)
4591 hn_rndis_detach(sc);
4592 if (attached & ATTACHED_NVS)
4594 hn_chan_detach(sc, sc->hn_prichan);
4595 /* Restore old capabilities. */
4596 sc->hn_caps = old_caps;
4600 #undef ATTACHED_RNDIS
4606 * The interface must have been suspended through hn_suspend(), before
4607 * this function gets called.
4610 hn_synth_detach(struct hn_softc *sc)
4613 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4614 ("synthetic parts were not attached"));
4616 /* Detach the RNDIS first. */
4617 hn_rndis_detach(sc);
4622 /* Detach all of the channels. */
4623 hn_detach_allchans(sc);
4625 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4629 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4631 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4632 ("invalid ring count %d", ring_cnt));
4634 if (sc->hn_tx_ring_cnt > ring_cnt)
4635 sc->hn_tx_ring_inuse = ring_cnt;
4637 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4638 sc->hn_rx_ring_inuse = ring_cnt;
4641 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4642 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
4647 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
4652 * The TX bufring will not be drained by the hypervisor,
4653 * if the primary channel is revoked.
4655 while (!vmbus_chan_rx_empty(chan) ||
4656 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
4657 !vmbus_chan_tx_empty(chan)))
4659 vmbus_chan_intr_drain(chan);
4663 hn_suspend_data(struct hn_softc *sc)
4665 struct vmbus_channel **subch = NULL;
4666 struct hn_tx_ring *txr;
4674 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4675 txr = &sc->hn_tx_ring[i];
4677 mtx_lock(&txr->hn_tx_lock);
4678 txr->hn_suspended = 1;
4679 mtx_unlock(&txr->hn_tx_lock);
4680		/* No one is able to send more packets now. */
4683 * Wait for all pending sends to finish.
4686 * We will _not_ receive all pending send-done, if the
4687 * primary channel is revoked.
4689 while (hn_tx_ring_pending(txr) &&
4690 !vmbus_chan_is_revoked(sc->hn_prichan))
4691 pause("hnwtx", 1 /* 1 tick */);
4695 * Disable RX by clearing RX filter.
4697 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
4698 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);
4701 * Give RNDIS enough time to flush all pending data packets.
4703 pause("waitrx", (200 * hz) / 1000);
4706 * Drain RX/TX bufrings and interrupts.
4708 nsubch = sc->hn_rx_ring_inuse - 1;
4710 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4712 if (subch != NULL) {
4713 for (i = 0; i < nsubch; ++i)
4714 hn_chan_drain(sc, subch[i]);
4716 hn_chan_drain(sc, sc->hn_prichan);
4719 vmbus_subchan_rel(subch, nsubch);
4722 * Drain any pending TX tasks.
4725 * The above hn_chan_drain() can dispatch TX tasks, so the TX
4726 * tasks will have to be drained _after_ the above hn_chan_drain()
4729 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4730 txr = &sc->hn_tx_ring[i];
4732 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
4733 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
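/*
 * Suspension order, as implemented above: mark the TX rings suspended,
 * wait out pending sends, clear the RX filter, drain the RX/TX
 * bufrings and channel interrupts, and only then drain the TX tasks,
 * since hn_chan_drain() may itself dispatch new TX tasks.
 */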
static void
hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
{

	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
}
static void
hn_suspend_mgmt(struct hn_softc *sc)
{
	struct task task;

	/*
	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
	 * through hn_mgmt_taskq.
	 */
	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
	vmbus_chan_run_task(sc->hn_prichan, &task);

	/*
	 * Make sure that all pending management tasks are completed.
	 */
	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
	taskqueue_drain_all(sc->hn_mgmt_taskq0);
}
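/*
 * Running hn_suspend_mgmt_taskfunc() as a task on the primary
 * channel appears intended to serialize the clearing of
 * hn_mgmt_taskq against the channel callback, which consults
 * hn_mgmt_taskq before scheduling management work; afterwards no new
 * management task can be queued, so the drains above are race-free.
 */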
static void
hn_suspend(struct hn_softc *sc)
{

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
		hn_suspend_data(sc);
	hn_suspend_mgmt(sc);
}
static void
hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
{
	int i;

	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
	    ("invalid TX ring count %d", tx_ring_cnt));

	for (i = 0; i < tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 0;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
static void
hn_resume_data(struct hn_softc *sc)
{
	int i;

	/*
	 * Re-enable RX.
	 */
	hn_set_rxfilter(sc);

	/*
	 * Make sure to clear suspend status on "all" TX rings,
	 * since hn_tx_ring_inuse can be changed after
	 * hn_suspend_data().
	 */
	hn_resume_tx(sc, sc->hn_tx_ring_cnt);

#ifdef HN_IFSTART_SUPPORT
	if (!hn_use_if_start)
#endif
	{
		/*
		 * Flush unused drbrs, since hn_tx_ring_inuse may be
		 * changed after hn_suspend_data().
		 */
		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
	}

	/*
	 * Kick start TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		/*
		 * Use txeof task, so that any pending oactive can be
		 * cleared properly.
		 */
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}
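/*
 * NOTE:
 * The txeof task (rather than the transmit task) is enqueued above so
 * that a ring left with its oactive state set while suspended first
 * clears that state, and only then resumes transmission.
 */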
static void
hn_resume_mgmt(struct hn_softc *sc)
{

	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;

	/*
	 * Kick off network change detection, if it was pending.
	 * If no network change was pending, start link status
	 * checks, which are more lightweight than network change
	 * detection.
	 */
	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		hn_change_network(sc);
	else
		hn_update_link_status(sc);
}
static void
hn_resume(struct hn_softc *sc)
{

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
		hn_resume_data(sc);
	hn_resume_mgmt(sc);
}
static void
hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
{
	const struct rndis_status_msg *msg;
	int ofs;

	if (dlen < sizeof(*msg)) {
		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
		return;
	}
	msg = data;

	switch (msg->rm_status) {
	case RNDIS_STATUS_MEDIA_CONNECT:
	case RNDIS_STATUS_MEDIA_DISCONNECT:
		hn_update_link_status(sc);
		break;

	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
		/* Not really useful; ignore. */
		break;

	case RNDIS_STATUS_NETWORK_CHANGE:
		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
		if (dlen < ofs + msg->rm_stbuflen ||
		    msg->rm_stbuflen < sizeof(uint32_t)) {
			if_printf(sc->hn_ifp, "network changed\n");
		} else {
			uint32_t change;

			memcpy(&change, ((const uint8_t *)msg) + ofs,
			    sizeof(change));
			if_printf(sc->hn_ifp, "network changed, change %u\n",
			    change);
		}
		hn_change_network(sc);
		break;

	default:
		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
		    msg->rm_status);
		break;
	}
}
static int
hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
{
	const struct rndis_pktinfo *pi = info_data;
	uint32_t mask = 0;

	while (info_dlen != 0) {
		const void *data;
		uint32_t dlen;

		if (__predict_false(info_dlen < sizeof(*pi)))
			return (EINVAL);
		if (__predict_false(info_dlen < pi->rm_size))
			return (EINVAL);
		info_dlen -= pi->rm_size;

		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
			return (EINVAL);
		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
			return (EINVAL);
		dlen = pi->rm_size - pi->rm_pktinfooffset;
		data = pi->rm_data;

		switch (pi->rm_type) {
		case NDIS_PKTINFO_TYPE_VLAN:
			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
				return (EINVAL);
			info->vlan_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_VLAN;
			break;

		case NDIS_PKTINFO_TYPE_CSUM:
			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
				return (EINVAL);
			info->csum_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_CSUM;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
				return (EINVAL);
			info->hash_value = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHVAL;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHINF:
			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
				return (EINVAL);
			info->hash_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHINF;
			break;

		default:
			goto next;
		}

		if (mask == HN_RXINFO_ALL) {
			/* All found; done */
			break;
		}
next:
		pi = (const struct rndis_pktinfo *)
		    ((const uint8_t *)pi + pi->rm_size);
	}

	/*
	 * Final fixup.
	 * - If there is no hash value, invalidate the hash info.
	 */
	if ((mask & HN_RXINFO_HASHVAL) == 0)
		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
	return (0);
}
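/*
 * Layout sketch (informational, reconstructed from the checks in
 * hn_rndis_rxinfo() above): each per-packet-info element is
 *
 *	+---------+---------+------------------+-----------+
 *	| rm_size | rm_type | rm_pktinfooffset | rm_data[] |
 *	+---------+---------+------------------+-----------+
 *
 * where rm_size covers the whole element, must not violate the
 * RNDIS_PKTINFO_SIZE_ALIGNMASK alignment, and the payload of
 * rm_size - rm_pktinfooffset bytes starts at rm_data.
 */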
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off < check_off) {
		if (__predict_true(off + len <= check_off))
			return (false);
	} else if (off > check_off) {
		if (__predict_true(check_off + check_len <= off))
			return (false);
	}
	return (true);
}
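/*
 * Example (hypothetical values): off 0/len 10 vs. check_off 10/
 * check_len 4 do not overlap (0 + 10 <= 10); off 0/len 12 vs.
 * check_off 10 do; and equal offsets always report an overlap.
 */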
static void
hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_packet_msg *pkt;
	struct hn_rxinfo info;
	int data_off, pktinfo_off, data_len, pktinfo_len;

	/*
	 * Check length.
	 */
	if (__predict_false(dlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
		return;
	}
	pkt = data;

	if (__predict_false(dlen < pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
		return;
	}
	if (__predict_false(pkt->rm_len <
	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
		    "msglen %u, data %u, oob %u, pktinfo %u\n",
		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
		    pkt->rm_pktinfolen);
		return;
	}
	if (__predict_false(pkt->rm_datalen == 0)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
		return;
	}

	/*
	 * Check offsets.
	 */
#define IS_OFFSET_INVALID(ofs)			\
	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

	/* XXX Hyper-V does not meet data offset alignment requirement */
	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data offset %u\n", pkt->rm_dataoffset);
		return;
	}
	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "oob offset %u\n", pkt->rm_oobdataoffset);
		return;
	}
	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
		return;
	}

#undef IS_OFFSET_INVALID

	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
	data_len = pkt->rm_datalen;
	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
	pktinfo_len = pkt->rm_pktinfolen;

	/*
	 * Check OOB coverage.
	 */
	if (__predict_false(pkt->rm_oobdatalen != 0)) {
		int oob_off, oob_len;

		if_printf(rxr->hn_ifp, "got oobdata\n");
		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
		oob_len = pkt->rm_oobdatalen;

		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overflow, msglen %u, oob abs %d len %d\n",
			    pkt->rm_len, oob_off, oob_len);
			return;
		}

		/*
		 * Check against data.
		 */
		if (hn_rndis_check_overlap(oob_off, oob_len,
		    data_off, data_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps data, oob abs %d len %d, "
			    "data abs %d len %d\n",
			    oob_off, oob_len, data_off, data_len);
			return;
		}

		/*
		 * Check against pktinfo.
		 */
		if (pktinfo_len != 0 &&
		    hn_rndis_check_overlap(oob_off, oob_len,
		    pktinfo_off, pktinfo_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps pktinfo, oob abs %d len %d, "
			    "pktinfo abs %d len %d\n",
			    oob_off, oob_len, pktinfo_off, pktinfo_len);
			return;
		}
	}

	/*
	 * Check per-packet-info coverage and find useful per-packet-info.
	 */
	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
	if (__predict_true(pktinfo_len != 0)) {
		bool overlap;
		int error;

		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overflow, msglen %u, "
			    "pktinfo abs %d len %d\n",
			    pkt->rm_len, pktinfo_off, pktinfo_len);
			return;
		}

		/*
		 * Check packet info coverage.
		 */
		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
		    data_off, data_len);
		if (__predict_false(overlap)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overlaps data, pktinfo abs %d len %d, "
			    "data abs %d len %d\n",
			    pktinfo_off, pktinfo_len, data_off, data_len);
			return;
		}

		/*
		 * Find useful per-packet-info.
		 */
		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
		    pktinfo_len, &info);
		if (__predict_false(error)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
			    "pktinfo\n");
			return;
		}
	}

	if (__predict_false(data_off + data_len > pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data overflow, msglen %u, data abs %d len %d\n",
		    pkt->rm_len, data_off, data_len);
		return;
	}

	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
}
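/*
 * NOTE (per the RNDIS message format):
 * rm_dataoffset, rm_oobdataoffset and rm_pktinfooffset are all
 * relative to the rm_dataoffset field, not to the start of the
 * message; RNDIS_PACKET_MSG_OFFSET_ABS() converts them into offsets
 * from the message start before they are range-checked and used above.
 */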
static __inline void
hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_msghdr *hdr;

	if (__predict_false(dlen < sizeof(*hdr))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
		return;
	}
	hdr = data;

	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
		/* Hot data path. */
		hn_rndis_rx_data(rxr, data, dlen);
		/* Done! */
		return;
	}

	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
	else
		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
}
static void
hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
{
	const struct hn_nvs_hdr *hdr;

	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
		if_printf(sc->hn_ifp, "invalid nvs notify\n");
		return;
	}
	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);

	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
		/* Useless; ignore */
		return;
	}
	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
}
static void
hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt)
{
	struct hn_nvs_sendctx *sndc;

	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
	    VMBUS_CHANPKT_DATALEN(pkt));
	/*
	 * NOTE:
	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
	 * its callback.
	 */
}
static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

	/* Make sure that this is a RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}

	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}

	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}
		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
	}

	/*
	 * Ack the consumed RXBUF associated w/ this channel packet,
	 * so that this RXBUF can be recycled by the hypervisor.
	 */
	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}
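/*
 * Example (hypothetical values): a channel packet with
 * cp_rxbuf_cnt == 2 carries two {rb_ofs, rb_len} ranges into the
 * shared RX buffer, each of which is handed to hn_rndis_rxpkt()
 * above as one complete RNDIS message, i.e. one Ethernet frame.
 */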
static void
hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    uint64_t tid)
{
	struct hn_nvs_rndis_ack ack;
	int retries, error;

	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
	ack.nvs_status = HN_NVS_STATUS_OK;

	retries = 0;
again:
	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
	if (__predict_false(error == EAGAIN)) {
		/*
		 * NOTE:
		 * This should _not_ happen in real world, since the
		 * consumption of the TX bufring from the TX path is
		 * controlled.
		 */
		if (rxr->hn_ack_failed == 0)
			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
		rxr->hn_ack_failed++;
		retries++;
		if (retries < 10) {
			DELAY(100);
			goto again;
		}
		/* RXBUF leaks! */
		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
	}
}
static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
	struct hn_rx_ring *rxr = xrxr;
	struct hn_softc *sc = rxr->hn_ifp->if_softc;

	for (;;) {
		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
		int error, pktlen;

		pktlen = rxr->hn_pktbuf_len;
		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
		if (__predict_false(error == ENOBUFS)) {
			void *nbuf;
			int nlen;

			/*
			 * Expand channel packet buffer.
			 *
			 * XXX
			 * Use M_WAITOK here, since allocation failure
			 * is not an option on this path.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);

			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;

		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}
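/*
 * Example (hypothetical values): if hn_pktbuf_len is 512 and a
 * 1500-byte channel packet arrives, vmbus_chan_recv_pkt() fails with
 * ENOBUFS and reports the required length in pktlen; the doubling
 * loop in hn_chan_callback() then grows the buffer 512 -> 1024 ->
 * 2048 before retrying the receive.
 */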
static void
hn_tx_taskq_create(void *arg __unused)
{

	if (vm_guest != VM_GUEST_HV)
		return;

	if (!hn_share_tx_taskq)
		return;

	hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
	    taskqueue_thread_enqueue, &hn_tx_taskq);
	taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
	if (hn_bind_tx_taskq >= 0) {
		int cpu = hn_bind_tx_taskq;
		struct task cpuset_task;
		cpuset_t cpu_set;

		if (cpu > mp_ncpus - 1)
			cpu = mp_ncpus - 1;
		CPU_SETOF(cpu, &cpu_set);
		TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, &cpu_set);
		taskqueue_enqueue(hn_tx_taskq, &cpuset_task);
		taskqueue_drain(hn_tx_taskq, &cpuset_task);
	}
}
SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_create, NULL);
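/*
 * Example (hypothetical values): with the hn_bind_tx_taskq tunable set
 * to 16 on an 8-CPU guest, the clamp above pins the shared TX
 * taskqueue thread to CPU 7 (mp_ncpus - 1) via
 * hn_cpuset_setthread_task.
 */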
static void
hn_tx_taskq_destroy(void *arg __unused)
{

	if (hn_tx_taskq != NULL)
		taskqueue_free(hn_tx_taskq);
}
SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_destroy, NULL);