/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"

#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512
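
/*
 * HN_RNDIS_PKT_LEN is the worst-case size of the RNDIS message built for
 * one packet: the RNDIS packet message header plus one per-packet-info
 * each for the hash value, the VLAN tag, LSOv2 and TX checksum offload.
 * Each txdesc's preallocated RNDIS buffer is sized for this upper bound.
 */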
#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)					\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)						\
do {								\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)		\
		DELAY(1000);					\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
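
/*
 * Channels are assigned to CPUs round-robin, starting from this device's
 * leader CPU (sc->hn_cpu): the idx'th ring lands on the next CPU modulo
 * mp_ncpus.
 */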

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0

static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);

static void			hn_stop(struct hn_softc *);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static int			hn_set_rxfilter(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on the host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust TCP segment verification on the host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on the host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust UDP datagram verification on the host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on the host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust IP packet verification on the host side, "
    "when csum info is missing (global setting)");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque; /* shared TX taskqueues */

static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
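
/*
 * Allocate a chimney sending buffer slot, lock-free: scan each bitmap
 * word for a clear bit with ffsl() and claim it with
 * atomic_testandset_long(), retrying on a lost race.  Returns
 * HN_NVS_CHIM_IDX_INVALID when all slots are in use.
 */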
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}
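
/*
 * Release a chimney sending buffer slot; the bit is cleared atomically,
 * so no lock is needed on the free path either.
 */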
static void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}

#if defined(INET6) || defined(INET)
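/*
 * TSO fixup: the host expects the same pre-conditioning that a LSOv2
 * capable NIC would get, i.e. a zeroed IP length/checksum (IPv4) or
 * payload length (IPv6), and a TCP checksum seeded with only the
 * pseudo header sum.
 */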
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
#undef PULLUP_HDR
}
#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (ifp->if_flags & IFF_PROMISC) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}
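
/*
 * The effective aggregation limits are the minimum of the user tunables
 * (hw.hn.tx_agg_size/tx_agg_pkts), what RNDIS offered at attach time,
 * and the chimney sending buffer size; aggregation is disabled outright
 * if the result cannot hold at least two minimum-sized packets.
 */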
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}
	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}

static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}

static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}

static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
	    0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};
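
/*
 * NOTE: the first three GUID fields are stored little-endian, which is
 * why the byte array above begins with 0x63, 0x51, 0x61, 0xF8 for
 * F8615163.
 */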

static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
	    &g_net_vsc_device_type) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}

static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if error happened later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fix up TX settings after the synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);

	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");

	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */
#if __FreeBSD_version >= 1100099
	ifp->if_baudrate = IF_Gbps(10);
#else
	/* if_baudrate is 32bits on 32bit system. */
	ifp->if_baudrate = IF_Gbps(1);
#endif
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;

	/* We can't distinguish IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;

	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default; they can still
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	return (0);

failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}

static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp;

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		/*
		 * In case that the vmbus missed the orphan handler
		 * installation.
		 */
		vmbus_xact_ctx_orphan(sc->hn_xact);
	}

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * tasks have to be suspended manually here.
			 */
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);
		}
		HN_UNLOCK(sc);
		ether_ifdetach(ifp);
	}

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(sc->hn_tx_taskqs[i]);
		free(sc->hn_tx_taskqs, M_DEVBUF);
	}
	taskqueue_free(sc->hn_mgmt_taskq0);

	if (sc->hn_xact != NULL) {
		/*
		 * Uninstall the orphan handler _before_ the xact is
		 * destroyed.
		 */
		vmbus_chan_unset_orphan(sc->hn_prichan);
		vmbus_xact_ctx_destroy(sc->hn_xact);
	}

	if_free(ifp);

	HN_LOCK_DESTROY(sc);
	return (0);
}

static int
hn_shutdown(device_t dev)
{

	return (0);
}

static void
hn_link_status(struct hn_softc *sc)
{
	uint32_t link_status;
	int error;

	error = hn_rndis_get_linkstatus(sc, &link_status);
	if (error) {
		/* XXX what to do? */
		return;
	}

	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
	else
		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp,
	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
	    LINK_STATE_UP : LINK_STATE_DOWN);
}

static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		return;
	hn_link_status(sc);
}

static void
hn_netchg_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Prevent any link status checks from running. */
	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;

	/*
	 * Fake up a [link down --> link up] state change; 5 seconds
	 * delay is used, which closely simulates miibus reaction
	 * upon link down event.
	 */
	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
	    &sc->hn_netchg_status, 5 * hz);
}

static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Re-allow link status checks. */
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
	hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}

static int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m = *m_head;
	int error;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		struct mbuf *m_new;

		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		if (m_new == NULL)
			return ENOBUFS;
		else
			*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	}
	if (!error) {
		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
		    BUS_DMASYNC_PREWRITE);
		txd->flags |= HN_TXD_FLAG_DMAMAP;
	}
	return error;
}
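
/*
 * Drop a reference on the txdesc; the last put recycles it onto the
 * free list (or buf_ring), after first recursively freeing any txdescs
 * that were aggregated onto it and releasing its chimney slot or DMA
 * map.  Returns non-zero iff the txdesc was actually freed.
 */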
static int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return 0;

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("recursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
	txr->hn_txdesc_avail++;
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else	/* HN_USE_TXDESC_BUFRING */
#ifdef HN_DEBUG
	atomic_add_int(&txr->hn_txdesc_avail, 1);
#endif
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif	/* !HN_USE_TXDESC_BUFRING */

	return 1;
}

static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	txd = SLIST_FIRST(&txr->hn_txlist);
	if (txd != NULL) {
		KASSERT(txr->hn_txdesc_avail > 0,
		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
		txr->hn_txdesc_avail--;
		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
	}
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

	if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
#ifdef HN_DEBUG
		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
#endif	/* HN_USE_TXDESC_BUFRING */
		KASSERT(txd->m == NULL && txd->refs == 0 &&
		    STAILQ_EMPTY(&txd->agg_list) &&
		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
		    txd->chim_size == 0 &&
		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
		txd->flags &= ~HN_TXD_FLAG_ONLIST;
		txd->refs = 1;
	}
	return txd;
}

static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

	/* 0->1 transition will never work */
	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	atomic_add_int(&txd->refs, 1);
}

static __inline void
hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
{

	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("recursive aggregation on aggregating txdesc"));

	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("already aggregated"));
	KASSERT(STAILQ_EMPTY(&txd->agg_list),
	    ("recursive aggregation on to-be-aggregated txdesc"));

	txd->flags |= HN_TXD_FLAG_ONAGG;
	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
}

static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
	bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
		pending = true;
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	if (!buf_ring_full(txr->hn_txdesc_br))
		pending = true;
#endif
	return (pending);
}

static __inline void
hn_txeof(struct hn_tx_ring *txr)
{
	txr->hn_has_txeof = 0;
	txr->hn_txeof(txr);
}

static void
hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
	struct hn_txdesc *txd = sndc->hn_cbarg;
	struct hn_tx_ring *txr;

	txr = txd->txr;
	KASSERT(txr->hn_chan == chan,
	    ("channel mismatch, on chan%u, should be chan%u",
	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));

	txr->hn_has_txeof = 1;
	hn_txdesc_put(txr, txd);

	++txr->hn_txdone_cnt;
	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
		txr->hn_txdone_cnt = 0;
		if (txr->hn_oactive)
			hn_txeof(txr);
	}
}

static void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
	struct lro_ctrl *lro = &rxr->hn_lro;
	struct lro_entry *queued;

	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
		SLIST_REMOVE_HEAD(&lro->lro_active, next);
		tcp_lro_flush(lro, queued);
	}
#endif

	/*
	 * NOTE:
	 * 'txr' could be NULL, if multiple channels and
	 * ifnet.if_start method are enabled.
	 */
	if (txr == NULL || !txr->hn_has_txeof)
		return;

	txr->hn_txdone_cnt = 0;
	hn_txeof(txr);
}
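
/*
 * RNDIS packet message offsets are relative to the field that follows
 * rm_type/rm_len (i.e. rm_dataoffset), not to the start of the message;
 * this helper converts the message-start-relative offsets the code
 * builds with into that on-wire convention.
 */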
static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
	    ("invalid RNDIS packet msg offset %u", ofs));
	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}

static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
{
	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
	struct rndis_pktinfo *pi;

	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

	/*
	 * Per-packet-info does not move; it only grows.
	 *
	 * NOTE:
	 * rm_pktinfooffset in this phase counts from the beginning
	 * of rndis_packet_msg.
	 */
	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
	    pkt->rm_pktinfolen);
	pkt->rm_pktinfolen += pi_size;

	pi->rm_size = pi_size;
	pi->rm_type = pi_type;
	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

	/* Data immediately follow per-packet-info. */
	pkt->rm_dataoffset += pi_size;

	/* Update RNDIS packet msg length */
	pkt->rm_len += pi_size;

	return (pi->rm_data);
}

static __inline int
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;
	struct mbuf *m;
	int error, pkts;

	txd = txr->hn_agg_txd;
	KASSERT(txd != NULL, ("no aggregate txdesc"));

	/*
	 * Since hn_txpkt() will reset this temporary stat, save
	 * it now, so that oerrors can be updated properly, if
	 * hn_txpkt() ever fails.
	 */
	pkts = txr->hn_stat_pkts;

	/*
	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
	 * failure, save it for later freeing, if hn_txpkt() ever
	 * fails.
	 */
	m = txd->m;
	error = hn_txpkt(ifp, txr, txd);
	if (__predict_false(error)) {
		/* txd is freed, but m is not. */
		m_freem(m);

		txr->hn_flush_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
	}

	/* Reset all aggregation states. */
	txr->hn_agg_txd = NULL;
	txr->hn_agg_szleft = 0;
	txr->hn_agg_pktleft = 0;
	txr->hn_agg_prevpkt = NULL;

	return (error);
}
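
/*
 * Try to place this packet into the chimney sending buffer, aggregating
 * it onto the currently open txdesc when possible.  Returns a pointer
 * into the chimney buffer at which the caller should build the RNDIS
 * message, or NULL if the packet must go out through the GPA/sglist
 * path instead.
 */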
static void *
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    int pktsize)
{
	void *chim;

	if (txr->hn_agg_txd != NULL) {
		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
			int olen;

			/*
			 * Update the previous RNDIS packet's total length,
			 * it can be increased due to the mandatory alignment
			 * padding for this RNDIS packet.  And update the
			 * aggregating txdesc's chimney sending buffer size
			 * accordingly.
			 *
			 * NOTE:
			 * Zero-out the padding, as required by the RNDIS spec.
			 */
			olen = pkt->rm_len;
			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
			agg_txd->chim_size += pkt->rm_len - olen;

			/* Link this txdesc to the parent. */
			hn_txdesc_agg(agg_txd, txd);

			chim = (uint8_t *)pkt + pkt->rm_len;
			/* Save the current packet for later fixup. */
			txr->hn_agg_prevpkt = chim;

			txr->hn_agg_pktleft--;
			txr->hn_agg_szleft -= pktsize;
			if (txr->hn_agg_szleft <=
			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
				/*
				 * Probably can't aggregate more packets,
				 * flush this aggregating txdesc proactively.
				 */
				txr->hn_agg_pktleft = 0;
			}
			/* Done! */
			return (chim);
		}
		hn_flush_txagg(ifp, txr);
	}
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	txr->hn_tx_chimney_tried++;
	txd->chim_index = hn_chim_alloc(txr->hn_sc);
	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
		return (NULL);
	txr->hn_tx_chimney++;

	chim = txr->hn_sc->hn_chim +
	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);

	if (txr->hn_agg_pktmax > 1 &&
	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
		txr->hn_agg_txd = txd;
		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
		txr->hn_agg_prevpkt = chim;
	}

	return (chim);
}
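
/*
 * Two transmission paths diverge below in hn_encap(): if the packet fits
 * into a chimney buffer, the whole RNDIS message plus payload is copied
 * into host-visible memory and sent by buffer index; otherwise the
 * payload mbuf is DMA-loaded and described to the host as a
 * scatter-gather list, with gpa[0] covering the RNDIS header built in
 * the txdesc.
 */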
/*
 * NOTE:
 * If this function fails, then both txd and m_head0 will be freed.
 */
static int
hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head0)
{
	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
	int error, nsegs, i;
	struct mbuf *m_head = *m_head0;
	struct rndis_packet_msg *pkt;
	uint32_t *pi_data;
	void *chim = NULL;
	int pkt_hlen, pkt_size;

	pkt = txd->rndis_pkt;
	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
	if (pkt_size < txr->hn_chim_size) {
		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
		if (chim != NULL)
			pkt = chim;
	} else {
		if (txr->hn_agg_txd != NULL)
			hn_flush_txagg(ifp, txr);
	}

	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
	pkt->rm_dataoffset = sizeof(*pkt);
	pkt->rm_datalen = m_head->m_pkthdr.len;
	pkt->rm_oobdataoffset = 0;
	pkt->rm_oobdatalen = 0;
	pkt->rm_oobdataelements = 0;
	pkt->rm_pktinfooffset = sizeof(*pkt);
	pkt->rm_pktinfolen = 0;
	pkt->rm_vchandle = 0;
	pkt->rm_reserved = 0;

	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
		/*
		 * Set the hash value for this packet, so that the host could
		 * dispatch the TX done event for this packet back to this TX
		 * ring's channel.
		 */
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
		*pi_data = txr->hn_tx_idx;
	}

	if (m_head->m_flags & M_VLANTAG) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
		*pi_data = NDIS_VLAN_INFO_MAKE(
		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
	}

	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
#ifdef INET
		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET6
		{
			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#endif	/* INET6 || INET */
	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
		if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
			*pi_data = NDIS_TXCSUM_INFO_IPV6;
		} else {
			*pi_data = NDIS_TXCSUM_INFO_IPV4;
			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
		}

		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
		else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP6_UDP))
			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
	}

	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
	/* Convert RNDIS packet message offsets */
	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);

	/*
	 * Fast path: Chimney sending.
	 */
	if (chim != NULL) {
		struct hn_txdesc *tgt_txd = txd;

		if (txr->hn_agg_txd != NULL) {
			tgt_txd = txr->hn_agg_txd;
		}

		KASSERT(pkt == chim,
		    ("RNDIS pkt not in chimney sending buffer"));
		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
		    ("chimney sending buffer is not used"));
		tgt_txd->chim_size += pkt->rm_len;

		m_copydata(m_head, 0, m_head->m_pkthdr.len,
		    ((uint8_t *)chim) + pkt_hlen);

		txr->hn_gpa_cnt = 0;
		txr->hn_sendpkt = hn_txpkt_chim;
		goto done;
	}

	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
	    ("chimney buffer is used"));
	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));

	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
	if (__predict_false(error)) {
		int freed;

		/*
		 * This mbuf is not linked w/ the txd yet, so free it now.
		 */
		m_freem(m_head);
		*m_head0 = NULL;

		freed = hn_txdesc_put(txr, txd);
		KASSERT(freed != 0,
		    ("fail to free txd upon txdma error"));

		txr->hn_txdma_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
		return error;
	}
	*m_head0 = m_head;

	/* +1 RNDIS packet message */
	txr->hn_gpa_cnt = nsegs + 1;

	/* send packet with page buffer */
	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
	txr->hn_gpa[0].gpa_len = pkt_hlen;

	/*
	 * Fill the page buffers with mbuf info after the page
	 * buffer for RNDIS packet message.
	 */
	for (i = 0; i < nsegs; ++i) {
		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];

		gpa->gpa_page = atop(segs[i].ds_addr);
		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
		gpa->gpa_len = segs[i].ds_len;
	}

	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
	txd->chim_size = 0;
	txr->hn_sendpkt = hn_txpkt_sglist;

done:
	/* Set the completion routine */
	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);

	/* Update temporary stats for later use. */
	txr->hn_stat_pkts++;
	txr->hn_stat_size += m_head->m_pkthdr.len;
	if (m_head->m_flags & M_MCAST)
		txr->hn_stat_mcasts++;

	return 0;
}

/*
 * NOTE:
 * If this function fails, then txd will be freed, but the mbuf
 * associated w/ the txd will _not_ be freed.
 */
static int
hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	int error, send_failed = 0, has_bpf;

again:
	has_bpf = bpf_peers_present(ifp->if_bpf);
	if (has_bpf) {
		/*
		 * Make sure that this txd and any aggregated txds are not
		 * freed before ETHER_BPF_MTAP.
		 */
		hn_txdesc_hold(txd);
	}
	error = txr->hn_sendpkt(txr, txd);
	if (!error) {
		if (has_bpf) {
			const struct hn_txdesc *tmp_txd;

			ETHER_BPF_MTAP(ifp, txd->m);
			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
				ETHER_BPF_MTAP(ifp, tmp_txd->m);
		}

		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
#ifdef HN_IFSTART_SUPPORT
		if (!hn_use_if_start)
#endif
		{
			if_inc_counter(ifp, IFCOUNTER_OBYTES,
			    txr->hn_stat_size);
			if (txr->hn_stat_mcasts != 0) {
				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
				    txr->hn_stat_mcasts);
			}
		}
		txr->hn_pkts += txr->hn_stat_pkts;
	}
	if (has_bpf)
		hn_txdesc_put(txr, txd);

	if (__predict_false(error)) {
		int freed;

		/*
		 * This should "really rarely" happen.
		 *
		 * XXX Too many RX to be acked or too many sideband
		 * commands to run?  Ask netvsc_channel_rollup()
		 * to kick start later.
		 */
		txr->hn_has_txeof = 1;
		if (!send_failed) {
			txr->hn_send_failed++;
			send_failed = 1;
			/*
			 * Try sending again after setting hn_has_txeof;
			 * in case that we missed the last
			 * netvsc_channel_rollup().
			 */
			goto again;
		}
		if_printf(ifp, "send failed\n");

		/*
		 * Caller will perform further processing on the
		 * associated mbuf, so don't free it in hn_txdesc_put();
		 * only unload it from the DMA map in hn_txdesc_put(),
		 * if it was loaded.
		 */
		txd->m = NULL;
		freed = hn_txdesc_put(txr, txd);
		KASSERT(freed != 0,
		    ("fail to free txd upon send error"));

		txr->hn_send_failed++;
	}

	/* Reset temporary stats, after this sending is done. */
	txr->hn_stat_size = 0;
	txr->hn_stat_pkts = 0;
	txr->hn_stat_mcasts = 0;

	return (error);
}

/*
 * Append the specified data to the indicated mbuf chain.
 * Extend the mbuf chain if the new data does not fit in
 * existing space.
 *
 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
 * There should be an equivalent in the kernel mbuf code,
 * but there does not appear to be one yet.
 *
 * Differs from m_append() in that additional mbufs are
 * allocated with cluster size MJUMPAGESIZE, and filled
 * accordingly.
 *
 * Return 1 if able to complete the job; otherwise 0.
 */
static int
hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
{
	struct mbuf *m, *n;
	int remainder, space;

	for (m = m0; m->m_next != NULL; m = m->m_next)
		;
	remainder = len;
	space = M_TRAILINGSPACE(m);
	if (space > 0) {
		/*
		 * Copy into available space.
		 */
		if (space > remainder)
			space = remainder;
		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
		m->m_len += space;
		cp += space;
		remainder -= space;
	}
	while (remainder > 0) {
		/*
		 * Allocate a new mbuf; could check space
		 * and allocate a cluster instead.
		 */
		n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
		if (n == NULL)
			break;
		n->m_len = min(MJUMPAGESIZE, remainder);
		bcopy(cp, mtod(n, caddr_t), n->m_len);
		cp += n->m_len;
		remainder -= n->m_len;
		m->m_next = n;
		m = n;
	}
	if (m0->m_flags & M_PKTHDR)
		m0->m_pkthdr.len += len - remainder;

	return (remainder == 0);
}

#if defined(INET) || defined(INET6)
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
	if (hn_lro_mbufq_depth) {
		tcp_lro_queue_mbuf(lc, m);
		return 0;
	}
#endif
	return tcp_lro_rx(lc, m, 0);
}
#endif
2097 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2098 const struct hn_rxinfo *info)
2100 struct ifnet *ifp = rxr->hn_ifp;
2102 int size, do_lro = 0, do_csum = 1;
2103 int hash_type = M_HASHTYPE_OPAQUE;
2105 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
2109 * Bail out if packet contains more data than configured MTU.
2111 if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
2113 } else if (dlen <= MHLEN) {
2114 m_new = m_gethdr(M_NOWAIT, MT_DATA);
2115 if (m_new == NULL) {
2116 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2119 memcpy(mtod(m_new, void *), data, dlen);
2120 m_new->m_pkthdr.len = m_new->m_len = dlen;
2121 rxr->hn_small_pkts++;
2124 * Get an mbuf with a cluster. For packets 2K or less,
2125 * get a standard 2K cluster. For anything larger, get a
2126 * 4K cluster. Any buffers larger than 4K can cause problems
2127 * if looped around to the Hyper-V TX channel, so avoid them.
2130 if (dlen > MCLBYTES) {
2132 size = MJUMPAGESIZE;
2135 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2136 if (m_new == NULL) {
2137 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2141 hv_m_append(m_new, dlen, data);
2143 m_new->m_pkthdr.rcvif = ifp;
2145 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2148 /* receive side checksum offload */
2149 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2150 /* IP csum offload */
2151 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2152 m_new->m_pkthdr.csum_flags |=
2153 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2157 /* TCP/UDP csum offload */
2158 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2159 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2160 m_new->m_pkthdr.csum_flags |=
2161 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2162 m_new->m_pkthdr.csum_data = 0xffff;
2163 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2171 * As of this writing (Oct 28th, 2016), the host side will turn
2172 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2173 * the do_lro setting here is actually _not_ accurate. We
2174 * depend on the RSS hash type check to reset do_lro.
2176 if ((info->csum_info &
2177 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2178 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2181 const struct ether_header *eh;
2186 if (m_new->m_len < hoff)
2188 eh = mtod(m_new, struct ether_header *);
2189 etype = ntohs(eh->ether_type);
2190 if (etype == ETHERTYPE_VLAN) {
2191 const struct ether_vlan_header *evl;
2193 hoff = sizeof(*evl);
2194 if (m_new->m_len < hoff)
2196 evl = mtod(m_new, struct ether_vlan_header *);
2197 etype = ntohs(evl->evl_proto);
2200 if (etype == ETHERTYPE_IP) {
2203 pr = hn_check_iplen(m_new, hoff);
2204 if (pr == IPPROTO_TCP) {
2206 (rxr->hn_trust_hcsum &
2207 HN_TRUST_HCSUM_TCP)) {
2208 rxr->hn_csum_trusted++;
2209 m_new->m_pkthdr.csum_flags |=
2210 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2211 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2212 m_new->m_pkthdr.csum_data = 0xffff;
2215 } else if (pr == IPPROTO_UDP) {
2217 (rxr->hn_trust_hcsum &
2218 HN_TRUST_HCSUM_UDP)) {
2219 rxr->hn_csum_trusted++;
2220 m_new->m_pkthdr.csum_flags |=
2221 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2222 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2223 m_new->m_pkthdr.csum_data = 0xffff;
2225 } else if (pr != IPPROTO_DONE && do_csum &&
2226 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2227 rxr->hn_csum_trusted++;
2228 m_new->m_pkthdr.csum_flags |=
2229 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2234 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2235 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2236 NDIS_VLAN_INFO_ID(info->vlan_info),
2237 NDIS_VLAN_INFO_PRI(info->vlan_info),
2238 NDIS_VLAN_INFO_CFI(info->vlan_info));
2239 m_new->m_flags |= M_VLANTAG;
2242 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2244 m_new->m_pkthdr.flowid = info->hash_value;
2245 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2246 NDIS_HASH_FUNCTION_TOEPLITZ) {
2247 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2251 * do_lro is reset if the hash types are not TCP
2252 * related. See the comment in the above csum_flags
2256 case NDIS_HASH_IPV4:
2257 hash_type = M_HASHTYPE_RSS_IPV4;
2261 case NDIS_HASH_TCP_IPV4:
2262 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2265 case NDIS_HASH_IPV6:
2266 hash_type = M_HASHTYPE_RSS_IPV6;
2270 case NDIS_HASH_IPV6_EX:
2271 hash_type = M_HASHTYPE_RSS_IPV6_EX;
2275 case NDIS_HASH_TCP_IPV6:
2276 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2279 case NDIS_HASH_TCP_IPV6_EX:
2280 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2285 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2287 M_HASHTYPE_SET(m_new, hash_type);
2290 * Note: Moved RX completion back to hv_nv_on_receive() so all
2291 * messages (not just data messages) will trigger a response.
2297 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2298 #if defined(INET) || defined(INET6)
2299 struct lro_ctrl *lro = &rxr->hn_lro;
2302 rxr->hn_lro_tried++;
2303 if (hn_lro_rx(lro, m_new) == 0) {
2311 /* We're not holding the lock here, so don't release it */
2312 (*ifp->if_input)(ifp, m_new);
2318 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2320 struct hn_softc *sc = ifp->if_softc;
2321 struct ifreq *ifr = (struct ifreq *)data;
2322 int mask, error = 0;
2326 if (ifr->ifr_mtu > HN_MTU_MAX) {
2333 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2338 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2339 /* Can't change MTU */
2345 if (ifp->if_mtu == ifr->ifr_mtu) {
2351 * Suspend this interface before the synthetic parts
2352 * are detached.
2357 * Detach the synthetic parts, i.e. NVS and RNDIS.
2359 hn_synth_detach(sc);
2362 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2363 * with the new MTU setting.
2365 error = hn_synth_attach(sc, ifr->ifr_mtu);
2372 * Commit the requested MTU, after the synthetic parts
2373 * have been successfully attached.
2375 ifp->if_mtu = ifr->ifr_mtu;
2378 * Make sure that various parameters based on MTU are
2379 * still valid, after the MTU change.
2381 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2382 hn_set_chim_size(sc, sc->hn_chim_szmax);
2383 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2384 #if __FreeBSD_version >= 1100099
2385 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2386 HN_LRO_LENLIM_MIN(ifp))
2387 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2391 * All done! Resume the interface now.
2401 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2406 if (ifp->if_flags & IFF_UP) {
2407 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2409 * Caller might hold a mutex, e.g.
2410 * bpf; use busy-wait for the RNDIS
2411 * filter update.
2414 hn_set_rxfilter(sc);
2420 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2423 sc->hn_if_flags = ifp->if_flags;
2430 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2432 if (mask & IFCAP_TXCSUM) {
2433 ifp->if_capenable ^= IFCAP_TXCSUM;
2434 if (ifp->if_capenable & IFCAP_TXCSUM)
2435 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2437 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2439 if (mask & IFCAP_TXCSUM_IPV6) {
2440 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2441 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2442 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2444 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2447 /* TODO: flip RNDIS offload parameters for RXCSUM. */
2448 if (mask & IFCAP_RXCSUM)
2449 ifp->if_capenable ^= IFCAP_RXCSUM;
2451 /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2452 if (mask & IFCAP_RXCSUM_IPV6)
2453 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2456 if (mask & IFCAP_LRO)
2457 ifp->if_capenable ^= IFCAP_LRO;
2459 if (mask & IFCAP_TSO4) {
2460 ifp->if_capenable ^= IFCAP_TSO4;
2461 if (ifp->if_capenable & IFCAP_TSO4)
2462 ifp->if_hwassist |= CSUM_IP_TSO;
2464 ifp->if_hwassist &= ~CSUM_IP_TSO;
2466 if (mask & IFCAP_TSO6) {
2467 ifp->if_capenable ^= IFCAP_TSO6;
2468 if (ifp->if_capenable & IFCAP_TSO6)
2469 ifp->if_hwassist |= CSUM_IP6_TSO;
2471 ifp->if_hwassist &= ~CSUM_IP6_TSO;
2481 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2485 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2487 * Multicast uses mutex; use busy-wait for
2488 * the RNDIS filter update.
2491 hn_set_rxfilter(sc);
2500 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2504 error = ether_ioctl(ifp, cmd, data);
2511 hn_stop(struct hn_softc *sc)
2513 struct ifnet *ifp = sc->hn_ifp;
2518 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2519 ("synthetic parts were not attached"));
2521 /* Clear RUNNING bit _before_ hn_suspend_data() */
2522 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2523 hn_suspend_data(sc);
2525 /* Clear OACTIVE bit. */
2526 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2527 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2528 sc->hn_tx_ring[i].hn_oactive = 0;
2532 hn_init_locked(struct hn_softc *sc)
2534 struct ifnet *ifp = sc->hn_ifp;
2539 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2542 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2545 /* Configure RX filter */
2546 hn_set_rxfilter(sc);
2548 /* Clear OACTIVE bit. */
2549 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2550 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2551 sc->hn_tx_ring[i].hn_oactive = 0;
2553 /* Clear TX 'suspended' bit. */
2554 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2556 /* Everything is ready; unleash! */
2557 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2563 struct hn_softc *sc = xsc;
2570 #if __FreeBSD_version >= 1100099
2573 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2575 struct hn_softc *sc = arg1;
2576 unsigned int lenlim;
2579 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2580 error = sysctl_handle_int(oidp, &lenlim, 0, req);
2581 if (error || req->newptr == NULL)
2585 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2586 lenlim > TCP_LRO_LENGTH_MAX) {
2590 hn_set_lro_lenlim(sc, lenlim);
2597 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2599 struct hn_softc *sc = arg1;
2600 int ackcnt, error, i;
2603 * lro_ackcnt_lim is the append count limit;
2604 * +1 turns it into the aggregation limit.
2606 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2607 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2608 if (error || req->newptr == NULL)
2611 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2615 * Convert aggregation limit back to append
2616 * count limit.
2618 --ackcnt;
2620 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2621 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
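/*
 * Worked example of the +1/-1 conversion above: writing 5 through
 * this sysctl requests "aggregate at most 5 ACKs", which is stored
 * as lro_ackcnt_lim = 4, i.e. up to 4 more ACKs may be appended to
 * the one currently held by LRO.
 */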
2629 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2631 struct hn_softc *sc = arg1;
2636 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2639 error = sysctl_handle_int(oidp, &on, 0, req);
2640 if (error || req->newptr == NULL)
2644 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2645 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2648 rxr->hn_trust_hcsum |= hcsum;
2650 rxr->hn_trust_hcsum &= ~hcsum;
2657 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2659 struct hn_softc *sc = arg1;
2660 int chim_size, error;
2662 chim_size = sc->hn_tx_ring[0].hn_chim_size;
2663 error = sysctl_handle_int(oidp, &chim_size, 0, req);
2664 if (error || req->newptr == NULL)
2667 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2671 hn_set_chim_size(sc, chim_size);
2676 #if __FreeBSD_version < 1100095
2678 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2680 struct hn_softc *sc = arg1;
2681 int ofs = arg2, i, error;
2682 struct hn_rx_ring *rxr;
2686 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2687 rxr = &sc->hn_rx_ring[i];
2688 stat += *((int *)((uint8_t *)rxr + ofs));
2691 error = sysctl_handle_64(oidp, &stat, 0, req);
2692 if (error || req->newptr == NULL)
2695 /* Zero out this stat. */
2696 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2697 rxr = &sc->hn_rx_ring[i];
2698 *((int *)((uint8_t *)rxr + ofs)) = 0;
2704 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2706 struct hn_softc *sc = arg1;
2707 int ofs = arg2, i, error;
2708 struct hn_rx_ring *rxr;
2712 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2713 rxr = &sc->hn_rx_ring[i];
2714 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2717 error = sysctl_handle_64(oidp, &stat, 0, req);
2718 if (error || req->newptr == NULL)
2721 /* Zero out this stat. */
2722 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2723 rxr = &sc->hn_rx_ring[i];
2724 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2732 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2734 struct hn_softc *sc = arg1;
2735 int ofs = arg2, i, error;
2736 struct hn_rx_ring *rxr;
2740 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2741 rxr = &sc->hn_rx_ring[i];
2742 stat += *((u_long *)((uint8_t *)rxr + ofs));
2745 error = sysctl_handle_long(oidp, &stat, 0, req);
2746 if (error || req->newptr == NULL)
2749 /* Zero out this stat. */
2750 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2751 rxr = &sc->hn_rx_ring[i];
2752 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
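/*
 * Sketch of how these offset-based handlers are wired up; this
 * mirrors the real SYSCTL_ADD_PROC() calls further below, e.g. for
 * the per-ring hn_csum_ip counter:
 *
 *	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
 *	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 *	    __offsetof(struct hn_rx_ring, hn_csum_ip),
 *	    hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
 *
 * arg1 carries the softc and arg2 the field offset, so one handler
 * serves every per-ring u_long statistic: a read sums the counter
 * across rings, and any write zeroes it on all rings.
 */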
2758 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2760 struct hn_softc *sc = arg1;
2761 int ofs = arg2, i, error;
2762 struct hn_tx_ring *txr;
2766 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2767 txr = &sc->hn_tx_ring[i];
2768 stat += *((u_long *)((uint8_t *)txr + ofs));
2771 error = sysctl_handle_long(oidp, &stat, 0, req);
2772 if (error || req->newptr == NULL)
2775 /* Zero out this stat. */
2776 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2777 txr = &sc->hn_tx_ring[i];
2778 *((u_long *)((uint8_t *)txr + ofs)) = 0;
2784 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2786 struct hn_softc *sc = arg1;
2787 int ofs = arg2, i, error, conf;
2788 struct hn_tx_ring *txr;
2790 txr = &sc->hn_tx_ring[0];
2791 conf = *((int *)((uint8_t *)txr + ofs));
2793 error = sysctl_handle_int(oidp, &conf, 0, req);
2794 if (error || req->newptr == NULL)
2798 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2799 txr = &sc->hn_tx_ring[i];
2800 *((int *)((uint8_t *)txr + ofs)) = conf;
2808 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2810 struct hn_softc *sc = arg1;
2813 size = sc->hn_agg_size;
2814 error = sysctl_handle_int(oidp, &size, 0, req);
2815 if (error || req->newptr == NULL)
2819 sc->hn_agg_size = size;
2827 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
2829 struct hn_softc *sc = arg1;
2832 pkts = sc->hn_agg_pkts;
2833 error = sysctl_handle_int(oidp, &pkts, 0, req);
2834 if (error || req->newptr == NULL)
2838 sc->hn_agg_pkts = pkts;
2846 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
2848 struct hn_softc *sc = arg1;
2851 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
2852 return (sysctl_handle_int(oidp, &pkts, 0, req));
2856 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
2858 struct hn_softc *sc = arg1;
2861 align = sc->hn_tx_ring[0].hn_agg_align;
2862 return (sysctl_handle_int(oidp, &align, 0, req));
2866 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2868 struct hn_softc *sc = arg1;
2871 snprintf(verstr, sizeof(verstr), "%u.%u",
2872 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2873 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2874 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2878 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2880 struct hn_softc *sc = arg1;
2887 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2888 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
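/*
 * The "%b" kernel format renders a bit mask against a description
 * string; e.g. a caps value with the two lowest bits set would print
 * roughly as "3<VLAN,MTU>" (a sketch; the actual bit names are
 * whatever HN_CAP_BITS defines).
 */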
2892 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2894 struct hn_softc *sc = arg1;
2895 char assist_str[128];
2899 hwassist = sc->hn_ifp->if_hwassist;
2901 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2902 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2906 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2908 struct hn_softc *sc = arg1;
2909 char filter_str[128];
2913 filter = sc->hn_rx_filter;
2915 snprintf(filter_str, sizeof(filter_str), "%b", filter,
2917 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
2921 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2923 struct hn_softc *sc = arg1;
2928 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2929 if (error || req->newptr == NULL)
2932 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2935 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
2937 if (sc->hn_rx_ring_inuse > 1) {
2938 error = hn_rss_reconfig(sc);
2940 /* Not RSS capable, at least for now; just save the RSS key. */
2949 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
2951 struct hn_softc *sc = arg1;
2956 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2957 if (error || req->newptr == NULL)
2961 * Don't allow RSS indirect table changes if this interface is
2962 * not currently RSS capable.
2964 if (sc->hn_rx_ring_inuse == 1) {
2969 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2972 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
2974 hn_rss_ind_fixup(sc);
2975 error = hn_rss_reconfig(sc);
2982 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
2984 struct hn_softc *sc = arg1;
2989 hash = sc->hn_rss_hash;
2991 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
2992 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
2996 hn_check_iplen(const struct mbuf *m, int hoff)
2998 const struct ip *ip;
2999 int len, iphlen, iplen;
3000 const struct tcphdr *th;
3001 int thoff; /* TCP data offset */
3003 len = hoff + sizeof(struct ip);
3005 /* The packet must be at least the size of an IP header. */
3006 if (m->m_pkthdr.len < len)
3007 return IPPROTO_DONE;
3009 /* The fixed IP header must reside completely in the first mbuf. */
3011 return IPPROTO_DONE;
3013 ip = mtodo(m, hoff);
3015 /* Bound check the packet's stated IP header length. */
3016 iphlen = ip->ip_hl << 2;
3017 if (iphlen < sizeof(struct ip)) /* minimum header length */
3018 return IPPROTO_DONE;
3020 /* The full IP header must reside completely in the one mbuf. */
3021 if (m->m_len < hoff + iphlen)
3022 return IPPROTO_DONE;
3024 iplen = ntohs(ip->ip_len);
3027 * Check that the amount of data in the buffers is at
3028 * least as much as the IP header would have us expect.
3030 if (m->m_pkthdr.len < hoff + iplen)
3031 return IPPROTO_DONE;
3034 * Ignore IP fragments.
3036 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3037 return IPPROTO_DONE;
3040 * The TCP/IP or UDP/IP header must be entirely contained within
3041 * the first fragment of a packet.
3045 if (iplen < iphlen + sizeof(struct tcphdr))
3046 return IPPROTO_DONE;
3047 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3048 return IPPROTO_DONE;
3049 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3050 thoff = th->th_off << 2;
3051 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3052 return IPPROTO_DONE;
3053 if (m->m_len < hoff + iphlen + thoff)
3054 return IPPROTO_DONE;
3057 if (iplen < iphlen + sizeof(struct udphdr))
3058 return IPPROTO_DONE;
3059 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3060 return IPPROTO_DONE;
3064 return IPPROTO_DONE;
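/*
 * Worked example of the checks above, assuming hoff = ETHER_HDR_LEN
 * (14) and minimal headers: a TCP/IPv4 packet needs at least
 * 14 + 20 (iphlen) + 20 (thoff) = 54 bytes resident in the first
 * mbuf before IPPROTO_TCP is returned; any shorter packet, IP
 * fragment, or bogus header length yields IPPROTO_DONE and is never
 * considered for LRO or host-checksum trust.
 */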
3071 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3073 struct sysctl_oid_list *child;
3074 struct sysctl_ctx_list *ctx;
3075 device_t dev = sc->hn_dev;
3076 #if defined(INET) || defined(INET6)
3077 #if __FreeBSD_version >= 1100095
3084 * Create RXBUF for reception.
3087 * - It is shared by all channels.
3088 * - A large enough buffer is allocated; certain versions of NVS
3089 * may further limit the usable space.
3091 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3092 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3093 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3094 if (sc->hn_rxbuf == NULL) {
3095 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3099 sc->hn_rx_ring_cnt = ring_cnt;
3100 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3102 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3103 M_DEVBUF, M_WAITOK | M_ZERO);
3105 #if defined(INET) || defined(INET6)
3106 #if __FreeBSD_version >= 1100095
3107 lroent_cnt = hn_lro_entry_count;
3108 if (lroent_cnt < TCP_LRO_ENTRIES)
3109 lroent_cnt = TCP_LRO_ENTRIES;
3111 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3113 #endif /* INET || INET6 */
3115 ctx = device_get_sysctl_ctx(dev);
3116 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3118 /* Create dev.hn.UNIT.rx sysctl tree */
3119 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3120 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3122 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3123 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3125 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3126 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3127 &rxr->hn_br_dma, BUS_DMA_WAITOK);
3128 if (rxr->hn_br == NULL) {
3129 device_printf(dev, "allocate bufring failed\n");
3133 if (hn_trust_hosttcp)
3134 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3135 if (hn_trust_hostudp)
3136 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3137 if (hn_trust_hostip)
3138 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3139 rxr->hn_ifp = sc->hn_ifp;
3140 if (i < sc->hn_tx_ring_cnt)
3141 rxr->hn_txr = &sc->hn_tx_ring[i];
3142 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3143 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3145 rxr->hn_rxbuf = sc->hn_rxbuf;
3150 #if defined(INET) || defined(INET6)
3151 #if __FreeBSD_version >= 1100095
3152 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3153 hn_lro_mbufq_depth);
3155 tcp_lro_init(&rxr->hn_lro);
3156 rxr->hn_lro.ifp = sc->hn_ifp;
3158 #if __FreeBSD_version >= 1100099
3159 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3160 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3162 #endif /* INET || INET6 */
3164 if (sc->hn_rx_sysctl_tree != NULL) {
3168 * Create per RX ring sysctl tree:
3169 * dev.hn.UNIT.rx.RINGID
3171 snprintf(name, sizeof(name), "%d", i);
3172 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3173 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3174 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3176 if (rxr->hn_rx_sysctl_tree != NULL) {
3177 SYSCTL_ADD_ULONG(ctx,
3178 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3179 OID_AUTO, "packets", CTLFLAG_RW,
3180 &rxr->hn_pkts, "# of packets received");
3181 SYSCTL_ADD_ULONG(ctx,
3182 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3183 OID_AUTO, "rss_pkts", CTLFLAG_RW,
3185 "# of packets w/ RSS info received");
3187 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3188 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3189 &rxr->hn_pktbuf_len, 0,
3190 "Temporary channel packet buffer length");
3195 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3196 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3197 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3198 #if __FreeBSD_version < 1100095
3199 hn_rx_stat_int_sysctl,
3201 hn_rx_stat_u64_sysctl,
3203 "LU", "LRO queued");
3204 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3205 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3206 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3207 #if __FreeBSD_version < 1100095
3208 hn_rx_stat_int_sysctl,
3210 hn_rx_stat_u64_sysctl,
3212 "LU", "LRO flushed");
3213 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3214 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3215 __offsetof(struct hn_rx_ring, hn_lro_tried),
3216 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3217 #if __FreeBSD_version >= 1100099
3218 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3219 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3220 hn_lro_lenlim_sysctl, "IU",
3221 "Max # of data bytes to be aggregated by LRO");
3222 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3223 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3224 hn_lro_ackcnt_sysctl, "I",
3225 "Max # of ACKs to be aggregated by LRO");
3227 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3228 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3229 hn_trust_hcsum_sysctl, "I",
3230 "Trust tcp segement verification on host side, "
3231 "when csum info is missing");
3232 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3233 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3234 hn_trust_hcsum_sysctl, "I",
3235 "Trust udp datagram verification on host side, "
3236 "when csum info is missing");
3237 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3238 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3239 hn_trust_hcsum_sysctl, "I",
3240 "Trust ip packet verification on host side, "
3241 "when csum info is missing");
3242 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3243 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3244 __offsetof(struct hn_rx_ring, hn_csum_ip),
3245 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3246 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3247 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3248 __offsetof(struct hn_rx_ring, hn_csum_tcp),
3249 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3250 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3251 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3252 __offsetof(struct hn_rx_ring, hn_csum_udp),
3253 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3254 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3255 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3256 __offsetof(struct hn_rx_ring, hn_csum_trusted),
3257 hn_rx_stat_ulong_sysctl, "LU",
3258 "# of packets that we trust host's csum verification");
3259 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3260 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3261 __offsetof(struct hn_rx_ring, hn_small_pkts),
3262 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3263 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3264 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3265 __offsetof(struct hn_rx_ring, hn_ack_failed),
3266 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3267 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3268 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3269 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3270 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3276 hn_destroy_rx_data(struct hn_softc *sc)
3280 if (sc->hn_rxbuf != NULL) {
3281 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3282 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3284 device_printf(sc->hn_dev, "RXBUF is referenced\n");
3285 sc->hn_rxbuf = NULL;
3288 if (sc->hn_rx_ring_cnt == 0)
3291 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3292 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3294 if (rxr->hn_br == NULL)
3296 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3297 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3299 device_printf(sc->hn_dev,
3300 "%dth channel bufring is referenced", i);
3304 #if defined(INET) || defined(INET6)
3305 tcp_lro_free(&rxr->hn_lro);
3307 free(rxr->hn_pktbuf, M_DEVBUF);
3309 free(sc->hn_rx_ring, M_DEVBUF);
3310 sc->hn_rx_ring = NULL;
3312 sc->hn_rx_ring_cnt = 0;
3313 sc->hn_rx_ring_inuse = 0;
3317 hn_tx_ring_create(struct hn_softc *sc, int id)
3319 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3320 device_t dev = sc->hn_dev;
3321 bus_dma_tag_t parent_dtag;
3325 txr->hn_tx_idx = id;
3327 #ifndef HN_USE_TXDESC_BUFRING
3328 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3330 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3332 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3333 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3334 M_DEVBUF, M_WAITOK | M_ZERO);
3335 #ifndef HN_USE_TXDESC_BUFRING
3336 SLIST_INIT(&txr->hn_txlist);
3338 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3339 M_WAITOK, &txr->hn_tx_lock);
3342 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3343 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3344 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3346 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3349 #ifdef HN_IFSTART_SUPPORT
3350 if (hn_use_if_start) {
3351 txr->hn_txeof = hn_start_txeof;
3352 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3353 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3359 txr->hn_txeof = hn_xmit_txeof;
3360 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3361 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3363 br_depth = hn_get_txswq_depth(txr);
3364 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3365 M_WAITOK, &txr->hn_tx_lock);
3368 txr->hn_direct_tx_size = hn_direct_tx_size;
3371 * Always schedule transmission instead of trying to do direct
3372 * transmission. This gives the best performance so far.
3374 txr->hn_sched_tx = 1;
3376 parent_dtag = bus_get_dma_tag(dev);
3378 /* DMA tag for RNDIS packet messages. */
3379 error = bus_dma_tag_create(parent_dtag, /* parent */
3380 HN_RNDIS_PKT_ALIGN, /* alignment */
3381 HN_RNDIS_PKT_BOUNDARY, /* boundary */
3382 BUS_SPACE_MAXADDR, /* lowaddr */
3383 BUS_SPACE_MAXADDR, /* highaddr */
3384 NULL, NULL, /* filter, filterarg */
3385 HN_RNDIS_PKT_LEN, /* maxsize */
3387 HN_RNDIS_PKT_LEN, /* maxsegsize */
3389 NULL, /* lockfunc */
3390 NULL, /* lockfuncarg */
3391 &txr->hn_tx_rndis_dtag);
3393 device_printf(dev, "failed to create rndis dmatag\n");
3397 /* DMA tag for data. */
3398 error = bus_dma_tag_create(parent_dtag, /* parent */
3400 HN_TX_DATA_BOUNDARY, /* boundary */
3401 BUS_SPACE_MAXADDR, /* lowaddr */
3402 BUS_SPACE_MAXADDR, /* highaddr */
3403 NULL, NULL, /* filter, filterarg */
3404 HN_TX_DATA_MAXSIZE, /* maxsize */
3405 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
3406 HN_TX_DATA_SEGSIZE, /* maxsegsize */
3408 NULL, /* lockfunc */
3409 NULL, /* lockfuncarg */
3410 &txr->hn_tx_data_dtag);
3412 device_printf(dev, "failed to create data dmatag\n");
3416 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3417 struct hn_txdesc *txd = &txr->hn_txdesc[i];
3420 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3421 STAILQ_INIT(&txd->agg_list);
3424 * Allocate and load RNDIS packet message.
3426 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3427 (void **)&txd->rndis_pkt,
3428 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3429 &txd->rndis_pkt_dmap);
3432 "failed to allocate rndis_packet_msg, %d\n", i);
3436 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3437 txd->rndis_pkt_dmap,
3438 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3439 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3443 "failed to load rndis_packet_msg, %d\n", i);
3444 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3445 txd->rndis_pkt, txd->rndis_pkt_dmap);
3449 /* DMA map for TX data. */
3450 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3454 "failed to allocate tx data dmamap\n");
3455 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3456 txd->rndis_pkt_dmap);
3457 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3458 txd->rndis_pkt, txd->rndis_pkt_dmap);
3462 /* All set; put it on the list. */
3463 txd->flags |= HN_TXD_FLAG_ONLIST;
3464 #ifndef HN_USE_TXDESC_BUFRING
3465 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3467 buf_ring_enqueue(txr->hn_txdesc_br, txd);
3470 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3472 if (sc->hn_tx_sysctl_tree != NULL) {
3473 struct sysctl_oid_list *child;
3474 struct sysctl_ctx_list *ctx;
3478 * Create per TX ring sysctl tree:
3479 * dev.hn.UNIT.tx.RINGID
3481 ctx = device_get_sysctl_ctx(dev);
3482 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3484 snprintf(name, sizeof(name), "%d", id);
3485 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3486 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3488 if (txr->hn_tx_sysctl_tree != NULL) {
3489 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3492 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3493 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3494 "# of available TX descs");
3496 #ifdef HN_IFSTART_SUPPORT
3497 if (!hn_use_if_start)
3500 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3501 CTLFLAG_RD, &txr->hn_oactive, 0,
3504 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3505 CTLFLAG_RW, &txr->hn_pkts,
3506 "# of packets transmitted");
3507 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3508 CTLFLAG_RW, &txr->hn_sends, "# of sends");
3516 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3518 struct hn_tx_ring *txr = txd->txr;
3520 KASSERT(txd->m == NULL, ("still has mbuf installed"));
3521 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3523 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3524 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3525 txd->rndis_pkt_dmap);
3526 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3530 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3533 KASSERT(txd->refs == 0 || txd->refs == 1,
3534 ("invalid txd refs %d", txd->refs));
3536 /* Aggregated txds will be freed by their aggregating txd. */
3537 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3540 freed = hn_txdesc_put(txr, txd);
3541 KASSERT(freed, ("can't free txdesc"));
3546 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3550 if (txr->hn_txdesc == NULL)
3555 * Because the freeing of aggregated txds will be deferred
3556 * to the aggregating txd, two passes are used here:
3557 * - The first pass GCes any pending txds. This GC is necessary,
3558 * since if the channels are revoked, the hypervisor will not
3559 * deliver send-done for all pending txds.
3560 * - The second pass frees the busdma resources, i.e. after all
3561 * txds were freed.
3563 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3564 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3565 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3566 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3568 if (txr->hn_tx_data_dtag != NULL)
3569 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3570 if (txr->hn_tx_rndis_dtag != NULL)
3571 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3573 #ifdef HN_USE_TXDESC_BUFRING
3574 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3577 free(txr->hn_txdesc, M_DEVBUF);
3578 txr->hn_txdesc = NULL;
3580 if (txr->hn_mbuf_br != NULL)
3581 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3583 #ifndef HN_USE_TXDESC_BUFRING
3584 mtx_destroy(&txr->hn_txlist_spin);
3586 mtx_destroy(&txr->hn_tx_lock);
3590 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3592 struct sysctl_oid_list *child;
3593 struct sysctl_ctx_list *ctx;
3597 * Create TXBUF for chimney sending.
3599 * NOTE: It is shared by all channels.
3601 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3602 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3603 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3604 if (sc->hn_chim == NULL) {
3605 device_printf(sc->hn_dev, "allocate txbuf failed\n");
3609 sc->hn_tx_ring_cnt = ring_cnt;
3610 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3612 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3613 M_DEVBUF, M_WAITOK | M_ZERO);
3615 ctx = device_get_sysctl_ctx(sc->hn_dev);
3616 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3618 /* Create dev.hn.UNIT.tx sysctl tree */
3619 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3620 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3622 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3625 error = hn_tx_ring_create(sc, i);
3630 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3631 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3632 __offsetof(struct hn_tx_ring, hn_no_txdescs),
3633 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3634 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3635 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3636 __offsetof(struct hn_tx_ring, hn_send_failed),
3637 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
3638 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3639 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3640 __offsetof(struct hn_tx_ring, hn_txdma_failed),
3641 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
3642 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3643 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3644 __offsetof(struct hn_tx_ring, hn_flush_failed),
3645 hn_tx_stat_ulong_sysctl, "LU",
3646 "# of packet transmission aggregation flush failure");
3647 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3648 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3649 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3650 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3651 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3652 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3653 __offsetof(struct hn_tx_ring, hn_tx_chimney),
3654 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3655 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3656 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3657 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3658 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3659 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3660 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3661 "# of total TX descs");
3662 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3663 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3664 "Chimney send packet size upper boundary");
3665 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3666 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3667 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3668 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3669 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3670 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3671 hn_tx_conf_int_sysctl, "I",
3672 "Size of the packet for direct transmission");
3673 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3674 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3675 __offsetof(struct hn_tx_ring, hn_sched_tx),
3676 hn_tx_conf_int_sysctl, "I",
3677 "Always schedule transmission "
3678 "instead of doing direct transmission");
3679 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3680 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3681 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3682 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3683 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3684 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3685 "Applied packet transmission aggregation size");
3686 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3687 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3688 hn_txagg_pktmax_sysctl, "I",
3689 "Applied packet transmission aggregation packets");
3690 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3691 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3692 hn_txagg_align_sysctl, "I",
3693 "Applied packet transmission aggregation alignment");
3699 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3703 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3704 sc->hn_tx_ring[i].hn_chim_size = chim_size;
3708 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3710 struct ifnet *ifp = sc->hn_ifp;
3713 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3716 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3717 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3718 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3720 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3721 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3722 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3724 if (tso_maxlen < tso_minlen)
3725 tso_maxlen = tso_minlen;
3726 else if (tso_maxlen > IP_MAXPACKET)
3727 tso_maxlen = IP_MAXPACKET;
3728 if (tso_maxlen > sc->hn_ndis_tso_szmax)
3729 tso_maxlen = sc->hn_ndis_tso_szmax;
3730 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3732 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
3736 hn_fixup_tx_data(struct hn_softc *sc)
3738 uint64_t csum_assist;
3741 hn_set_chim_size(sc, sc->hn_chim_szmax);
3742 if (hn_tx_chimney_size > 0 &&
3743 hn_tx_chimney_size < sc->hn_chim_szmax)
3744 hn_set_chim_size(sc, hn_tx_chimney_size);
3747 if (sc->hn_caps & HN_CAP_IPCS)
3748 csum_assist |= CSUM_IP;
3749 if (sc->hn_caps & HN_CAP_TCP4CS)
3750 csum_assist |= CSUM_IP_TCP;
3751 if (sc->hn_caps & HN_CAP_UDP4CS)
3752 csum_assist |= CSUM_IP_UDP;
3753 if (sc->hn_caps & HN_CAP_TCP6CS)
3754 csum_assist |= CSUM_IP6_TCP;
3755 if (sc->hn_caps & HN_CAP_UDP6CS)
3756 csum_assist |= CSUM_IP6_UDP;
3757 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3758 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
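/*
 * For example, a host advertising HN_CAP_IPCS | HN_CAP_TCP4CS |
 * HN_CAP_TCP6CS would yield csum_assist = CSUM_IP | CSUM_IP_TCP |
 * CSUM_IP6_TCP on every TX ring (a sketch; the actual capability
 * set is negotiated during synthetic parts attach).
 */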
3760 if (sc->hn_caps & HN_CAP_HASHVAL) {
3762 * Support HASHVAL pktinfo on TX path.
3765 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3766 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3767 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3772 hn_destroy_tx_data(struct hn_softc *sc)
3776 if (sc->hn_chim != NULL) {
3777 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
3778 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3780 device_printf(sc->hn_dev,
3781 "chimney sending buffer is referenced");
3786 if (sc->hn_tx_ring_cnt == 0)
3789 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3790 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3792 free(sc->hn_tx_ring, M_DEVBUF);
3793 sc->hn_tx_ring = NULL;
3795 sc->hn_tx_ring_cnt = 0;
3796 sc->hn_tx_ring_inuse = 0;
3799 #ifdef HN_IFSTART_SUPPORT
3802 hn_start_taskfunc(void *xtxr, int pending __unused)
3804 struct hn_tx_ring *txr = xtxr;
3806 mtx_lock(&txr->hn_tx_lock);
3807 hn_start_locked(txr, 0);
3808 mtx_unlock(&txr->hn_tx_lock);
3812 hn_start_locked(struct hn_tx_ring *txr, int len)
3814 struct hn_softc *sc = txr->hn_sc;
3815 struct ifnet *ifp = sc->hn_ifp;
3818 KASSERT(hn_use_if_start,
3819 ("hn_start_locked is called, when if_start is disabled"));
3820 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3821 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3822 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3824 if (__predict_false(txr->hn_suspended))
3827 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
3831 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
3832 struct hn_txdesc *txd;
3833 struct mbuf *m_head;
3836 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
3840 if (len > 0 && m_head->m_pkthdr.len > len) {
3842 * This send could be time-consuming; let callers
3843 * dispatch this packet (and any follow-up
3844 * packets) to the tx taskqueue.
3846 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3851 #if defined(INET6) || defined(INET)
3852 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3853 m_head = hn_tso_fixup(m_head);
3854 if (__predict_false(m_head == NULL)) {
3855 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3861 txd = hn_txdesc_get(txr);
3863 txr->hn_no_txdescs++;
3864 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3865 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3869 error = hn_encap(ifp, txr, txd, &m_head);
3871 /* Both txd and m_head are freed */
3872 KASSERT(txr->hn_agg_txd == NULL,
3873 ("encap failed w/ pending aggregating txdesc"));
3877 if (txr->hn_agg_pktleft == 0) {
3878 if (txr->hn_agg_txd != NULL) {
3879 KASSERT(m_head == NULL,
3880 ("pending mbuf for aggregating txdesc"));
3881 error = hn_flush_txagg(ifp, txr);
3882 if (__predict_false(error)) {
3883 atomic_set_int(&ifp->if_drv_flags,
3888 KASSERT(m_head != NULL, ("mbuf was freed"));
3889 error = hn_txpkt(ifp, txr, txd);
3890 if (__predict_false(error)) {
3891 /* txd is freed, but m_head is not */
3892 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3893 atomic_set_int(&ifp->if_drv_flags,
3901 KASSERT(txr->hn_agg_txd != NULL,
3902 ("no aggregating txdesc"));
3903 KASSERT(m_head == NULL,
3904 ("pending mbuf for aggregating txdesc"));
3909 /* Flush pending aggregated transmission. */
3910 if (txr->hn_agg_txd != NULL)
3911 hn_flush_txagg(ifp, txr);
3916 hn_start(struct ifnet *ifp)
3918 struct hn_softc *sc = ifp->if_softc;
3919 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
3921 if (txr->hn_sched_tx)
3924 if (mtx_trylock(&txr->hn_tx_lock)) {
3927 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3928 mtx_unlock(&txr->hn_tx_lock);
3933 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
3937 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
3939 struct hn_tx_ring *txr = xtxr;
3941 mtx_lock(&txr->hn_tx_lock);
3942 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
3943 hn_start_locked(txr, 0);
3944 mtx_unlock(&txr->hn_tx_lock);
3948 hn_start_txeof(struct hn_tx_ring *txr)
3950 struct hn_softc *sc = txr->hn_sc;
3951 struct ifnet *ifp = sc->hn_ifp;
3953 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3955 if (txr->hn_sched_tx)
3958 if (mtx_trylock(&txr->hn_tx_lock)) {
3961 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3962 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3963 mtx_unlock(&txr->hn_tx_lock);
3965 taskqueue_enqueue(txr->hn_tx_taskq,
3971 * Release OACTIVE earlier, in the hope that
3972 * others can catch up. The task will clear the
3973 * flag again with the hn_tx_lock held to avoid
3974 * possible races.
3976 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3977 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3981 #endif /* HN_IFSTART_SUPPORT */
3984 hn_xmit(struct hn_tx_ring *txr, int len)
3986 struct hn_softc *sc = txr->hn_sc;
3987 struct ifnet *ifp = sc->hn_ifp;
3988 struct mbuf *m_head;
3991 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3992 #ifdef HN_IFSTART_SUPPORT
3993 KASSERT(hn_use_if_start == 0,
3994 ("hn_xmit is called, when if_start is enabled"));
3996 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3998 if (__predict_false(txr->hn_suspended))
4001 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4004 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4005 struct hn_txdesc *txd;
4008 if (len > 0 && m_head->m_pkthdr.len > len) {
4010 * This send could be time-consuming; let callers
4011 * dispatch this packet (and any follow-up
4012 * packets) to the tx taskqueue.
4014 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4019 txd = hn_txdesc_get(txr);
4021 txr->hn_no_txdescs++;
4022 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4023 txr->hn_oactive = 1;
4027 error = hn_encap(ifp, txr, txd, &m_head);
4029 /* Both txd and m_head are freed; discard */
4030 KASSERT(txr->hn_agg_txd == NULL,
4031 ("encap failed w/ pending aggregating txdesc"));
4032 drbr_advance(ifp, txr->hn_mbuf_br);
4036 if (txr->hn_agg_pktleft == 0) {
4037 if (txr->hn_agg_txd != NULL) {
4038 KASSERT(m_head == NULL,
4039 ("pending mbuf for aggregating txdesc"));
4040 error = hn_flush_txagg(ifp, txr);
4041 if (__predict_false(error)) {
4042 txr->hn_oactive = 1;
4046 KASSERT(m_head != NULL, ("mbuf was freed"));
4047 error = hn_txpkt(ifp, txr, txd);
4048 if (__predict_false(error)) {
4049 /* txd is freed, but m_head is not */
4050 drbr_putback(ifp, txr->hn_mbuf_br,
4052 txr->hn_oactive = 1;
4059 KASSERT(txr->hn_agg_txd != NULL,
4060 ("no aggregating txdesc"));
4061 KASSERT(m_head == NULL,
4062 ("pending mbuf for aggregating txdesc"));
4067 drbr_advance(ifp, txr->hn_mbuf_br);
4070 /* Flush pending aggregated transmission. */
4071 if (txr->hn_agg_txd != NULL)
4072 hn_flush_txagg(ifp, txr);
4077 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4079 struct hn_softc *sc = ifp->if_softc;
4080 struct hn_tx_ring *txr;
4083 #if defined(INET6) || defined(INET)
4085 * Perform TSO packet header fixup now, since the TSO
4086 * packet header should be cache-hot.
4088 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4089 m = hn_tso_fixup(m);
4090 if (__predict_false(m == NULL)) {
4091 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4098 * Select the TX ring based on flowid
4100 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
4101 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4102 txr = &sc->hn_tx_ring[idx];
4104 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4106 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4110 if (txr->hn_oactive)
4113 if (txr->hn_sched_tx)
4116 if (mtx_trylock(&txr->hn_tx_lock)) {
4119 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4120 mtx_unlock(&txr->hn_tx_lock);
4125 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4130 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4134 mtx_lock(&txr->hn_tx_lock);
4135 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4137 mtx_unlock(&txr->hn_tx_lock);
4141 hn_xmit_qflush(struct ifnet *ifp)
4143 struct hn_softc *sc = ifp->if_softc;
4146 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4147 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4152 hn_xmit_txeof(struct hn_tx_ring *txr)
4155 if (txr->hn_sched_tx)
4158 if (mtx_trylock(&txr->hn_tx_lock)) {
4161 txr->hn_oactive = 0;
4162 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4163 mtx_unlock(&txr->hn_tx_lock);
4165 taskqueue_enqueue(txr->hn_tx_taskq,
4171 * Release the oactive earlier, in the hope that
4172 * others can catch up. The task will clear the
4173 * oactive again with the hn_tx_lock held to avoid
4174 * possible races.
4176 txr->hn_oactive = 0;
4177 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4182 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4184 struct hn_tx_ring *txr = xtxr;
4186 mtx_lock(&txr->hn_tx_lock);
4188 mtx_unlock(&txr->hn_tx_lock);
4192 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4194 struct hn_tx_ring *txr = xtxr;
4196 mtx_lock(&txr->hn_tx_lock);
4197 txr->hn_oactive = 0;
4199 mtx_unlock(&txr->hn_tx_lock);
4203 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4205 struct vmbus_chan_br cbr;
4206 struct hn_rx_ring *rxr;
4207 struct hn_tx_ring *txr = NULL;
4210 idx = vmbus_chan_subidx(chan);
4213 * Link this channel to RX/TX ring.
4215 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4216 ("invalid channel index %d, should > 0 && < %d",
4217 idx, sc->hn_rx_ring_inuse));
4218 rxr = &sc->hn_rx_ring[idx];
4219 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4220 ("RX ring %d already attached", idx));
4221 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4224 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4225 idx, vmbus_chan_id(chan));
4228 if (idx < sc->hn_tx_ring_inuse) {
4229 txr = &sc->hn_tx_ring[idx];
4230 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4231 ("TX ring %d already attached", idx));
4232 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4234 txr->hn_chan = chan;
4236 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4237 idx, vmbus_chan_id(chan));
4241 /* Bind this channel to a proper CPU. */
4242 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4247 cbr.cbr = rxr->hn_br;
4248 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4249 cbr.cbr_txsz = HN_TXBR_SIZE;
4250 cbr.cbr_rxsz = HN_RXBR_SIZE;
4251 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4253 if (error == EISCONN) {
4254 if_printf(sc->hn_ifp, "bufring is connected after "
4255 "chan%u open failure\n", vmbus_chan_id(chan));
4256 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4258 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4259 vmbus_chan_id(chan), error);
4266 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4268 struct hn_rx_ring *rxr;
4271 idx = vmbus_chan_subidx(chan);
4274 * Link this channel to RX/TX ring.
4276 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4277 ("invalid channel index %d, should > 0 && < %d",
4278 idx, sc->hn_rx_ring_inuse));
4279 rxr = &sc->hn_rx_ring[idx];
4280 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4281 ("RX ring %d is not attached", idx));
4282 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4284 if (idx < sc->hn_tx_ring_inuse) {
4285 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4287 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4288 ("TX ring %d is not attached attached", idx));
4289 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4293 * Close this channel.
4296 * Channel closing does _not_ destroy the target channel.
4298 error = vmbus_chan_close_direct(chan);
4299 if (error == EISCONN) {
4300 if_printf(sc->hn_ifp, "chan%u bufring is connected "
4301 "after being closed\n", vmbus_chan_id(chan));
4302 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4304 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4305 vmbus_chan_id(chan), error);
4310 hn_attach_subchans(struct hn_softc *sc)
4312 struct vmbus_channel **subchans;
4313 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4316 KASSERT(subchan_cnt > 0, ("no sub-channels"));
4318 /* Attach the sub-channels. */
4319 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4320 for (i = 0; i < subchan_cnt; ++i) {
4323 error1 = hn_chan_attach(sc, subchans[i]);
4326 /* Move on; all channels will be detached later. */
4329 vmbus_subchan_rel(subchans, subchan_cnt);
4332 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4335 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4343 hn_detach_allchans(struct hn_softc *sc)
4345 struct vmbus_channel **subchans;
4346 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4349 if (subchan_cnt == 0)
4352 /* Detach the sub-channels. */
4353 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4354 for (i = 0; i < subchan_cnt; ++i)
4355 hn_chan_detach(sc, subchans[i]);
4356 vmbus_subchan_rel(subchans, subchan_cnt);
4360 * Detach the primary channel, _after_ all sub-channels
4363 hn_chan_detach(sc, sc->hn_prichan);
4365 /* Wait for sub-channels to be destroyed, if any. */
4366 vmbus_subchan_drain(sc->hn_prichan);
4369 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4370 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4371 HN_RX_FLAG_ATTACHED) == 0,
4372 ("%dth RX ring is still attached", i));
4374 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4375 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4376 HN_TX_FLAG_ATTACHED) == 0,
4377 ("%dth TX ring is still attached", i));
4383 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4385 struct vmbus_channel **subchans;
4386 int nchan, rxr_cnt, error;
4388 nchan = *nsubch + 1;
4391 * Multiple RX/TX rings are not requested.
4398 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4399 * table entries.
4401 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4403 /* No RSS; this is benign. */
4408 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4412 if (nchan > rxr_cnt)
4415 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4421 * Allocate sub-channels from NVS.
4423 *nsubch = nchan - 1;
4424 error = hn_nvs_alloc_subchans(sc, nsubch);
4425 if (error || *nsubch == 0) {
4426 /* Failed to allocate sub-channels. */
4432 * Wait for all sub-channels to become ready before moving on.
4434 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4435 vmbus_subchan_rel(subchans, *nsubch);
4440 hn_synth_attachable(const struct hn_softc *sc)
4444 if (sc->hn_flags & HN_FLAG_ERRORS)
4447 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4448 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4450 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
4457 hn_synth_attach(struct hn_softc *sc, int mtu)
4459 #define ATTACHED_NVS 0x0002
4460 #define ATTACHED_RNDIS 0x0004
4462 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4463 int error, nsubch, nchan, i;
4464 uint32_t old_caps, attached = 0;
4466 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4467 ("synthetic parts were attached"));
4469 if (!hn_synth_attachable(sc))
4472 /* Save capabilities for later verification. */
4473 old_caps = sc->hn_caps;
4476 /* Clear RSS stuffs. */
4477 sc->hn_rss_ind_size = 0;
4478 sc->hn_rss_hash = 0;
4481 * Attach the primary channel _before_ attaching NVS and RNDIS.
4483 error = hn_chan_attach(sc, sc->hn_prichan);
4490 error = hn_nvs_attach(sc, mtu);
4493 attached |= ATTACHED_NVS;
4496 * Attach RNDIS _after_ NVS is attached.
4498 error = hn_rndis_attach(sc, mtu);
4501 attached |= ATTACHED_RNDIS;
4504 * Make sure capabilities are not changed.
4506 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4507 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4508 old_caps, sc->hn_caps);
4514 * Allocate sub-channels for multi-TX/RX rings.
4517 * The # of RX rings that can be used is equivalent to the # of
4518 * channels to be requested.
4520 nsubch = sc->hn_rx_ring_cnt - 1;
4521 error = hn_synth_alloc_subchans(sc, &nsubch);
4524 /* NOTE: _Full_ synthetic parts detach is required now. */
4525 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4528 * Set the # of TX/RX rings that could be used according to
4529 * the # of channels that NVS offered.
4532 hn_set_ring_inuse(sc, nchan);
4534 /* Only the primary channel can be used; done */
4539 * Attach the sub-channels.
4541 * NOTE: hn_set_ring_inuse() _must_ have been called.
4543 error = hn_attach_subchans(sc);
4548 * Configure RSS key and indirect table _after_ all sub-channels
4549 * are attached.
4551 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4553 * RSS key is not set yet; set it to the default RSS key.
4556 if_printf(sc->hn_ifp, "setup default RSS key\n");
4557 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4558 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4561 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4563 * RSS indirect table is not set yet; set it up in round-
4564 * robin fashion.
4567 if_printf(sc->hn_ifp, "setup default RSS indirect "
4568 "table\n");
4570 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
4571 rss->rss_ind[i] = i % nchan;
4572 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4575 * # of usable channels may be changed, so we have to
4576 * make sure that all entries in the RSS indirect table
4577 * are valid.
4579 * NOTE: hn_set_ring_inuse() _must_ have been called.
4581 hn_rss_ind_fixup(sc);
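/*
 * For instance, with nchan = 4 the default indirect table above
 * reads 0, 1, 2, 3, 0, 1, ... across all NDIS_HASH_INDCNT entries,
 * spreading flows evenly over the four usable channels;
 * hn_rss_ind_fixup() rewrites any stale entry that points beyond
 * the channels currently in use.
 */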
4584 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4589 * Fixup transmission aggregation setup.
4595 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
4596 hn_synth_detach(sc);
4598 if (attached & ATTACHED_RNDIS)
4599 hn_rndis_detach(sc);
4600 if (attached & ATTACHED_NVS)
4602 hn_chan_detach(sc, sc->hn_prichan);
4603 /* Restore old capabilities. */
4604 sc->hn_caps = old_caps;
4608 #undef ATTACHED_RNDIS
4614 * The interface must have been suspended through hn_suspend(), before
4615 * this function gets called.
4618 hn_synth_detach(struct hn_softc *sc)
4621 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4622 ("synthetic parts were not attached"));
4624 /* Detach the RNDIS first. */
4625 hn_rndis_detach(sc);
4630 /* Detach all of the channels. */
4631 hn_detach_allchans(sc);
4633 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4637 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4639 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4640 ("invalid ring count %d", ring_cnt));
4642 if (sc->hn_tx_ring_cnt > ring_cnt)
4643 sc->hn_tx_ring_inuse = ring_cnt;
4645 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4646 sc->hn_rx_ring_inuse = ring_cnt;
4649 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4650 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
4655 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
4660 * The TX bufring will not be drained by the hypervisor,
4661 * if the primary channel is revoked.
4663 while (!vmbus_chan_rx_empty(chan) ||
4664 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
4665 !vmbus_chan_tx_empty(chan)))
4667 vmbus_chan_intr_drain(chan);
4671 hn_suspend_data(struct hn_softc *sc)
4673 struct vmbus_channel **subch = NULL;
4674 struct hn_tx_ring *txr;
4682 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4683 txr = &sc->hn_tx_ring[i];
4685 mtx_lock(&txr->hn_tx_lock);
4686 txr->hn_suspended = 1;
4687 mtx_unlock(&txr->hn_tx_lock);
4688 /* No one is able to send more packets now. */
4691 * Wait for all pending sends to finish.
4694 * We will _not_ receive all pending send-done notifications
4695 * if the primary channel is revoked.
4697 while (hn_tx_ring_pending(txr) &&
4698 !vmbus_chan_is_revoked(sc->hn_prichan))
4699 pause("hnwtx", 1 /* 1 tick */);
4703 * Disable RX by clearing RX filter.
4705 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
4706 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);
4709 * Give RNDIS enough time to flush all pending data packets.
4711 pause("waitrx", (200 * hz) / 1000);
4714 * Drain RX/TX bufrings and interrupts.
4716 nsubch = sc->hn_rx_ring_inuse - 1;
4718 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4720 if (subch != NULL) {
4721 for (i = 0; i < nsubch; ++i)
4722 hn_chan_drain(sc, subch[i]);
4724 hn_chan_drain(sc, sc->hn_prichan);
4727 vmbus_subchan_rel(subch, nsubch);
4730 * Drain any pending TX tasks.
4733 * The above hn_chan_drain() can dispatch TX tasks, so the TX
4734 * tasks will have to be drained _after_ the above hn_chan_drain()
4737 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4738 txr = &sc->hn_tx_ring[i];
4740 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
4741 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);

static void
hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
{

	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
}

static void
hn_suspend_mgmt(struct hn_softc *sc)
{
	struct task task;

	/*
	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
	 * through hn_mgmt_taskq.
	 */
	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
	vmbus_chan_run_task(sc->hn_prichan, &task);

	/*
	 * Make sure that all pending management tasks are completed.
	 */
	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
	taskqueue_drain_all(sc->hn_mgmt_taskq0);
}
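
/*
 * Sketch of the two-step quiesce idiom above (assumption: a task queued
 * via vmbus_chan_run_task() executes on the channel's event-processing
 * context, and the call waits for it to complete): step 1 NULLs out
 * hn_mgmt_taskq from that context, so the channel callback can no longer
 * enqueue new management tasks; step 2 drains whatever is already queued.
 */
#if 0
	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
	vmbus_chan_run_task(sc->hn_prichan, &task);	/* step 1: fence */
	taskqueue_drain_all(sc->hn_mgmt_taskq0);	/* step 2: drain */
#endif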

static void
hn_suspend(struct hn_softc *sc)
{

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
		hn_suspend_data(sc);
	hn_suspend_mgmt(sc);
}

static void
hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
{
	int i;

	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
	    ("invalid TX ring count %d", tx_ring_cnt));

	for (i = 0; i < tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 0;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static void
hn_resume_data(struct hn_softc *sc)
{
	int i;

	/*
	 * Re-enable RX.
	 */
	hn_set_rxfilter(sc);

	/*
	 * Make sure to clear suspend status on "all" TX rings,
	 * since hn_tx_ring_inuse can be changed after
	 * hn_suspend_data().
	 */
	hn_resume_tx(sc, sc->hn_tx_ring_cnt);

#ifdef HN_IFSTART_SUPPORT
	if (!hn_use_if_start)
#endif
	{
		/*
		 * Flush unused drbrs, since hn_tx_ring_inuse may be
		 * changed.
		 */
		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
	}

	/*
	 * Kick start TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		/*
		 * Use txeof task, so that any pending oactive can be
		 * cleared properly.
		 */
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}

static void
hn_resume_mgmt(struct hn_softc *sc)
{

	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;

	/*
	 * Kick off network change detection, if it was pending.
	 * If no network change was pending, start link status
	 * checks, which is more lightweight than network change
	 * detection.
	 */
	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		hn_change_network(sc);
	else
		hn_update_link_status(sc);
}

static void
hn_resume(struct hn_softc *sc)
{

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
		hn_resume_data(sc);
	hn_resume_mgmt(sc);
}

static void
hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
{
	const struct rndis_status_msg *msg;
	int ofs;

	if (dlen < sizeof(*msg)) {
		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
		return;
	}
	msg = data;

	switch (msg->rm_status) {
	case RNDIS_STATUS_MEDIA_CONNECT:
	case RNDIS_STATUS_MEDIA_DISCONNECT:
		hn_update_link_status(sc);
		break;

	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
		/* Not really useful; ignore. */
		break;

	case RNDIS_STATUS_NETWORK_CHANGE:
		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
		if (dlen < ofs + msg->rm_stbuflen ||
		    msg->rm_stbuflen < sizeof(uint32_t)) {
			if_printf(sc->hn_ifp, "network changed\n");
		} else {
			uint32_t change;

			memcpy(&change, ((const uint8_t *)msg) + ofs,
			    sizeof(change));
			if_printf(sc->hn_ifp, "network changed, change %u\n",
			    change);
		}
		hn_change_network(sc);
		break;

	default:
		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
		    msg->rm_status);
		break;
	}
}

static int
hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
{
	const struct rndis_pktinfo *pi = info_data;
	uint32_t mask = 0;

	while (info_dlen != 0) {
		const void *data;
		uint32_t dlen;

		if (__predict_false(info_dlen < sizeof(*pi)))
			return (EINVAL);
		if (__predict_false(info_dlen < pi->rm_size))
			return (EINVAL);
		info_dlen -= pi->rm_size;

		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
			return (EINVAL);
		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
			return (EINVAL);
		dlen = pi->rm_size - pi->rm_pktinfooffset;
		data = pi->rm_data;

		switch (pi->rm_type) {
		case NDIS_PKTINFO_TYPE_VLAN:
			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
				return (EINVAL);
			info->vlan_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_VLAN;
			break;

		case NDIS_PKTINFO_TYPE_CSUM:
			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
				return (EINVAL);
			info->csum_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_CSUM;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
				return (EINVAL);
			info->hash_value = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHVAL;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHINF:
			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
				return (EINVAL);
			info->hash_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHINF;
			break;

		default:
			goto next;
		}

		if (mask == HN_RXINFO_ALL) {
			/* All found; done */
			break;
		}
next:
		pi = (const struct rndis_pktinfo *)
		    ((const uint8_t *)pi + pi->rm_size);
	}

	/*
	 * Final fixup.
	 * - If there is no hash value, invalidate the hash info.
	 */
	if ((mask & HN_RXINFO_HASHVAL) == 0)
		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
	return (0);
}
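
/*
 * Layout walked by hn_rndis_rxinfo() above; each record is rm_size bytes
 * and the next record starts immediately after it (sketch derived from
 * the checks above, not from the RNDIS specification text):
 *
 *	+----------------------+ <- pi
 *	| rm_size              |
 *	| rm_type              |
 *	| rm_pktinfooffset     |
 *	+----------------------+ <- (const uint8_t *)pi + rm_pktinfooffset
 *	| data, dlen =         |
 *	|   rm_size -          |
 *	|   rm_pktinfooffset   |
 *	+----------------------+ <- (const uint8_t *)pi + rm_size
 */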

static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off < check_off) {
		if (__predict_true(off + len <= check_off))
			return (false);
	} else if (off > check_off) {
		if (__predict_true(check_off + check_len <= off))
			return (false);
	}
	return (true);
}
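
/*
 * Illustrative examples (not part of the driver): the predicate above
 * treats [off, off+len) and [check_off, check_off+check_len) as
 * half-open byte ranges and returns true iff they intersect.  With a
 * data region at absolute offset 16, length 64:
 */
#if 0
	/* Disjoint: OOB ends exactly where data begins -> false. */
	hn_rndis_check_overlap(8, 8, 16, 64);
	/* Intersecting: OOB straddles the start of data -> true. */
	hn_rndis_check_overlap(8, 16, 16, 64);
	/* Equal starting offsets always overlap (len > 0) -> true. */
	hn_rndis_check_overlap(16, 4, 16, 64);
#endif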

static __inline void
hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_packet_msg *pkt;
	struct hn_rxinfo info;
	int data_off, pktinfo_off, data_len, pktinfo_len;

	/*
	 * Check length.
	 */
	if (__predict_false(dlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
		return;
	}
	pkt = data;

	if (__predict_false(dlen < pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
		return;
	}
	if (__predict_false(pkt->rm_len <
	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
		    "msglen %u, data %u, oob %u, pktinfo %u\n",
		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
		    pkt->rm_pktinfolen);
		return;
	}
	if (__predict_false(pkt->rm_datalen == 0)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
		return;
	}

	/*
	 * Check offsets.
	 */
#define IS_OFFSET_INVALID(ofs)			\
	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

	/* XXX Hyper-V does not meet data offset alignment requirement */
	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data offset %u\n", pkt->rm_dataoffset);
		return;
	}
	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "oob offset %u\n", pkt->rm_oobdataoffset);
		return;
	}
	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
		return;
	}

#undef IS_OFFSET_INVALID

	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
	data_len = pkt->rm_datalen;
	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
	pktinfo_len = pkt->rm_pktinfolen;

	/*
	 * Check OOB coverage.
	 */
	if (__predict_false(pkt->rm_oobdatalen != 0)) {
		int oob_off, oob_len;

		if_printf(rxr->hn_ifp, "got oobdata\n");
		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
		oob_len = pkt->rm_oobdatalen;

		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overflow, msglen %u, oob abs %d len %d\n",
			    pkt->rm_len, oob_off, oob_len);
			return;
		}

		/*
		 * Check against data.
		 */
		if (hn_rndis_check_overlap(oob_off, oob_len,
		    data_off, data_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps data, oob abs %d len %d, "
			    "data abs %d len %d\n",
			    oob_off, oob_len, data_off, data_len);
			return;
		}

		/*
		 * Check against pktinfo.
		 */
		if (pktinfo_len != 0 &&
		    hn_rndis_check_overlap(oob_off, oob_len,
		    pktinfo_off, pktinfo_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps pktinfo, oob abs %d len %d, "
			    "pktinfo abs %d len %d\n",
			    oob_off, oob_len, pktinfo_off, pktinfo_len);
			return;
		}
	}

	/*
	 * Check per-packet-info coverage and find useful per-packet-info.
	 */
	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
	if (__predict_true(pktinfo_len != 0)) {
		bool overlap;
		int error;

		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overflow, msglen %u, "
			    "pktinfo abs %d len %d\n",
			    pkt->rm_len, pktinfo_off, pktinfo_len);
			return;
		}

		/*
		 * Check packet info coverage.
		 */
		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
		    data_off, data_len);
		if (__predict_false(overlap)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overlaps data, pktinfo abs %d len %d, "
			    "data abs %d len %d\n",
			    pktinfo_off, pktinfo_len, data_off, data_len);
			return;
		}

		/*
		 * Find useful per-packet-info.
		 */
		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
		    pktinfo_len, &info);
		if (__predict_false(error)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
			    "pktinfo\n");
			return;
		}
	}

	if (__predict_false(data_off + data_len > pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data overflow, msglen %u, data abs %d len %d\n",
		    pkt->rm_len, data_off, data_len);
		return;
	}

	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
}

static __inline void
hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_msghdr *hdr;

	if (__predict_false(dlen < sizeof(*hdr))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
		return;
	}
	hdr = data;

	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
		/* Hot data path. */
		hn_rndis_rx_data(rxr, data, dlen);
		return;
	}

	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
	else
		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
}

static void
hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
{
	const struct hn_nvs_hdr *hdr;

	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
		if_printf(sc->hn_ifp, "invalid nvs notify\n");
		return;
	}
	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);

	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
		/* Useless; ignore */
		return;
	}
	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
}

static void
hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt)
{
	struct hn_nvs_sendctx *sndc;

	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
	    VMBUS_CHANPKT_DATALEN(pkt));
	/*
	 * NOTE:
	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
	 * its callback.
	 */
}
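
/*
 * The completion path above relies on the send side stashing the
 * hn_nvs_sendctx pointer in the transaction id of the channel packet,
 * so no lookup table is needed to recover the context.  A hypothetical
 * matching send would look like this ('req' and 'reqlen' are made-up
 * names; VMBUS_CHANPKT_FLAG_RC requests the completion):
 */
#if 0
	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND,
	    VMBUS_CHANPKT_FLAG_RC, req, reqlen,
	    (uint64_t)(uintptr_t)sndc);
#endif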

static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

	/* Make sure that this is a RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}

	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}

	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}
		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
	}

	/*
	 * Ack the consumed RXBUF associated w/ this channel packet,
	 * so that this RXBUF can be recycled by the hypervisor.
	 */
	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}

static void
hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    uint64_t tid)
{
	struct hn_nvs_rndis_ack ack;
	int retries, error;

	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
	ack.nvs_status = HN_NVS_STATUS_OK;

	retries = 0;
again:
	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
	if (__predict_false(error == EAGAIN)) {
		/*
		 * NOTE:
		 * This should _not_ happen in real world, since the
		 * consumption of the TX bufring from the TX path is
		 * controlled.
		 */
		if (rxr->hn_ack_failed == 0)
			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
		rxr->hn_ack_failed++;
		retries++;
		if (retries < 10) {
			DELAY(100);
			goto again;
		}
		/* RXBUF leaks! */
		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
	}
}

static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
	struct hn_rx_ring *rxr = xrxr;
	struct hn_softc *sc = rxr->hn_ifp->if_softc;

	for (;;) {
		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
		int error, pktlen;

		pktlen = rxr->hn_pktbuf_len;
		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
		if (__predict_false(error == ENOBUFS)) {
			void *nbuf;
			int nlen;

			/*
			 * Expand channel packet buffer.
			 *
			 * XXX
			 * Use M_WAITOK here, since allocation failure
			 * is fatal.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);

			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;

		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}
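
/*
 * Sketch (not in the driver) of the pktbuf growth policy used above on
 * ENOBUFS: keep doubling until the reported packet length fits, so
 * repeated retries converge in O(log n) reallocations.
 */
#if 0
static int
pktbuf_grow_size(int cur, int need)
{
	int nlen = cur * 2;

	while (nlen < need)
		nlen *= 2;
	return (nlen);
}
#endif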

static void
hn_tx_taskq_create(void *arg __unused)
{
	int i;

	/*
	 * Fix the # of TX taskqueues.
	 */
	if (hn_tx_taskq_cnt <= 0)
		hn_tx_taskq_cnt = 1;
	else if (hn_tx_taskq_cnt > mp_ncpus)
		hn_tx_taskq_cnt = mp_ncpus;

	/*
	 * Fix the TX taskqueue mode.
	 */
	switch (hn_tx_taskq_mode) {
	case HN_TX_TASKQ_M_INDEP:
	case HN_TX_TASKQ_M_GLOBAL:
	case HN_TX_TASKQ_M_EVTTQ:
		break;
	default:
		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
		break;
	}

	if (vm_guest != VM_GUEST_HV)
		return;

	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
		return;

	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
	    M_DEVBUF, M_WAITOK);
	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
		    "hn tx%d", i);
	}
}
SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_create, NULL);
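
/*
 * Note on the idiom above: taskqueue_thread_enqueue() takes a pointer to
 * the taskqueue pointer itself (&hn_tx_taskque[i]), which
 * taskqueue_create() fills in before the queue is used; this is the
 * standard FreeBSD pattern for a threaded taskqueue.  A minimal
 * standalone sketch with hypothetical names:
 */
#if 0
static struct taskqueue *example_tq;

static void
example_tq_init(void)
{
	example_tq = taskqueue_create("example", M_WAITOK,
	    taskqueue_thread_enqueue, &example_tq);
	taskqueue_start_threads(&example_tq, 1, PI_NET, "example tq");
}
#endif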

static void
hn_tx_taskq_destroy(void *arg __unused)
{

	if (hn_tx_taskque != NULL) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(hn_tx_taskque[i]);
		free(hn_tx_taskque, M_DEVBUF);
	}
}
SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_destroy, NULL);