 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/ethernet.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/rndis.h>
#include <net/rss_config.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"

#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX 8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT 512

#define HN_RNDIS_PKT_LEN \
	(sizeof(struct rndis_packet_msg) + \
	HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \
	HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \
	HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \
	HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
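/*
 * NOTE: HN_RNDIS_PKT_LEN above reserves room for the RNDIS packet
 * message header plus the four per-packet-info records this driver may
 * attach (hash value, VLAN, LSOv2, TX checksum).  The boundary and
 * alignment below constrain the pre-allocated per-txdesc RNDIS packet
 * buffers: each must not cross a page and stays cache-line aligned.
 */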
#define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY PAGE_SIZE
#define HN_TX_DATA_MAXSIZE IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF 128

#define HN_EARLY_TXEOF_THRESH 8

#define HN_PKTBUF_LEN_DEF (16 * 1024)

#define HN_LROENT_CNT_DEF 128

#define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF (25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF 1

#define HN_LOCK_INIT(sc) \
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc) \
	while (sx_try_xlock(&(sc)->hn_lock) == 0) \
#define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc) \
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc) \
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
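/*
 * HN_PKTSIZE_MIN()/HN_PKTSIZE() give the chimney sending buffer space
 * a packet consumes: the RNDIS packet message plus the frame payload,
 * rounded up to the aggregation alignment.
 */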
#define HN_PKTSIZE_MIN(align) \
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align) \
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
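/*
 * Map a ring index to a CPU: with RSS the ring is bound to the CPU of
 * the corresponding RSS bucket; otherwise rings are spread round-robin
 * across all CPUs, starting from this device's leader CPU (hn_cpu).
 */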
#define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets())
#define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus)

#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc) link;
	STAILQ_ENTRY(hn_txdesc) agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc) agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct hn_tx_ring *txr;
	uint32_t flags; /* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx send_ctx;
	bus_dmamap_t data_dmap;
	bus_addr_t rndis_pkt_paddr;
	struct rndis_packet_msg *rndis_pkt;
	bus_dmamap_t rndis_pkt_dmap;

#define HN_TXD_FLAG_ONLIST 0x0001
#define HN_TXD_FLAG_DMAMAP 0x0002
#define HN_TXD_FLAG_ONAGG 0x0004

#define HN_RXINFO_VLAN 0x0001
#define HN_RXINFO_CSUM 0x0002
#define HN_RXINFO_HASHINF 0x0004
#define HN_RXINFO_HASHVAL 0x0008
#define HN_RXINFO_ALL \
	HN_RXINFO_HASHINF | \

#define HN_NDIS_VLAN_INFO_INVALID 0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID 0
#define HN_NDIS_HASH_INFO_INVALID 0

static int hn_probe(device_t);
static int hn_attach(device_t);
static int hn_detach(device_t);
static int hn_shutdown(device_t);
static void hn_chan_callback(struct vmbus_channel *,
static void hn_init(void *);
static int hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void hn_start(struct ifnet *);
static int hn_transmit(struct ifnet *, struct mbuf *);
static void hn_xmit_qflush(struct ifnet *);
static int hn_ifmedia_upd(struct ifnet *);
static void hn_ifmedia_sts(struct ifnet *,
    struct ifmediareq *);
static int hn_rndis_rxinfo(const void *, int,
static void hn_rndis_rx_data(struct hn_rx_ring *,
static void hn_rndis_rx_status(struct hn_softc *,
static void hn_nvs_handle_notify(struct hn_softc *,
    const struct vmbus_chanpkt_hdr *);
static void hn_nvs_handle_comp(struct hn_softc *,
    struct vmbus_channel *,
    const struct vmbus_chanpkt_hdr *);
static void hn_nvs_handle_rxbuf(struct hn_rx_ring *,
    struct vmbus_channel *,
    const struct vmbus_chanpkt_hdr *);
static void hn_nvs_ack_rxbuf(struct hn_rx_ring *,
    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);

static void hn_stop(struct hn_softc *);
static void hn_init_locked(struct hn_softc *);
static int hn_chan_attach(struct hn_softc *,
    struct vmbus_channel *);
static void hn_chan_detach(struct hn_softc *,
    struct vmbus_channel *);
static int hn_attach_subchans(struct hn_softc *);
static void hn_detach_allchans(struct hn_softc *);
static void hn_chan_rollup(struct hn_rx_ring *,
    struct hn_tx_ring *);
static void hn_set_ring_inuse(struct hn_softc *, int);
static int hn_synth_attach(struct hn_softc *, int);
static void hn_synth_detach(struct hn_softc *);
static int hn_synth_alloc_subchans(struct hn_softc *,
static bool hn_synth_attachable(const struct hn_softc *);
static void hn_suspend(struct hn_softc *);
static void hn_suspend_data(struct hn_softc *);
static void hn_suspend_mgmt(struct hn_softc *);
static void hn_resume(struct hn_softc *);
static void hn_resume_data(struct hn_softc *);
static void hn_resume_mgmt(struct hn_softc *);
static void hn_suspend_mgmt_taskfunc(void *, int);
static void hn_chan_drain(struct hn_softc *,
    struct vmbus_channel *);

static void hn_update_link_status(struct hn_softc *);
static void hn_change_network(struct hn_softc *);
static void hn_link_taskfunc(void *, int);
static void hn_netchg_init_taskfunc(void *, int);
static void hn_netchg_status_taskfunc(void *, int);
static void hn_link_status(struct hn_softc *);

static int hn_create_rx_data(struct hn_softc *, int);
static void hn_destroy_rx_data(struct hn_softc *);
static int hn_check_iplen(const struct mbuf *, int);
static int hn_set_rxfilter(struct hn_softc *);
static int hn_rss_reconfig(struct hn_softc *);
static void hn_rss_ind_fixup(struct hn_softc *);
static int hn_rxpkt(struct hn_rx_ring *, const void *,
    int, const struct hn_rxinfo *);

static int hn_tx_ring_create(struct hn_softc *, int);
static void hn_tx_ring_destroy(struct hn_tx_ring *);
static int hn_create_tx_data(struct hn_softc *, int);
static void hn_fixup_tx_data(struct hn_softc *);
static void hn_destroy_tx_data(struct hn_softc *);
static void hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void hn_txdesc_gc(struct hn_tx_ring *,
static int hn_encap(struct ifnet *, struct hn_tx_ring *,
    struct hn_txdesc *, struct mbuf **);
static int hn_txpkt(struct ifnet *, struct hn_tx_ring *,
static void hn_set_chim_size(struct hn_softc *, int);
static void hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool hn_tx_ring_pending(struct hn_tx_ring *);
static void hn_tx_ring_qflush(struct hn_tx_ring *);
static void hn_resume_tx(struct hn_softc *, int);
static void hn_set_txagg(struct hn_softc *);
static void *hn_try_txagg(struct ifnet *,
    struct hn_tx_ring *, struct hn_txdesc *,
static int hn_get_txswq_depth(const struct hn_tx_ring *);
static void hn_txpkt_done(struct hn_nvs_sendctx *,
    struct hn_softc *, struct vmbus_channel *,
static int hn_txpkt_sglist(struct hn_tx_ring *,
static int hn_txpkt_chim(struct hn_tx_ring *,
static int hn_xmit(struct hn_tx_ring *, int);
static void hn_xmit_taskfunc(void *, int);
static void hn_xmit_txeof(struct hn_tx_ring *);
static void hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int hn_start_locked(struct hn_tx_ring *, int);
static void hn_start_taskfunc(void *, int);
static void hn_start_txeof(struct hn_tx_ring *);
static void hn_start_txeof_taskfunc(void *, int);

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");
/* Trust tcp segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust udp datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");
/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP 0
#define HN_TX_TASKQ_M_GLOBAL 1
#define HN_TX_TASKQ_M_EVTTQ 2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
static int hn_use_txdesc_bufring = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");
/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

static u_int hn_cpu_index;	/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;	/* shared TX taskqueues */
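/*
 * Default RSS key: the standard 40-byte Toeplitz key recommended by
 * Microsoft for receive-side scaling.
 */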
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe, hn_probe),
	DEVMETHOD(device_attach, hn_attach),
	DEVMETHOD(device_detach, hn_detach),
	DEVMETHOD(device_shutdown, hn_shutdown),

static driver_t hn_driver = {
	sizeof(struct hn_softc)

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
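/*
 * Transmit an RNDIS data packet by handing the host a scatter-gather
 * list (txr->hn_gpa) that points at the RNDIS message and mbuf pages;
 * completion is reported through txd->send_ctx.
 */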
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
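/*
 * Transmit an RNDIS data packet that has already been copied into a
 * chimney (pre-mapped send) buffer; only the buffer index and size
 * need to travel over the channel.
 */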
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
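/*
 * Allocate a chimney sending buffer slot.  The free bitmap is scanned
 * with ffsl(3) and a slot is claimed with an atomic test-and-set, so
 * no lock is required; HN_NVS_CHIM_IDX_INVALID is returned when all
 * slots are in use.
 */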
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		idx = ffsl(~bmap[i]);
		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));
		if (atomic_testandset_long(&bmap[i], idx))
		ret = i * LONG_BIT + idx;

hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	    "bitmap idx %d, bitmask 0x%lx",
	    sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);

#if defined(INET6) || defined(INET)
 * NOTE: If this function fails, m_head will be freed.
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
	struct ether_vlan_header *evl;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

#define PULLUP_HDR(m, len) \
	if (__predict_false((m)->m_len < (len))) { \
		(m) = m_pullup((m), (len)); \

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
		ehlen = ETHER_HDR_LEN;

	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
#if defined(INET6) && defined(INET)
		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
#endif /* INET6 || INET */
hn_set_rxfilter(struct hn_softc *sc)
	struct ifnet *ifp = sc->hn_ifp;

	if (ifp->if_flags & IFF_PROMISC) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
			sc->hn_rx_filter = filter;

hn_set_txagg(struct hn_softc *sc)
	 * Setup aggregation size.
	if (sc->hn_agg_size < 0)
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
	/* NOTE: Type of the per TX ring setting is 'int'. */

	 * Setup aggregation packet count.
	if (sc->hn_agg_pkts < 0)
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	/* NOTE: Type of the per TX ring setting is 'short'. */
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {

	if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
	    size, pkts, sc->hn_rndis_agg_align);

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);

hn_get_txswq_depth(const struct hn_tx_ring *txr)
	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;

hn_rss_reconfig(struct hn_softc *sc)
	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)

	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
		if_printf(sc->hn_ifp, "RSS disable failed\n");

	 * Reenable the RSS w/ the updated RSS key or indirect
	if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");

hn_rss_ind_fixup(struct hn_softc *sc)
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	 * Check indirect table to make sure that all channels in it
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;

hn_ifmedia_upd(struct ifnet *ifp __unused)

hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;

/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
	    0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}

hn_probe(device_t dev)
	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
	    &g_net_vsc_device_type) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;

hn_attach(device_t dev)
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_prichan = vmbus_get_channel(dev);

	 * Initialize these tunables once.
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	 * Setup taskqueue for transmission.
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	 * Setup taskqueue for management tasks, e.g. link status.
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions that will be called after
	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed if an error happens later on.
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * The # of RX rings to use is same as the # of channels to use.
	ring_cnt = hn_chan_cnt;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		if (ring_cnt > rss_getnumbuckets())
			ring_cnt = rss_getnumbuckets();

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */

	 * Set the leader CPU for channels.
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	error = hn_create_tx_data(sc, tx_ring_cnt);
	error = hn_create_rx_data(sc, ring_cnt);

	 * Create transaction context for NVS and RNDIS transactions.
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {

	 * Install orphan handler for the revocation of this device's
	 *
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {

	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	error = hn_synth_attach(sc, ETHERMTU);

	error = hn_rndis_get_eaddr(sc, eaddr);

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);

	 * Fixup TX stuffs after synthetic parts are attached.
	hn_fixup_tx_data(sc);

	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");

	 * Don't allow RSS key/indirect table changes, if RSS is defined.
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");

	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");

	 * Setup the ifmedia, which has been initialized earlier.
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	 * Setup the ifnet for this interface.
	ifp->if_baudrate = IF_Gbps(10);
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
	/* We can't differentiate IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;

	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;
	 * Disable IPv6 TSO and TXCSUM by default; they can still
	 * be enabled through SIOCSIFCAP.
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	 * Kick off link status check.
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);

hn_detach(device_t dev)
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp;

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		 * In case that the vmbus missed the orphan handler
		vmbus_xact_ctx_orphan(sc->hn_xact);

	if (device_is_attached(dev)) {
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
			 * hn_stop() only suspends data, so management
			 * tasks have to be suspended manually here.
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);

		ether_ifdetach(ifp);

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(sc->hn_tx_taskqs[i]);
		free(sc->hn_tx_taskqs, M_DEVBUF);
	taskqueue_free(sc->hn_mgmt_taskq0);

	if (sc->hn_xact != NULL) {
		 * Uninstall the orphan handler _before_ the xact is
		vmbus_chan_unset_orphan(sc->hn_prichan);
		vmbus_xact_ctx_destroy(sc->hn_xact);

	HN_LOCK_DESTROY(sc);

hn_shutdown(device_t dev)

hn_link_status(struct hn_softc *sc)
	uint32_t link_status;

	error = hn_rndis_get_linkstatus(sc, &link_status);
		/* XXX what to do? */

	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp,
	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
	    LINK_STATE_UP : LINK_STATE_DOWN);

hn_link_taskfunc(void *xsc, int pending __unused)
	struct hn_softc *sc = xsc;

	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)

hn_netchg_init_taskfunc(void *xsc, int pending __unused)
	struct hn_softc *sc = xsc;

	/* Prevent any link status checks from running. */
	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
	 * Fake up a [link down --> link up] state change; a 5 second
	 * delay is used, which closely simulates the miibus reaction
	 * to a link down event.
	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
	    &sc->hn_netchg_status, 5 * hz);

hn_netchg_status_taskfunc(void *xsc, int pending __unused)
	struct hn_softc *sc = xsc;

	/* Re-allow link status checks. */
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;

hn_update_link_status(struct hn_softc *sc)
	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);

hn_change_network(struct hn_softc *sc)
	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
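/*
 * Load the mbuf chain into txd's DMA map.  If the chain has more than
 * HN_TX_DATA_SEGCNT_MAX segments, it is collapsed once with
 * m_collapse(9) and the load is retried; on success the map is synced
 * for PREWRITE.
 */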
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
	struct mbuf *m = *m_head;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
	    BUS_DMASYNC_PREWRITE);
	txd->flags |= HN_TXD_FLAG_DMAMAP;
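/*
 * Drop a reference on txd.  On the last reference: free any txdescs
 * aggregated onto it, release its chimney buffer or unload its DMA
 * map, free the mbuf, and put the txdesc back onto the free
 * list/buf_ring.  Returns non-zero if the txdesc was actually freed.
 */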
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
1410 ("resursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			    "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			    "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;

	if (txd->m != NULL) {

	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
	txr->hn_txdesc_avail++;
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else /* HN_USE_TXDESC_BUFRING */
	atomic_add_int(&txr->hn_txdesc_avail, 1);
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif /* !HN_USE_TXDESC_BUFRING */

static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
	struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	txd = SLIST_FIRST(&txr->hn_txlist);
		KASSERT(txr->hn_txdesc_avail > 0,
		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
		txr->hn_txdesc_avail--;
		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);

#ifdef HN_USE_TXDESC_BUFRING
		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif /* HN_USE_TXDESC_BUFRING */
		KASSERT(txd->m == NULL && txd->refs == 0 &&
		    STAILQ_EMPTY(&txd->agg_list) &&
		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
		    txd->chim_size == 0 &&
		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
		txd->flags &= ~HN_TXD_FLAG_ONLIST;

static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
	/* 0->1 transition will never work */
	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	atomic_add_int(&txd->refs, 1);

static __inline void
hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("recursive aggregation on aggregating txdesc"));

	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("already aggregated"));
	KASSERT(STAILQ_EMPTY(&txd->agg_list),
	    ("recursive aggregation on to-be-aggregated txdesc"));

	txd->flags |= HN_TXD_FLAG_ONAGG;
	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);

hn_tx_ring_pending(struct hn_tx_ring *txr)
	bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
	mtx_unlock_spin(&txr->hn_txlist_spin);
	if (!buf_ring_full(txr->hn_txdesc_br))

static __inline void
hn_txeof(struct hn_tx_ring *txr)
	txr->hn_has_txeof = 0;

hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
	struct hn_txdesc *txd = sndc->hn_cbarg;
	struct hn_tx_ring *txr;

	KASSERT(txr->hn_chan == chan,
	    ("channel mismatch, on chan%u, should be chan%u",
	    vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));

	txr->hn_has_txeof = 1;
	hn_txdesc_put(txr, txd);

	++txr->hn_txdone_cnt;
	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
		txr->hn_txdone_cnt = 0;
		if (txr->hn_oactive)

hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
#if defined(INET) || defined(INET6)
	tcp_lro_flush_all(&rxr->hn_lro);

	 * 'txr' could be NULL, if multiple channels and
	 * ifnet.if_start method are enabled.
	if (txr == NULL || !txr->hn_has_txeof)

	txr->hn_txdone_cnt = 0;
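/*
 * On-wire RNDIS offsets are measured from the rm_dataoffset field
 * rather than from the start of the message; convert an offset that
 * was computed from the beginning of rndis_packet_msg accordingly.
 */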
static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
	    ("invalid RNDIS packet msg offset %u", ofs));
	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));

static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
	struct rndis_pktinfo *pi;

	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

	 * Per-packet-info does not move; it only grows.
	 * rm_pktinfooffset in this phase counts from the beginning
	 * of rndis_packet_msg.
	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
	    pkt->rm_pktinfolen);
	pkt->rm_pktinfolen += pi_size;

	pi->rm_size = pi_size;
	pi->rm_type = pi_type;
	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

	/* Data immediately follow per-packet-info. */
	pkt->rm_dataoffset += pi_size;

	/* Update RNDIS packet msg length */
	pkt->rm_len += pi_size;

	return (pi->rm_data);
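/*
 * Flush the pending aggregated txdesc: send the packed chimney buffer
 * out and reset the per-ring aggregation state.
 */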
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
	struct hn_txdesc *txd;

	txd = txr->hn_agg_txd;
	KASSERT(txd != NULL, ("no aggregate txdesc"));
	 * Since hn_txpkt() will reset this temporary stat, save
	 * it now so that oerrors can be updated properly if
	 * hn_txpkt() ever fails.
	pkts = txr->hn_stat_pkts;

	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
	 * failure, save it for later freeing, if hn_txpkt() ever

	error = hn_txpkt(ifp, txr, txd);
	if (__predict_false(error)) {
		/* txd is freed, but m is not. */

		txr->hn_flush_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);

	/* Reset all aggregation states. */
	txr->hn_agg_txd = NULL;
	txr->hn_agg_szleft = 0;
	txr->hn_agg_pktleft = 0;
	txr->hn_agg_prevpkt = NULL;
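/*
 * Try to reserve chimney buffer space for this packet, continuing an
 * in-flight aggregation when it still fits.  Returns a pointer into
 * the chimney buffer where the RNDIS packet should be built, or NULL
 * to fall back to the sglist (GPA) path.
 */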
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
	if (txr->hn_agg_txd != NULL) {
		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
			 * Update the previous RNDIS packet's total length;
			 * it can be increased due to the mandatory alignment
			 * padding for this RNDIS packet.  And update the
			 * aggregating txdesc's chimney sending buffer size
			 * Zero-out the padding, as required by the RNDIS spec.
			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
			agg_txd->chim_size += pkt->rm_len - olen;

			/* Link this txdesc to the parent. */
			hn_txdesc_agg(agg_txd, txd);

			chim = (uint8_t *)pkt + pkt->rm_len;
			/* Save the current packet for later fixup. */
			txr->hn_agg_prevpkt = chim;

			txr->hn_agg_pktleft--;
			txr->hn_agg_szleft -= pktsize;
			if (txr->hn_agg_szleft <=
			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
				 * Probably can't aggregate more packets;
				 * flush this aggregating txdesc proactively.
				txr->hn_agg_pktleft = 0;

		hn_flush_txagg(ifp, txr);
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	txr->hn_tx_chimney_tried++;
	txd->chim_index = hn_chim_alloc(txr->hn_sc);
	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
	txr->hn_tx_chimney++;

	chim = txr->hn_sc->hn_chim +
	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);

	if (txr->hn_agg_pktmax > 1 &&
	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
		txr->hn_agg_txd = txd;
		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
		txr->hn_agg_prevpkt = chim;

 * If this function fails, then both txd and m_head0 will be freed.
hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head0)
	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
	int error, nsegs, i;
	struct mbuf *m_head = *m_head0;
	struct rndis_packet_msg *pkt;
	int pkt_hlen, pkt_size;

	pkt = txd->rndis_pkt;
	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
	if (pkt_size < txr->hn_chim_size) {
		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
		if (txr->hn_agg_txd != NULL)
			hn_flush_txagg(ifp, txr);

	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
	pkt->rm_dataoffset = sizeof(*pkt);
	pkt->rm_datalen = m_head->m_pkthdr.len;
	pkt->rm_oobdataoffset = 0;
	pkt->rm_oobdatalen = 0;
	pkt->rm_oobdataelements = 0;
	pkt->rm_pktinfooffset = sizeof(*pkt);
	pkt->rm_pktinfolen = 0;
	pkt->rm_vchandle = 0;
	pkt->rm_reserved = 0;

	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
		 * Set the hash value for this packet, so that the host could
		 * dispatch the TX done event for this packet back to this TX
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
		*pi_data = txr->hn_tx_idx;

	if (m_head->m_flags & M_VLANTAG) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
		*pi_data = NDIS_VLAN_INFO_MAKE(
		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));

	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
			    m_head->m_pkthdr.tso_segsz);
#if defined(INET6) && defined(INET)
			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
			    m_head->m_pkthdr.tso_segsz);
#endif /* INET6 || INET */
	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
		if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
			*pi_data = NDIS_TXCSUM_INFO_IPV6;
			*pi_data = NDIS_TXCSUM_INFO_IPV4;
			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
				*pi_data |= NDIS_TXCSUM_INFO_IPCS;

		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
		else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP6_UDP))
			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;

	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
	/* Convert RNDIS packet message offsets */
	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);

	 * Fast path: Chimney sending.
		struct hn_txdesc *tgt_txd = txd;

		if (txr->hn_agg_txd != NULL) {
			tgt_txd = txr->hn_agg_txd;

		KASSERT(pkt == chim,
		    ("RNDIS pkt not in chimney sending buffer"));
		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
		    ("chimney sending buffer is not used"));
		tgt_txd->chim_size += pkt->rm_len;

		m_copydata(m_head, 0, m_head->m_pkthdr.len,
		    ((uint8_t *)chim) + pkt_hlen);

		txr->hn_gpa_cnt = 0;
		txr->hn_sendpkt = hn_txpkt_chim;

	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
	    ("chimney buffer is used"));
	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));

	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
	if (__predict_false(error)) {
		 * This mbuf is not linked w/ the txd yet, so free it now.
		freed = hn_txdesc_put(txr, txd);
		    ("fail to free txd upon txdma error"));

		txr->hn_txdma_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);

	/* +1 RNDIS packet message */
	txr->hn_gpa_cnt = nsegs + 1;

	/* send packet with page buffer */
	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
	txr->hn_gpa[0].gpa_len = pkt_hlen;

	 * Fill the page buffers with mbuf info after the page
	 * buffer for RNDIS packet message.
	for (i = 0; i < nsegs; ++i) {
		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];

		gpa->gpa_page = atop(segs[i].ds_addr);
		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
		gpa->gpa_len = segs[i].ds_len;

	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
	txr->hn_sendpkt = hn_txpkt_sglist;

	/* Set the completion routine */
	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);

	/* Update temporary stats for later use. */
	txr->hn_stat_pkts++;
	txr->hn_stat_size += m_head->m_pkthdr.len;
	if (m_head->m_flags & M_MCAST)
		txr->hn_stat_mcasts++;
 * If this function fails, then txd will be freed, but the mbuf
 * associated w/ the txd will _not_ be freed.
hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
	int error, send_failed = 0, has_bpf;

	has_bpf = bpf_peers_present(ifp->if_bpf);
	 * Make sure that this txd and any aggregated txds are not
	 * freed before ETHER_BPF_MTAP.
	hn_txdesc_hold(txd);
	error = txr->hn_sendpkt(txr, txd);
		const struct hn_txdesc *tmp_txd;

		ETHER_BPF_MTAP(ifp, txd->m);
		STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
			ETHER_BPF_MTAP(ifp, tmp_txd->m);

	if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
#ifdef HN_IFSTART_SUPPORT
	if (!hn_use_if_start)
	if_inc_counter(ifp, IFCOUNTER_OBYTES,
	if (txr->hn_stat_mcasts != 0) {
		if_inc_counter(ifp, IFCOUNTER_OMCASTS,
		    txr->hn_stat_mcasts);

	txr->hn_pkts += txr->hn_stat_pkts;
	hn_txdesc_put(txr, txd);

	if (__predict_false(error)) {
		 * This should "really rarely" happen.
		 *
		 * XXX Too many RX to be acked or too many sideband
		 * commands to run?  Ask netvsc_channel_rollup()
		 * to kick start later.
		txr->hn_has_txeof = 1;
		txr->hn_send_failed++;
		 * Try sending again after setting hn_has_txeof,
		 * in case we missed the last
		 * netvsc_channel_rollup().
		if_printf(ifp, "send failed\n");

		 * Caller will perform further processing on the
		 * associated mbuf, so don't free it in hn_txdesc_put();
		 * only unload it from the DMA map in hn_txdesc_put(),
		freed = hn_txdesc_put(txr, txd);
		    ("fail to free txd upon send error"));

		txr->hn_send_failed++;
	/* Reset temporary stats after this sending is done. */
	txr->hn_stat_size = 0;
	txr->hn_stat_pkts = 0;
	txr->hn_stat_mcasts = 0;
 * Append the specified data to the indicated mbuf chain.
 * Extend the mbuf chain if the new data does not fit in
 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
 * There should be an equivalent in the kernel mbuf code,
 * but there does not appear to be one yet.
 *
 * Differs from m_append() in that additional mbufs are
 * allocated with cluster size MJUMPAGESIZE, and filled
 *
 * Return 1 if able to complete the job; otherwise 0.
hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
	int remainder, space;

	for (m = m0; m->m_next != NULL; m = m->m_next)
	space = M_TRAILINGSPACE(m);
	 * Copy into available space.
	if (space > remainder)
	bcopy(cp, mtod(m, caddr_t) + m->m_len, space);

	while (remainder > 0) {
		 * Allocate a new mbuf; could check space
		 * and allocate a cluster instead.
		n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
		n->m_len = min(MJUMPAGESIZE, remainder);
		bcopy(cp, mtod(n, caddr_t), n->m_len);
		remainder -= n->m_len;
	if (m0->m_flags & M_PKTHDR)
		m0->m_pkthdr.len += len - remainder;

	return (remainder == 0);

#if defined(INET) || defined(INET6)
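/*
 * Queue the mbuf for sorted LRO when hn_lro_mbufq_depth is configured;
 * otherwise run it through tcp_lro_rx() directly.
 */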
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
#if __FreeBSD_version >= 1100095
	if (hn_lro_mbufq_depth) {
		tcp_lro_queue_mbuf(lc, m);
	return tcp_lro_rx(lc, m, 0);

hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
    const struct hn_rxinfo *info)
	struct ifnet *ifp = rxr->hn_ifp;
	int size, do_lro = 0, do_csum = 1;

	if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))

	 * Bail out if packet contains more data than configured MTU.
	if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
	} else if (dlen <= MHLEN) {
		m_new = m_gethdr(M_NOWAIT, MT_DATA);
		if (m_new == NULL) {
			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
		memcpy(mtod(m_new, void *), data, dlen);
		m_new->m_pkthdr.len = m_new->m_len = dlen;
		rxr->hn_small_pkts++;
		 * Get an mbuf with a cluster.  For packets 2K or less,
		 * get a standard 2K cluster.  For anything larger, get a
		 * 4K cluster.  Any buffers larger than 4K can cause problems
		 * if looped around to the Hyper-V TX channel, so avoid them.
		if (dlen > MCLBYTES) {
			size = MJUMPAGESIZE;

		m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
		if (m_new == NULL) {
			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);

		hv_m_append(m_new, dlen, data);
	m_new->m_pkthdr.rcvif = ifp;

	if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))

	/* receive side checksum offload */
	if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
		/* IP csum offload */
		if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
			m_new->m_pkthdr.csum_flags |=
			    (CSUM_IP_CHECKED | CSUM_IP_VALID);

		/* TCP/UDP csum offload */
		if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
		    NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
			m_new->m_pkthdr.csum_flags |=
			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
			m_new->m_pkthdr.csum_data = 0xffff;
			if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
		 * As of this writing (Oct 28th, 2016), the host side will turn
		 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
		 * the do_lro setting here is actually _not_ accurate.  We
		 * depend on the RSS hash type check to reset do_lro.
		if ((info->csum_info &
		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
		    (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
		const struct ether_header *eh;

		if (m_new->m_len < hoff)
		eh = mtod(m_new, struct ether_header *);
		etype = ntohs(eh->ether_type);
		if (etype == ETHERTYPE_VLAN) {
			const struct ether_vlan_header *evl;

			hoff = sizeof(*evl);
			if (m_new->m_len < hoff)
			evl = mtod(m_new, struct ether_vlan_header *);
			etype = ntohs(evl->evl_proto);

		if (etype == ETHERTYPE_IP) {
			pr = hn_check_iplen(m_new, hoff);
			if (pr == IPPROTO_TCP) {
				    (rxr->hn_trust_hcsum &
				    HN_TRUST_HCSUM_TCP)) {
					rxr->hn_csum_trusted++;
					m_new->m_pkthdr.csum_flags |=
					    (CSUM_IP_CHECKED | CSUM_IP_VALID |
					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
					m_new->m_pkthdr.csum_data = 0xffff;
			} else if (pr == IPPROTO_UDP) {
				    (rxr->hn_trust_hcsum &
				    HN_TRUST_HCSUM_UDP)) {
					rxr->hn_csum_trusted++;
					m_new->m_pkthdr.csum_flags |=
					    (CSUM_IP_CHECKED | CSUM_IP_VALID |
					    CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
					m_new->m_pkthdr.csum_data = 0xffff;
			} else if (pr != IPPROTO_DONE && do_csum &&
			    (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
				rxr->hn_csum_trusted++;
				m_new->m_pkthdr.csum_flags |=
				    (CSUM_IP_CHECKED | CSUM_IP_VALID);

	if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
		m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
		    NDIS_VLAN_INFO_ID(info->vlan_info),
		    NDIS_VLAN_INFO_PRI(info->vlan_info),
		    NDIS_VLAN_INFO_CFI(info->vlan_info));
		m_new->m_flags |= M_VLANTAG;

	if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
		m_new->m_pkthdr.flowid = info->hash_value;
		hash_type = M_HASHTYPE_OPAQUE_HASH;
		if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
		    NDIS_HASH_FUNCTION_TOEPLITZ) {
			uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2263 * do_lro is reset if the hash types are not TCP
2264 * related. See the comment in the above csum_flags
2268 case NDIS_HASH_IPV4:
2269 hash_type = M_HASHTYPE_RSS_IPV4;
2273 case NDIS_HASH_TCP_IPV4:
2274 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2277 case NDIS_HASH_IPV6:
2278 hash_type = M_HASHTYPE_RSS_IPV6;
2282 case NDIS_HASH_IPV6_EX:
2283 hash_type = M_HASHTYPE_RSS_IPV6_EX;
2287 case NDIS_HASH_TCP_IPV6:
2288 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2291 case NDIS_HASH_TCP_IPV6_EX:
2292 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2297 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2298 hash_type = M_HASHTYPE_OPAQUE;
2300 M_HASHTYPE_SET(m_new, hash_type);
2303 * Note: Moved RX completion back to hv_nv_on_receive() so all
2304 * messages (not just data messages) will trigger a response.
2307 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
2310 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2311 #if defined(INET) || defined(INET6)
2312 struct lro_ctrl *lro = &rxr->hn_lro;
2315 rxr->hn_lro_tried++;
2316 if (hn_lro_rx(lro, m_new) == 0) {
2324 /* We're not holding the lock here, so don't release it */
2325 (*ifp->if_input)(ifp, m_new);
2331 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2333 struct hn_softc *sc = ifp->if_softc;
2334 struct ifreq *ifr = (struct ifreq *)data;
2335 int mask, error = 0;
2339 if (ifr->ifr_mtu > HN_MTU_MAX) {
2346 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2351 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2352 /* Can't change MTU */
2358 if (ifp->if_mtu == ifr->ifr_mtu) {
2364 * Suspend this interface before the synthetic parts
2370 * Detach the synthetic parts, i.e. NVS and RNDIS.
2372 hn_synth_detach(sc);
2375 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2376 * with the new MTU setting.
2378 error = hn_synth_attach(sc, ifr->ifr_mtu);
2385 * Commit the requested MTU, after the synthetic parts
2386 * have been successfully attached.
2388 ifp->if_mtu = ifr->ifr_mtu;
2391 * Make sure that various parameters based on MTU are
2392 * still valid, after the MTU change.
2394 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2395 hn_set_chim_size(sc, sc->hn_chim_szmax);
2396 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2397 #if __FreeBSD_version >= 1100099
2398 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2399 HN_LRO_LENLIM_MIN(ifp))
2400 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2404 * All done! Resume the interface now.
2414 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2419 if (ifp->if_flags & IFF_UP) {
2420 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2422 * Caller might hold a mutex, e.g.
2423 * bpf; use busy-wait for the RNDIS
2427 hn_set_rxfilter(sc);
2433 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2436 sc->hn_if_flags = ifp->if_flags;
2443 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2445 if (mask & IFCAP_TXCSUM) {
2446 ifp->if_capenable ^= IFCAP_TXCSUM;
2447 if (ifp->if_capenable & IFCAP_TXCSUM)
2448 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2450 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2452 if (mask & IFCAP_TXCSUM_IPV6) {
2453 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2454 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2455 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2457 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2460 /* TODO: flip RNDIS offload parameters for RXCSUM. */
2461 if (mask & IFCAP_RXCSUM)
2462 ifp->if_capenable ^= IFCAP_RXCSUM;
2464 /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2465 if (mask & IFCAP_RXCSUM_IPV6)
2466 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2469 if (mask & IFCAP_LRO)
2470 ifp->if_capenable ^= IFCAP_LRO;
2472 if (mask & IFCAP_TSO4) {
2473 ifp->if_capenable ^= IFCAP_TSO4;
2474 if (ifp->if_capenable & IFCAP_TSO4)
2475 ifp->if_hwassist |= CSUM_IP_TSO;
2477 ifp->if_hwassist &= ~CSUM_IP_TSO;
2479 if (mask & IFCAP_TSO6) {
2480 ifp->if_capenable ^= IFCAP_TSO6;
2481 if (ifp->if_capenable & IFCAP_TSO6)
2482 ifp->if_hwassist |= CSUM_IP6_TSO;
2484 ifp->if_hwassist &= ~CSUM_IP6_TSO;
2494 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2498 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2500 * Multicast uses mutex; use busy-wait for
2504 hn_set_rxfilter(sc);
2513 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2517 error = ether_ioctl(ifp, cmd, data);
2524 hn_stop(struct hn_softc *sc)
2526 struct ifnet *ifp = sc->hn_ifp;
2531 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2532 ("synthetic parts were not attached"));
2534 /* Clear RUNNING bit _before_ hn_suspend_data() */
2535 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2536 hn_suspend_data(sc);
2538 /* Clear OACTIVE bit. */
2539 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2540 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2541 sc->hn_tx_ring[i].hn_oactive = 0;
2545 hn_init_locked(struct hn_softc *sc)
2547 struct ifnet *ifp = sc->hn_ifp;
2552 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2555 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2558 /* Configure RX filter */
2559 hn_set_rxfilter(sc);
2561 /* Clear OACTIVE bit. */
2562 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2563 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2564 sc->hn_tx_ring[i].hn_oactive = 0;
2566 /* Clear TX 'suspended' bit. */
2567 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2569 /* Everything is ready; unleash! */
2570 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2576 struct hn_softc *sc = xsc;
2583 #if __FreeBSD_version >= 1100099
2586 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2588 struct hn_softc *sc = arg1;
2589 unsigned int lenlim;
2592 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2593 error = sysctl_handle_int(oidp, &lenlim, 0, req);
2594 if (error || req->newptr == NULL)
2598 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2599 lenlim > TCP_LRO_LENGTH_MAX) {
2603 hn_set_lro_lenlim(sc, lenlim);
2610 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2612 struct hn_softc *sc = arg1;
2613 int ackcnt, error, i;
2616 * lro_ackcnt_lim is the append count limit;
2617 * +1 turns it into the aggregation limit.
2619 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2620 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2621 if (error || req->newptr == NULL)
2624 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2628 * Convert aggregation limit back to append
2633 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2634 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
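/*
 * Worked example of the conversion above: writing 2 (the minimum
 * accepted value) through this sysctl stores an append count limit of
 * 1, i.e. at most one ACK is appended to an aggregated ACK, and a
 * subsequent read reports 1 + 1 = 2 again.
 */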
2642 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2644 struct hn_softc *sc = arg1;
2649 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2652 error = sysctl_handle_int(oidp, &on, 0, req);
2653 if (error || req->newptr == NULL)
2657 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2658 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2661 rxr->hn_trust_hcsum |= hcsum;
2663 rxr->hn_trust_hcsum &= ~hcsum;
2670 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2672 struct hn_softc *sc = arg1;
2673 int chim_size, error;
2675 chim_size = sc->hn_tx_ring[0].hn_chim_size;
2676 error = sysctl_handle_int(oidp, &chim_size, 0, req);
2677 if (error || req->newptr == NULL)
2680 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2684 hn_set_chim_size(sc, chim_size);
2689 #if __FreeBSD_version < 1100095
2691 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2693 struct hn_softc *sc = arg1;
2694 int ofs = arg2, i, error;
2695 struct hn_rx_ring *rxr;
2699 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2700 rxr = &sc->hn_rx_ring[i];
2701 stat += *((int *)((uint8_t *)rxr + ofs));
2704 error = sysctl_handle_64(oidp, &stat, 0, req);
2705 if (error || req->newptr == NULL)
2708 /* Zero out this stat. */
2709 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2710 rxr = &sc->hn_rx_ring[i];
2711 *((int *)((uint8_t *)rxr + ofs)) = 0;
2717 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2719 struct hn_softc *sc = arg1;
2720 int ofs = arg2, i, error;
2721 struct hn_rx_ring *rxr;
2725 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2726 rxr = &sc->hn_rx_ring[i];
2727 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2730 error = sysctl_handle_64(oidp, &stat, 0, req);
2731 if (error || req->newptr == NULL)
2734 /* Zero out this stat. */
2735 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2736 rxr = &sc->hn_rx_ring[i];
2737 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2745 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2747 struct hn_softc *sc = arg1;
2748 int ofs = arg2, i, error;
2749 struct hn_rx_ring *rxr;
2753 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2754 rxr = &sc->hn_rx_ring[i];
2755 stat += *((u_long *)((uint8_t *)rxr + ofs));
2758 error = sysctl_handle_long(oidp, &stat, 0, req);
2759 if (error || req->newptr == NULL)
2762 /* Zero out this stat. */
2763 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2764 rxr = &sc->hn_rx_ring[i];
2765 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
2771 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2773 struct hn_softc *sc = arg1;
2774 int ofs = arg2, i, error;
2775 struct hn_tx_ring *txr;
2779 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2780 txr = &sc->hn_tx_ring[i];
2781 stat += *((u_long *)((uint8_t *)txr + ofs));
2784 error = sysctl_handle_long(oidp, &stat, 0, req);
2785 if (error || req->newptr == NULL)
2788 /* Zero out this stat. */
2789 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2790 txr = &sc->hn_tx_ring[i];
2791 *((u_long *)((uint8_t *)txr + ofs)) = 0;
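/*
 * The *_stat_*_sysctl handlers above implement read-and-clear
 * aggregate counters: arg2 carries the byte offset of a per-ring
 * statistic field, a read sums that field across all rings, and a
 * write zeroes it on every ring.  They are registered with
 * __offsetof(), e.g. (mirroring hn_create_tx_data() below):
 *
 *	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
 *	    CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
 *	    __offsetof(struct hn_tx_ring, hn_send_failed),
 *	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
 */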
2797 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2799 struct hn_softc *sc = arg1;
2800 int ofs = arg2, i, error, conf;
2801 struct hn_tx_ring *txr;
2803 txr = &sc->hn_tx_ring[0];
2804 conf = *((int *)((uint8_t *)txr + ofs));
2806 error = sysctl_handle_int(oidp, &conf, 0, req);
2807 if (error || req->newptr == NULL)
2811 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2812 txr = &sc->hn_tx_ring[i];
2813 *((int *)((uint8_t *)txr + ofs)) = conf;
2821 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2823 struct hn_softc *sc = arg1;
2826 size = sc->hn_agg_size;
2827 error = sysctl_handle_int(oidp, &size, 0, req);
2828 if (error || req->newptr == NULL)
2832 sc->hn_agg_size = size;
2840 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
2842 struct hn_softc *sc = arg1;
2845 pkts = sc->hn_agg_pkts;
2846 error = sysctl_handle_int(oidp, &pkts, 0, req);
2847 if (error || req->newptr == NULL)
2851 sc->hn_agg_pkts = pkts;
2859 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
2861 struct hn_softc *sc = arg1;
2864 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
2865 return (sysctl_handle_int(oidp, &pkts, 0, req));
2869 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
2871 struct hn_softc *sc = arg1;
2874 align = sc->hn_tx_ring[0].hn_agg_align;
2875 return (sysctl_handle_int(oidp, &align, 0, req));
2879 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2881 struct hn_softc *sc = arg1;
2884 snprintf(verstr, sizeof(verstr), "%u.%u",
2885 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2886 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2887 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2891 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2893 struct hn_softc *sc = arg1;
2900 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2901 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2905 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2907 struct hn_softc *sc = arg1;
2908 char assist_str[128];
2912 hwassist = sc->hn_ifp->if_hwassist;
2914 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2915 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2919 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2921 struct hn_softc *sc = arg1;
2922 char filter_str[128];
2926 filter = sc->hn_rx_filter;
2928 snprintf(filter_str, sizeof(filter_str), "%b", filter,
2930 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
2936 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2938 struct hn_softc *sc = arg1;
2943 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2944 if (error || req->newptr == NULL)
2947 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2950 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
2952 if (sc->hn_rx_ring_inuse > 1) {
2953 error = hn_rss_reconfig(sc);
2955 /* Not RSS capable, at least for now; just save the RSS key. */
2964 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
2966 struct hn_softc *sc = arg1;
2971 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2972 if (error || req->newptr == NULL)
2976 * Don't allow the RSS indirect table to be changed if this
2977 * interface is currently not RSS capable.
2979 if (sc->hn_rx_ring_inuse == 1) {
2984 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2987 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
2989 hn_rss_ind_fixup(sc);
2990 error = hn_rss_reconfig(sc);
2999 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
3001 struct hn_softc *sc = arg1;
3006 hash = sc->hn_rss_hash;
3008 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
3009 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
3013 hn_check_iplen(const struct mbuf *m, int hoff)
3015 const struct ip *ip;
3016 int len, iphlen, iplen;
3017 const struct tcphdr *th;
3018 int thoff; /* TCP data offset */
3020 len = hoff + sizeof(struct ip);
3022 /* The packet must be at least the size of an IP header. */
3023 if (m->m_pkthdr.len < len)
3024 return IPPROTO_DONE;
3026 /* The fixed IP header must reside completely in the first mbuf. */
3028 return IPPROTO_DONE;
3030 ip = mtodo(m, hoff);
3032 /* Bound check the packet's stated IP header length. */
3033 iphlen = ip->ip_hl << 2;
3034 if (iphlen < sizeof(struct ip)) /* minimum header length */
3035 return IPPROTO_DONE;
3037 /* The full IP header must reside completely in the one mbuf. */
3038 if (m->m_len < hoff + iphlen)
3039 return IPPROTO_DONE;
3041 iplen = ntohs(ip->ip_len);
3044 * Check that the amount of data in the buffers is at
3045 * least as much as the IP header would have us expect.
3047 if (m->m_pkthdr.len < hoff + iplen)
3048 return IPPROTO_DONE;
3051 * Ignore IP fragments.
3053 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3054 return IPPROTO_DONE;
3057 * The TCP/IP or UDP/IP header must be entirely contained within
3058 * the first fragment of a packet.
3062 if (iplen < iphlen + sizeof(struct tcphdr))
3063 return IPPROTO_DONE;
3064 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3065 return IPPROTO_DONE;
3066 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3067 thoff = th->th_off << 2;
3068 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3069 return IPPROTO_DONE;
3070 if (m->m_len < hoff + iphlen + thoff)
3071 return IPPROTO_DONE;
3074 if (iplen < iphlen + sizeof(struct udphdr))
3075 return IPPROTO_DONE;
3076 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3077 return IPPROTO_DONE;
3081 return IPPROTO_DONE;
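/*
 * Worked example of the checks above: for an untagged Ethernet frame,
 * hoff is ETHER_HDR_LEN (14).  A minimal IPv4 header has ip_hl 5, so
 * iphlen is 20; a minimal TCP header has th_off 5, so thoff is 20.
 * Such a TCP segment is thus accepted only if the first mbuf holds at
 * least 14 + 20 + 20 = 54 bytes and the IP total length covers both
 * headers; anything shorter yields IPPROTO_DONE.
 */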
3088 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3090 struct sysctl_oid_list *child;
3091 struct sysctl_ctx_list *ctx;
3092 device_t dev = sc->hn_dev;
3093 #if defined(INET) || defined(INET6)
3094 #if __FreeBSD_version >= 1100095
3101 * Create RXBUF for reception.
3104 * - It is shared by all channels.
3105 * - A large enough buffer is allocated; certain versions of NVS
3106 * may further limit the usable space.
3108 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3109 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3110 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3111 if (sc->hn_rxbuf == NULL) {
3112 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3116 sc->hn_rx_ring_cnt = ring_cnt;
3117 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3119 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3120 M_DEVBUF, M_WAITOK | M_ZERO);
3122 #if defined(INET) || defined(INET6)
3123 #if __FreeBSD_version >= 1100095
3124 lroent_cnt = hn_lro_entry_count;
3125 if (lroent_cnt < TCP_LRO_ENTRIES)
3126 lroent_cnt = TCP_LRO_ENTRIES;
3128 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3130 #endif /* INET || INET6 */
3132 ctx = device_get_sysctl_ctx(dev);
3133 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3135 /* Create dev.hn.UNIT.rx sysctl tree */
3136 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3137 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3139 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3140 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3142 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3143 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3144 &rxr->hn_br_dma, BUS_DMA_WAITOK);
3145 if (rxr->hn_br == NULL) {
3146 device_printf(dev, "allocate bufring failed\n");
3150 if (hn_trust_hosttcp)
3151 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3152 if (hn_trust_hostudp)
3153 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3154 if (hn_trust_hostip)
3155 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3156 rxr->hn_ifp = sc->hn_ifp;
3157 if (i < sc->hn_tx_ring_cnt)
3158 rxr->hn_txr = &sc->hn_tx_ring[i];
3159 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3160 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3162 rxr->hn_rxbuf = sc->hn_rxbuf;
3167 #if defined(INET) || defined(INET6)
3168 #if __FreeBSD_version >= 1100095
3169 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3170 hn_lro_mbufq_depth);
3172 tcp_lro_init(&rxr->hn_lro);
3173 rxr->hn_lro.ifp = sc->hn_ifp;
3175 #if __FreeBSD_version >= 1100099
3176 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3177 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3179 #endif /* INET || INET6 */
3181 if (sc->hn_rx_sysctl_tree != NULL) {
3185 * Create per RX ring sysctl tree:
3186 * dev.hn.UNIT.rx.RINGID
3188 snprintf(name, sizeof(name), "%d", i);
3189 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3190 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3191 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3193 if (rxr->hn_rx_sysctl_tree != NULL) {
3194 SYSCTL_ADD_ULONG(ctx,
3195 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3196 OID_AUTO, "packets", CTLFLAG_RW,
3197 &rxr->hn_pkts, "# of packets received");
3198 SYSCTL_ADD_ULONG(ctx,
3199 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3200 OID_AUTO, "rss_pkts", CTLFLAG_RW,
3202 "# of packets w/ RSS info received");
3204 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3205 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3206 &rxr->hn_pktbuf_len, 0,
3207 "Temporary channel packet buffer length");
3212 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3213 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3214 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3215 #if __FreeBSD_version < 1100095
3216 hn_rx_stat_int_sysctl,
3218 hn_rx_stat_u64_sysctl,
3220 "LU", "LRO queued");
3221 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3222 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3223 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3224 #if __FreeBSD_version < 1100095
3225 hn_rx_stat_int_sysctl,
3227 hn_rx_stat_u64_sysctl,
3229 "LU", "LRO flushed");
3230 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3231 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3232 __offsetof(struct hn_rx_ring, hn_lro_tried),
3233 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3234 #if __FreeBSD_version >= 1100099
3235 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3236 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3237 hn_lro_lenlim_sysctl, "IU",
3238 "Max # of data bytes to be aggregated by LRO");
3239 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3240 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3241 hn_lro_ackcnt_sysctl, "I",
3242 "Max # of ACKs to be aggregated by LRO");
3244 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3245 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3246 hn_trust_hcsum_sysctl, "I",
3247 "Trust tcp segement verification on host side, "
3248 "when csum info is missing");
3249 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3250 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3251 hn_trust_hcsum_sysctl, "I",
3252 "Trust udp datagram verification on host side, "
3253 "when csum info is missing");
3254 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3255 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3256 hn_trust_hcsum_sysctl, "I",
3257 "Trust ip packet verification on host side, "
3258 "when csum info is missing");
3259 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3260 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3261 __offsetof(struct hn_rx_ring, hn_csum_ip),
3262 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3263 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3264 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3265 __offsetof(struct hn_rx_ring, hn_csum_tcp),
3266 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3267 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3268 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3269 __offsetof(struct hn_rx_ring, hn_csum_udp),
3270 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3271 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3272 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3273 __offsetof(struct hn_rx_ring, hn_csum_trusted),
3274 hn_rx_stat_ulong_sysctl, "LU",
3275 "# of packets that we trust host's csum verification");
3276 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3277 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3278 __offsetof(struct hn_rx_ring, hn_small_pkts),
3279 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3280 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3281 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3282 __offsetof(struct hn_rx_ring, hn_ack_failed),
3283 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3284 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3285 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3286 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3287 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3293 hn_destroy_rx_data(struct hn_softc *sc)
3297 if (sc->hn_rxbuf != NULL) {
3298 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3299 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3301 device_printf(sc->hn_dev, "RXBUF is referenced\n");
3302 sc->hn_rxbuf = NULL;
3305 if (sc->hn_rx_ring_cnt == 0)
3308 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3309 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3311 if (rxr->hn_br == NULL)
3313 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3314 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3316 device_printf(sc->hn_dev,
3317 "%dth channel bufring is referenced", i);
3321 #if defined(INET) || defined(INET6)
3322 tcp_lro_free(&rxr->hn_lro);
3324 free(rxr->hn_pktbuf, M_DEVBUF);
3326 free(sc->hn_rx_ring, M_DEVBUF);
3327 sc->hn_rx_ring = NULL;
3329 sc->hn_rx_ring_cnt = 0;
3330 sc->hn_rx_ring_inuse = 0;
3334 hn_tx_ring_create(struct hn_softc *sc, int id)
3336 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3337 device_t dev = sc->hn_dev;
3338 bus_dma_tag_t parent_dtag;
3342 txr->hn_tx_idx = id;
3344 #ifndef HN_USE_TXDESC_BUFRING
3345 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3347 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3349 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3350 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3351 M_DEVBUF, M_WAITOK | M_ZERO);
3352 #ifndef HN_USE_TXDESC_BUFRING
3353 SLIST_INIT(&txr->hn_txlist);
3355 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3356 M_WAITOK, &txr->hn_tx_lock);
3359 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3360 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3361 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3363 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3366 #ifdef HN_IFSTART_SUPPORT
3367 if (hn_use_if_start) {
3368 txr->hn_txeof = hn_start_txeof;
3369 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3370 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3376 txr->hn_txeof = hn_xmit_txeof;
3377 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3378 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3380 br_depth = hn_get_txswq_depth(txr);
3381 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3382 M_WAITOK, &txr->hn_tx_lock);
3385 txr->hn_direct_tx_size = hn_direct_tx_size;
3388 * Always schedule transmission instead of trying to do direct
3389 * transmission; this gives the best performance so far.
3391 txr->hn_sched_tx = 1;
3393 parent_dtag = bus_get_dma_tag(dev);
3395 /* DMA tag for RNDIS packet messages. */
3396 error = bus_dma_tag_create(parent_dtag, /* parent */
3397 HN_RNDIS_PKT_ALIGN, /* alignment */
3398 HN_RNDIS_PKT_BOUNDARY, /* boundary */
3399 BUS_SPACE_MAXADDR, /* lowaddr */
3400 BUS_SPACE_MAXADDR, /* highaddr */
3401 NULL, NULL, /* filter, filterarg */
3402 HN_RNDIS_PKT_LEN, /* maxsize */
3404 HN_RNDIS_PKT_LEN, /* maxsegsize */
3406 NULL, /* lockfunc */
3407 NULL, /* lockfuncarg */
3408 &txr->hn_tx_rndis_dtag);
3410 device_printf(dev, "failed to create rndis dmatag\n");
3414 /* DMA tag for data. */
3415 error = bus_dma_tag_create(parent_dtag, /* parent */
3417 HN_TX_DATA_BOUNDARY, /* boundary */
3418 BUS_SPACE_MAXADDR, /* lowaddr */
3419 BUS_SPACE_MAXADDR, /* highaddr */
3420 NULL, NULL, /* filter, filterarg */
3421 HN_TX_DATA_MAXSIZE, /* maxsize */
3422 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
3423 HN_TX_DATA_SEGSIZE, /* maxsegsize */
3425 NULL, /* lockfunc */
3426 NULL, /* lockfuncarg */
3427 &txr->hn_tx_data_dtag);
3429 device_printf(dev, "failed to create data dmatag\n");
3433 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3434 struct hn_txdesc *txd = &txr->hn_txdesc[i];
3437 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3438 STAILQ_INIT(&txd->agg_list);
3441 * Allocate and load RNDIS packet message.
3443 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3444 (void **)&txd->rndis_pkt,
3445 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3446 &txd->rndis_pkt_dmap);
3449 "failed to allocate rndis_packet_msg, %d\n", i);
3453 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3454 txd->rndis_pkt_dmap,
3455 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3456 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3460 "failed to load rndis_packet_msg, %d\n", i);
3461 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3462 txd->rndis_pkt, txd->rndis_pkt_dmap);
3466 /* DMA map for TX data. */
3467 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3471 "failed to allocate tx data dmamap\n");
3472 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3473 txd->rndis_pkt_dmap);
3474 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3475 txd->rndis_pkt, txd->rndis_pkt_dmap);
3479 /* All set; put it on the list. */
3480 txd->flags |= HN_TXD_FLAG_ONLIST;
3481 #ifndef HN_USE_TXDESC_BUFRING
3482 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3484 buf_ring_enqueue(txr->hn_txdesc_br, txd);
3487 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3489 if (sc->hn_tx_sysctl_tree != NULL) {
3490 struct sysctl_oid_list *child;
3491 struct sysctl_ctx_list *ctx;
3495 * Create per TX ring sysctl tree:
3496 * dev.hn.UNIT.tx.RINGID
3498 ctx = device_get_sysctl_ctx(dev);
3499 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3501 snprintf(name, sizeof(name), "%d", id);
3502 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3503 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3505 if (txr->hn_tx_sysctl_tree != NULL) {
3506 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3509 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3510 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3511 "# of available TX descs");
3513 #ifdef HN_IFSTART_SUPPORT
3514 if (!hn_use_if_start)
3517 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3518 CTLFLAG_RD, &txr->hn_oactive, 0,
3521 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3522 CTLFLAG_RW, &txr->hn_pkts,
3523 "# of packets transmitted");
3524 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3525 CTLFLAG_RW, &txr->hn_sends, "# of sends");
3533 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3535 struct hn_tx_ring *txr = txd->txr;
3537 KASSERT(txd->m == NULL, ("still has mbuf installed"));
3538 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3540 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3541 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3542 txd->rndis_pkt_dmap);
3543 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3547 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3550 KASSERT(txd->refs == 0 || txd->refs == 1,
3551 ("invalid txd refs %d", txd->refs));
3553 /* Aggregated txds will be freed by their aggregating txd. */
3554 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3557 freed = hn_txdesc_put(txr, txd);
3558 KASSERT(freed, ("can't free txdesc"));
3563 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3567 if (txr->hn_txdesc == NULL)
3572 * Because the freeing of aggregated txds will be deferred
3573 * to the aggregating txd, two passes are used here:
3574 * - The first pass GCes any pending txds. This GC is necessary,
3575 * since if the channels are revoked, the hypervisor will not
3576 * deliver send-done for all pending txds.
3577 * - The second pass frees the busdma resources, i.e. after all txds
3580 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3581 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3582 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3583 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3585 if (txr->hn_tx_data_dtag != NULL)
3586 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3587 if (txr->hn_tx_rndis_dtag != NULL)
3588 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3590 #ifdef HN_USE_TXDESC_BUFRING
3591 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3594 free(txr->hn_txdesc, M_DEVBUF);
3595 txr->hn_txdesc = NULL;
3597 if (txr->hn_mbuf_br != NULL)
3598 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3600 #ifndef HN_USE_TXDESC_BUFRING
3601 mtx_destroy(&txr->hn_txlist_spin);
3603 mtx_destroy(&txr->hn_tx_lock);
3607 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3609 struct sysctl_oid_list *child;
3610 struct sysctl_ctx_list *ctx;
3614 * Create TXBUF for chimney sending.
3616 * NOTE: It is shared by all channels.
3618 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3619 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3620 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3621 if (sc->hn_chim == NULL) {
3622 device_printf(sc->hn_dev, "allocate txbuf failed\n");
3626 sc->hn_tx_ring_cnt = ring_cnt;
3627 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3629 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3630 M_DEVBUF, M_WAITOK | M_ZERO);
3632 ctx = device_get_sysctl_ctx(sc->hn_dev);
3633 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3635 /* Create dev.hn.UNIT.tx sysctl tree */
3636 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3637 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3639 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3642 error = hn_tx_ring_create(sc, i);
3647 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3648 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3649 __offsetof(struct hn_tx_ring, hn_no_txdescs),
3650 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3651 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3652 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3653 __offsetof(struct hn_tx_ring, hn_send_failed),
3654 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
3655 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3656 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3657 __offsetof(struct hn_tx_ring, hn_txdma_failed),
3658 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
3659 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3660 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3661 __offsetof(struct hn_tx_ring, hn_flush_failed),
3662 hn_tx_stat_ulong_sysctl, "LU",
3663 "# of packet transmission aggregation flush failure");
3664 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3665 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3666 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3667 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3668 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3669 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3670 __offsetof(struct hn_tx_ring, hn_tx_chimney),
3671 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3672 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3673 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3674 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3675 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3676 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3677 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3678 "# of total TX descs");
3679 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3680 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3681 "Chimney send packet size upper boundary");
3682 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3683 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3684 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3685 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3686 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3687 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3688 hn_tx_conf_int_sysctl, "I",
3689 "Size of the packet for direct transmission");
3690 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3691 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3692 __offsetof(struct hn_tx_ring, hn_sched_tx),
3693 hn_tx_conf_int_sysctl, "I",
3694 "Always schedule transmission "
3695 "instead of doing direct transmission");
3696 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3697 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3698 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3699 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3700 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3701 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3702 "Applied packet transmission aggregation size");
3703 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3704 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3705 hn_txagg_pktmax_sysctl, "I",
3706 "Applied packet transmission aggregation packets");
3707 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3708 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3709 hn_txagg_align_sysctl, "I",
3710 "Applied packet transmission aggregation alignment");
3716 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3720 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3721 sc->hn_tx_ring[i].hn_chim_size = chim_size;
3725 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3727 struct ifnet *ifp = sc->hn_ifp;
3730 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3733 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3734 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3735 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3737 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3738 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3739 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3741 if (tso_maxlen < tso_minlen)
3742 tso_maxlen = tso_minlen;
3743 else if (tso_maxlen > IP_MAXPACKET)
3744 tso_maxlen = IP_MAXPACKET;
3745 if (tso_maxlen > sc->hn_ndis_tso_szmax)
3746 tso_maxlen = sc->hn_ndis_tso_szmax;
3747 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3749 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
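/*
 * Worked example of the clamping above, assuming an MTU of 1500 and an
 * NDIS TSO sgmin of 2: tso_minlen is 3000, a requested tso_maxlen of
 * IP_MAXPACKET (65535) is first clamped to IP_MAXPACKET and then to
 * hn_ndis_tso_szmax, and 18 bytes (ETHER_HDR_LEN +
 * ETHER_VLAN_ENCAP_LEN) are reserved for the Ethernet header before
 * if_hw_tsomax is set.
 */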
3753 hn_fixup_tx_data(struct hn_softc *sc)
3755 uint64_t csum_assist;
3758 hn_set_chim_size(sc, sc->hn_chim_szmax);
3759 if (hn_tx_chimney_size > 0 &&
3760 hn_tx_chimney_size < sc->hn_chim_szmax)
3761 hn_set_chim_size(sc, hn_tx_chimney_size);
3764 if (sc->hn_caps & HN_CAP_IPCS)
3765 csum_assist |= CSUM_IP;
3766 if (sc->hn_caps & HN_CAP_TCP4CS)
3767 csum_assist |= CSUM_IP_TCP;
3768 if (sc->hn_caps & HN_CAP_UDP4CS)
3769 csum_assist |= CSUM_IP_UDP;
3770 if (sc->hn_caps & HN_CAP_TCP6CS)
3771 csum_assist |= CSUM_IP6_TCP;
3772 if (sc->hn_caps & HN_CAP_UDP6CS)
3773 csum_assist |= CSUM_IP6_UDP;
3774 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3775 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3777 if (sc->hn_caps & HN_CAP_HASHVAL) {
3779 * Support HASHVAL pktinfo on TX path.
3782 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3783 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3784 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3789 hn_destroy_tx_data(struct hn_softc *sc)
3793 if (sc->hn_chim != NULL) {
3794 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
3795 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3797 device_printf(sc->hn_dev,
3798 "chimney sending buffer is referenced");
3803 if (sc->hn_tx_ring_cnt == 0)
3806 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3807 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3809 free(sc->hn_tx_ring, M_DEVBUF);
3810 sc->hn_tx_ring = NULL;
3812 sc->hn_tx_ring_cnt = 0;
3813 sc->hn_tx_ring_inuse = 0;
3816 #ifdef HN_IFSTART_SUPPORT
3819 hn_start_taskfunc(void *xtxr, int pending __unused)
3821 struct hn_tx_ring *txr = xtxr;
3823 mtx_lock(&txr->hn_tx_lock);
3824 hn_start_locked(txr, 0);
3825 mtx_unlock(&txr->hn_tx_lock);
3829 hn_start_locked(struct hn_tx_ring *txr, int len)
3831 struct hn_softc *sc = txr->hn_sc;
3832 struct ifnet *ifp = sc->hn_ifp;
3835 KASSERT(hn_use_if_start,
3836 ("hn_start_locked is called, when if_start is disabled"));
3837 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3838 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3839 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3841 if (__predict_false(txr->hn_suspended))
3844 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
3848 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
3849 struct hn_txdesc *txd;
3850 struct mbuf *m_head;
3853 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
3857 if (len > 0 && m_head->m_pkthdr.len > len) {
3859 * This send could be time consuming; let the caller
3860 * dispatch this packet (and any follow-up packets)
3861 * to the tx taskqueue.
3863 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3868 #if defined(INET6) || defined(INET)
3869 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3870 m_head = hn_tso_fixup(m_head);
3871 if (__predict_false(m_head == NULL)) {
3872 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3878 txd = hn_txdesc_get(txr);
3880 txr->hn_no_txdescs++;
3881 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3882 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3886 error = hn_encap(ifp, txr, txd, &m_head);
3888 /* Both txd and m_head are freed */
3889 KASSERT(txr->hn_agg_txd == NULL,
3890 ("encap failed w/ pending aggregating txdesc"));
3894 if (txr->hn_agg_pktleft == 0) {
3895 if (txr->hn_agg_txd != NULL) {
3896 KASSERT(m_head == NULL,
3897 ("pending mbuf for aggregating txdesc"));
3898 error = hn_flush_txagg(ifp, txr);
3899 if (__predict_false(error)) {
3900 atomic_set_int(&ifp->if_drv_flags,
3905 KASSERT(m_head != NULL, ("mbuf was freed"));
3906 error = hn_txpkt(ifp, txr, txd);
3907 if (__predict_false(error)) {
3908 /* txd is freed, but m_head is not */
3909 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3910 atomic_set_int(&ifp->if_drv_flags,
3918 KASSERT(txr->hn_agg_txd != NULL,
3919 ("no aggregating txdesc"));
3920 KASSERT(m_head == NULL,
3921 ("pending mbuf for aggregating txdesc"));
3926 /* Flush pending aggregated transmission. */
3927 if (txr->hn_agg_txd != NULL)
3928 hn_flush_txagg(ifp, txr);
3933 hn_start(struct ifnet *ifp)
3935 struct hn_softc *sc = ifp->if_softc;
3936 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
3938 if (txr->hn_sched_tx)
3941 if (mtx_trylock(&txr->hn_tx_lock)) {
3944 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3945 mtx_unlock(&txr->hn_tx_lock);
3950 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
3954 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
3956 struct hn_tx_ring *txr = xtxr;
3958 mtx_lock(&txr->hn_tx_lock);
3959 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
3960 hn_start_locked(txr, 0);
3961 mtx_unlock(&txr->hn_tx_lock);
3965 hn_start_txeof(struct hn_tx_ring *txr)
3967 struct hn_softc *sc = txr->hn_sc;
3968 struct ifnet *ifp = sc->hn_ifp;
3970 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3972 if (txr->hn_sched_tx)
3975 if (mtx_trylock(&txr->hn_tx_lock)) {
3978 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3979 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3980 mtx_unlock(&txr->hn_tx_lock);
3982 taskqueue_enqueue(txr->hn_tx_taskq,
3988 * Release the OACTIVE flag earlier, in the hope that
3989 * others could catch up. The task will clear the
3990 * flag again with the hn_tx_lock to avoid possible
3993 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3994 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3998 #endif /* HN_IFSTART_SUPPORT */
4001 hn_xmit(struct hn_tx_ring *txr, int len)
4003 struct hn_softc *sc = txr->hn_sc;
4004 struct ifnet *ifp = sc->hn_ifp;
4005 struct mbuf *m_head;
4008 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4009 #ifdef HN_IFSTART_SUPPORT
4010 KASSERT(hn_use_if_start == 0,
4011 ("hn_xmit is called, when if_start is enabled"));
4013 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4015 if (__predict_false(txr->hn_suspended))
4018 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4021 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4022 struct hn_txdesc *txd;
4025 if (len > 0 && m_head->m_pkthdr.len > len) {
4027 * This send could be time consuming; let the caller
4028 * dispatch this packet (and any follow-up packets)
4029 * to the tx taskqueue.
4031 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4036 txd = hn_txdesc_get(txr);
4038 txr->hn_no_txdescs++;
4039 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4040 txr->hn_oactive = 1;
4044 error = hn_encap(ifp, txr, txd, &m_head);
4046 /* Both txd and m_head are freed; discard */
4047 KASSERT(txr->hn_agg_txd == NULL,
4048 ("encap failed w/ pending aggregating txdesc"));
4049 drbr_advance(ifp, txr->hn_mbuf_br);
4053 if (txr->hn_agg_pktleft == 0) {
4054 if (txr->hn_agg_txd != NULL) {
4055 KASSERT(m_head == NULL,
4056 ("pending mbuf for aggregating txdesc"));
4057 error = hn_flush_txagg(ifp, txr);
4058 if (__predict_false(error)) {
4059 txr->hn_oactive = 1;
4063 KASSERT(m_head != NULL, ("mbuf was freed"));
4064 error = hn_txpkt(ifp, txr, txd);
4065 if (__predict_false(error)) {
4066 /* txd is freed, but m_head is not */
4067 drbr_putback(ifp, txr->hn_mbuf_br,
4069 txr->hn_oactive = 1;
4076 KASSERT(txr->hn_agg_txd != NULL,
4077 ("no aggregating txdesc"));
4078 KASSERT(m_head == NULL,
4079 ("pending mbuf for aggregating txdesc"));
4084 drbr_advance(ifp, txr->hn_mbuf_br);
4087 /* Flush pending aggregated transmission. */
4088 if (txr->hn_agg_txd != NULL)
4089 hn_flush_txagg(ifp, txr);
4094 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4096 struct hn_softc *sc = ifp->if_softc;
4097 struct hn_tx_ring *txr;
4100 #if defined(INET6) || defined(INET)
4102 * Perform TSO packet header fixup now, since the TSO
4103 * packet header should be cache-hot.
4105 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4106 m = hn_tso_fixup(m);
4107 if (__predict_false(m == NULL)) {
4108 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4115 * Select the TX ring based on flowid
4117 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
4121 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
4123 idx = bid % sc->hn_tx_ring_inuse;
4126 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4128 txr = &sc->hn_tx_ring[idx];
4130 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4132 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4136 if (txr->hn_oactive)
4139 if (txr->hn_sched_tx)
4142 if (mtx_trylock(&txr->hn_tx_lock)) {
4145 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4146 mtx_unlock(&txr->hn_tx_lock);
4151 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
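/*
 * Worked example of the TX ring selection above: with 4 TX rings in
 * use and an mbuf flowid of 10, the packet maps to ring 10 % 4 = 2;
 * when the kernel RSS option is compiled in and rss_hash2bucket()
 * succeeds, the RSS bucket id is used instead, again modulo
 * hn_tx_ring_inuse.
 */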
4156 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4160 mtx_lock(&txr->hn_tx_lock);
4161 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4163 mtx_unlock(&txr->hn_tx_lock);
4167 hn_xmit_qflush(struct ifnet *ifp)
4169 struct hn_softc *sc = ifp->if_softc;
4172 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4173 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4178 hn_xmit_txeof(struct hn_tx_ring *txr)
4181 if (txr->hn_sched_tx)
4184 if (mtx_trylock(&txr->hn_tx_lock)) {
4187 txr->hn_oactive = 0;
4188 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4189 mtx_unlock(&txr->hn_tx_lock);
4191 taskqueue_enqueue(txr->hn_tx_taskq,
4197 * Release the oactive flag earlier, in the hope that
4198 * others could catch up. The task will clear the
4199 * oactive again with the hn_tx_lock to avoid possible
4202 txr->hn_oactive = 0;
4203 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4208 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4210 struct hn_tx_ring *txr = xtxr;
4212 mtx_lock(&txr->hn_tx_lock);
4214 mtx_unlock(&txr->hn_tx_lock);
4218 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4220 struct hn_tx_ring *txr = xtxr;
4222 mtx_lock(&txr->hn_tx_lock);
4223 txr->hn_oactive = 0;
4225 mtx_unlock(&txr->hn_tx_lock);
4229 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4231 struct vmbus_chan_br cbr;
4232 struct hn_rx_ring *rxr;
4233 struct hn_tx_ring *txr = NULL;
4236 idx = vmbus_chan_subidx(chan);
4239 * Link this channel to RX/TX ring.
4241 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4242 ("invalid channel index %d, should > 0 && < %d",
4243 idx, sc->hn_rx_ring_inuse));
4244 rxr = &sc->hn_rx_ring[idx];
4245 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4246 ("RX ring %d already attached", idx));
4247 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4250 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4251 idx, vmbus_chan_id(chan));
4254 if (idx < sc->hn_tx_ring_inuse) {
4255 txr = &sc->hn_tx_ring[idx];
4256 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4257 ("TX ring %d already attached", idx));
4258 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4260 txr->hn_chan = chan;
4262 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4263 idx, vmbus_chan_id(chan));
4267 /* Bind this channel to a proper CPU. */
4268 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4273 cbr.cbr = rxr->hn_br;
4274 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4275 cbr.cbr_txsz = HN_TXBR_SIZE;
4276 cbr.cbr_rxsz = HN_RXBR_SIZE;
4277 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4279 if (error == EISCONN) {
4280 if_printf(sc->hn_ifp, "bufring is connected after "
4281 "chan%u open failure\n", vmbus_chan_id(chan));
4282 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4284 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4285 vmbus_chan_id(chan), error);
4292 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4294 struct hn_rx_ring *rxr;
4297 idx = vmbus_chan_subidx(chan);
4300 * Unlink this channel from the RX/TX ring.
4302 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4303 ("invalid channel index %d, should > 0 && < %d",
4304 idx, sc->hn_rx_ring_inuse));
4305 rxr = &sc->hn_rx_ring[idx];
4306 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4307 ("RX ring %d is not attached", idx));
4308 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4310 if (idx < sc->hn_tx_ring_inuse) {
4311 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4313 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4314 ("TX ring %d is not attached attached", idx));
4315 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4319 * Close this channel.
4322 * Channel closing does _not_ destroy the target channel.
4324 error = vmbus_chan_close_direct(chan);
4325 if (error == EISCONN) {
4326 if_printf(sc->hn_ifp, "chan%u bufring is connected "
4327 "after being closed\n", vmbus_chan_id(chan));
4328 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4330 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4331 vmbus_chan_id(chan), error);
4336 hn_attach_subchans(struct hn_softc *sc)
4338 struct vmbus_channel **subchans;
4339 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4342 KASSERT(subchan_cnt > 0, ("no sub-channels"));
4344 /* Attach the sub-channels. */
4345 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4346 for (i = 0; i < subchan_cnt; ++i) {
4349 error1 = hn_chan_attach(sc, subchans[i]);
4352 /* Move on; all channels will be detached later. */
4355 vmbus_subchan_rel(subchans, subchan_cnt);
4358 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4361 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4369 hn_detach_allchans(struct hn_softc *sc)
4371 struct vmbus_channel **subchans;
4372 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4375 if (subchan_cnt == 0)
4378 /* Detach the sub-channels. */
4379 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4380 for (i = 0; i < subchan_cnt; ++i)
4381 hn_chan_detach(sc, subchans[i]);
4382 vmbus_subchan_rel(subchans, subchan_cnt);
4386 * Detach the primary channel, _after_ all sub-channels
4389 hn_chan_detach(sc, sc->hn_prichan);
4391 /* Wait for sub-channels to be destroyed, if any. */
4392 vmbus_subchan_drain(sc->hn_prichan);
4395 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4396 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4397 HN_RX_FLAG_ATTACHED) == 0,
4398 ("%dth RX ring is still attached", i));
4400 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4401 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4402 HN_TX_FLAG_ATTACHED) == 0,
4403 ("%dth TX ring is still attached", i));
4409 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4411 struct vmbus_channel **subchans;
4412 int nchan, rxr_cnt, error;
4414 nchan = *nsubch + 1;
4417 * Multiple RX/TX rings are not requested.
4424 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4427 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4429 /* No RSS; this is benign. */
4434 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4438 if (nchan > rxr_cnt)
4441 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4447 * Allocate sub-channels from NVS.
4449 *nsubch = nchan - 1;
4450 error = hn_nvs_alloc_subchans(sc, nsubch);
4451 if (error || *nsubch == 0) {
4452 /* Failed to allocate sub-channels. */
4458 * Wait for all sub-channels to become ready before moving on.
4460 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4461 vmbus_subchan_rel(subchans, *nsubch);
4466 hn_synth_attachable(const struct hn_softc *sc)
4470 if (sc->hn_flags & HN_FLAG_ERRORS)
4473 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4474 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4476 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
4483 hn_synth_attach(struct hn_softc *sc, int mtu)
4485 #define ATTACHED_NVS 0x0002
4486 #define ATTACHED_RNDIS 0x0004
4488 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4489 int error, nsubch, nchan, i;
4490 uint32_t old_caps, attached = 0;
4492 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4493 ("synthetic parts were attached"));
4495 if (!hn_synth_attachable(sc))
4498 /* Save capabilities for later verification. */
4499 old_caps = sc->hn_caps;
4502 /* Clear RSS state. */
4503 sc->hn_rss_ind_size = 0;
4504 sc->hn_rss_hash = 0;
4507 * Attach the primary channel _before_ attaching NVS and RNDIS.
4509 error = hn_chan_attach(sc, sc->hn_prichan);
4516 error = hn_nvs_attach(sc, mtu);
4519 attached |= ATTACHED_NVS;
4522 * Attach RNDIS _after_ NVS is attached.
4524 error = hn_rndis_attach(sc, mtu);
4527 attached |= ATTACHED_RNDIS;
4530 * Make sure the capabilities have not changed.
4532 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4533 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4534 old_caps, sc->hn_caps);
4540 * Allocate sub-channels for multi-TX/RX rings.
4543 * The # of RX rings that can be used is equivalent to the # of
4544 * channels to be requested.
4546 nsubch = sc->hn_rx_ring_cnt - 1;
4547 error = hn_synth_alloc_subchans(sc, &nsubch);
4550 /* NOTE: _Full_ synthetic parts detach is required now. */
4551 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4554 * Set the # of TX/RX rings that could be used according to
4555 * the # of channels that NVS offered.
4558 hn_set_ring_inuse(sc, nchan);
4560 /* Only the primary channel can be used; done */
4565 * Attach the sub-channels.
4567 * NOTE: hn_set_ring_inuse() _must_ have been called.
4569 error = hn_attach_subchans(sc);
4574 * Configure RSS key and indirect table _after_ all sub-channels
4577 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4579 * RSS key is not set yet; set it to the default RSS key.
4582 if_printf(sc->hn_ifp, "setup default RSS key\n");
4584 rss_getkey(rss->rss_key);
4586 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4588 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4591 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4593 * RSS indirect table is not set yet; set it up in round-
4597 if_printf(sc->hn_ifp, "setup default RSS indirect "
4600 for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
4604 subidx = rss_get_indirection_to_bucket(i);
4608 rss->rss_ind[i] = subidx % nchan;
4610 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4613 * # of usable channels may be changed, so we have to
4614 * make sure that all entries in RSS indirect table
4617 * NOTE: hn_set_ring_inuse() _must_ have been called.
4619 hn_rss_ind_fixup(sc);
4622 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4627 * Fixup transmission aggregation setup.
4633 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
4634 hn_synth_detach(sc);
4636 if (attached & ATTACHED_RNDIS)
4637 hn_rndis_detach(sc);
4638 if (attached & ATTACHED_NVS)
4640 hn_chan_detach(sc, sc->hn_prichan);
4641 /* Restore old capabilities. */
4642 sc->hn_caps = old_caps;
4646 #undef ATTACHED_RNDIS
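/*
 * Default RSS indirect table sketch: when no table has been set, the
 * code above fills all NDIS_HASH_INDCNT entries; in the default
 * (non-RSS) case this is plain round-robin, e.g. with nchan = 4 the
 * table becomes 0, 1, 2, 3, 0, 1, ..., while with the kernel RSS
 * option each entry is derived from rss_get_indirection_to_bucket(i)
 * modulo nchan, so RX flows are spread across all usable channels.
 */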
4652 * The interface must have been suspended through hn_suspend(), before
4653 * this function gets called.
4656 hn_synth_detach(struct hn_softc *sc)
4659 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4660 ("synthetic parts were not attached"));
4662 /* Detach the RNDIS first. */
4663 hn_rndis_detach(sc);
4668 /* Detach all of the channels. */
4669 hn_detach_allchans(sc);
4671 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4675 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4677 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4678 ("invalid ring count %d", ring_cnt));
4680 if (sc->hn_tx_ring_cnt > ring_cnt)
4681 sc->hn_tx_ring_inuse = ring_cnt;
4683 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4684 sc->hn_rx_ring_inuse = ring_cnt;
4687 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
4688 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
4689 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
4690 rss_getnumbuckets());
4695 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4696 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
4701 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
4706 * The TX bufring will not be drained by the hypervisor
4707 * if the primary channel is revoked.
4709 while (!vmbus_chan_rx_empty(chan) ||
4710 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
4711 !vmbus_chan_tx_empty(chan)))
4713 vmbus_chan_intr_drain(chan);
4717 hn_suspend_data(struct hn_softc *sc)
4719 struct vmbus_channel **subch = NULL;
4720 struct hn_tx_ring *txr;
4728 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4729 txr = &sc->hn_tx_ring[i];
4731 mtx_lock(&txr->hn_tx_lock);
4732 txr->hn_suspended = 1;
4733 mtx_unlock(&txr->hn_tx_lock);
4734 /* No one is able to send more packets now. */
4737 * Wait for all pending sends to finish.
4740 * We will _not_ receive all pending send-done, if the
4741 * primary channel is revoked.
4743 while (hn_tx_ring_pending(txr) &&
4744 !vmbus_chan_is_revoked(sc->hn_prichan))
4745 pause("hnwtx", 1 /* 1 tick */);
4749 * Disable RX by clearing RX filter.
4751 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
4752 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);
4755 * Give RNDIS enough time to flush all pending data packets.
4757 pause("waitrx", (200 * hz) / 1000);
4760 * Drain RX/TX bufrings and interrupts.
4762 nsubch = sc->hn_rx_ring_inuse - 1;
4764 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4766 if (subch != NULL) {
4767 for (i = 0; i < nsubch; ++i)
4768 hn_chan_drain(sc, subch[i]);
4770 hn_chan_drain(sc, sc->hn_prichan);
4773 vmbus_subchan_rel(subch, nsubch);
4776 * Drain any pending TX tasks.
4779 * The above hn_chan_drain() can dispatch TX tasks, so the TX
4780 * tasks will have to be drained _after_ the above hn_chan_drain()
4783 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4784 txr = &sc->hn_tx_ring[i];
4786 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
4787 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
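
/*
 * Runs on the primary channel's task context, so that clearing
 * hn_mgmt_taskq is serialized with the channel callback.
 */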
static void
hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
{

	((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
}
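
/*
 * Stop further management-task submission, then wait until all
 * pending management tasks have completed.
 */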
static void
hn_suspend_mgmt(struct hn_softc *sc)
{
	struct task task;

	HN_LOCK_ASSERT(sc);

	/*
	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
	 * through hn_mgmt_taskq.
	 */
	TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
	vmbus_chan_run_task(sc->hn_prichan, &task);

	/*
	 * Make sure that all pending management tasks are completed.
	 */
	taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
	taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
	taskqueue_drain_all(sc->hn_mgmt_taskq0);
}
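
/*
 * Suspend the data path only if the interface is running; the
 * management path is always suspended.
 */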
static void
hn_suspend(struct hn_softc *sc)
{

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
		hn_suspend_data(sc);
	hn_suspend_mgmt(sc);
}
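
/* Clear the suspended flag on the first tx_ring_cnt TX rings. */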
static void
hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
{
	int i;

	KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
	    ("invalid TX ring count %d", tx_ring_cnt));

	for (i = 0; i < tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_suspended = 0;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
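
/*
 * Re-enable the data path: restore the RX filter, clear TX
 * suspension, and kick the TX path back into action.
 */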
static void
hn_resume_data(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/*
	 * Re-enable RX.
	 */
	hn_set_rxfilter(sc);

	/*
	 * Make sure to clear suspend status on "all" TX rings,
	 * since hn_tx_ring_inuse can be changed after
	 * hn_suspend_data().
	 */
	hn_resume_tx(sc, sc->hn_tx_ring_cnt);

#ifdef HN_IFSTART_SUPPORT
	if (!hn_use_if_start)
#endif
	{
		/*
		 * Flush unused drbrs, since hn_tx_ring_inuse may be
		 * reduced.
		 */
		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
	}

	/*
	 * Kick start TX.
	 */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		/*
		 * Use txeof task, so that any pending oactive can be
		 * cleared properly.
		 */
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}
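
/* Re-enable management-task submission and catch up on link events. */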
static void
hn_resume_mgmt(struct hn_softc *sc)
{

	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;

	/*
	 * Kick off network change detection, if it was pending.
	 * If no network change was pending, start link status
	 * checks, which are more lightweight than network change
	 * detection.
	 */
	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		hn_change_network(sc);
	else
		hn_update_link_status(sc);
}
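
/* Counterpart of hn_suspend(). */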
static void
hn_resume(struct hn_softc *sc)
{

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
		hn_resume_data(sc);
	hn_resume_mgmt(sc);
}
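
/*
 * Handle RNDIS status indications: link up/down and network
 * change events.
 */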
static void
hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
{
	const struct rndis_status_msg *msg;
	int ofs;

	if (dlen < sizeof(*msg)) {
		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
		return;
	}
	msg = data;

	switch (msg->rm_status) {
	case RNDIS_STATUS_MEDIA_CONNECT:
	case RNDIS_STATUS_MEDIA_DISCONNECT:
		hn_update_link_status(sc);
		break;

	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
		/* Not really useful; ignore. */
		break;

	case RNDIS_STATUS_NETWORK_CHANGE:
		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
		if (dlen < ofs + msg->rm_stbuflen ||
		    msg->rm_stbuflen < sizeof(uint32_t)) {
			if_printf(sc->hn_ifp, "network changed\n");
		} else {
			uint32_t change;

			memcpy(&change, ((const uint8_t *)msg) + ofs,
			    sizeof(change));
			if_printf(sc->hn_ifp, "network changed, change %u\n",
			    change);
		}
		hn_change_network(sc);
		break;

	default:
		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
		    msg->rm_status);
		break;
	}
}
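
/*
 * Walk the per-packet-info elements attached to an RNDIS data
 * message and extract the VLAN, checksum and hash fields into
 * 'info'.  Returns 0 on success, EINVAL on a malformed element.
 */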
static int
hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
{
	const struct rndis_pktinfo *pi = info_data;
	uint32_t mask = 0;

	while (info_dlen != 0) {
		const void *data;
		uint32_t dlen;

		if (__predict_false(info_dlen < sizeof(*pi)))
			return (EINVAL);
		if (__predict_false(info_dlen < pi->rm_size))
			return (EINVAL);
		info_dlen -= pi->rm_size;

		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
			return (EINVAL);
		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
			return (EINVAL);
		dlen = pi->rm_size - pi->rm_pktinfooffset;
		data = pi->rm_data;

		switch (pi->rm_type) {
		case NDIS_PKTINFO_TYPE_VLAN:
			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
				return (EINVAL);
			info->vlan_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_VLAN;
			break;

		case NDIS_PKTINFO_TYPE_CSUM:
			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
				return (EINVAL);
			info->csum_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_CSUM;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
				return (EINVAL);
			info->hash_value = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHVAL;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHINF:
			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
				return (EINVAL);
			info->hash_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHINF;
			break;

		default:
			goto next;
		}

		if (mask == HN_RXINFO_ALL) {
			/* All found; done */
			break;
		}
next:
		pi = (const struct rndis_pktinfo *)
		    ((const uint8_t *)pi + pi->rm_size);
	}

	/*
	 * Final fixup.
	 * - If there is no hash value, invalidate the hash info.
	 */
	if ((mask & HN_RXINFO_HASHVAL) == 0)
		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
	return (0);
}
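
/*
 * Returns false if [off, off + len) and [check_off,
 * check_off + check_len) do not overlap, true if they do.
 */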
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off < check_off) {
		if (__predict_true(off + len <= check_off))
			return (false);
	} else if (off > check_off) {
		if (__predict_true(check_off + check_len <= off))
			return (false);
	}
	return (true);
}
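
/*
 * Validate an RNDIS data message (length, offset and overlap checks
 * for the data, OOB and per-packet-info regions), then hand the
 * Ethernet frame to hn_rxpkt() along with the parsed RX info.
 */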
static void
hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_packet_msg *pkt;
	struct hn_rxinfo info;
	int data_off, pktinfo_off, data_len, pktinfo_len;

	/*
	 * Check length.
	 */
	if (__predict_false(dlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
		return;
	}
	pkt = data;

	if (__predict_false(dlen < pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
		return;
	}
	if (__predict_false(pkt->rm_len <
	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
		    "msglen %u, data %u, oob %u, pktinfo %u\n",
		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
		    pkt->rm_pktinfolen);
		return;
	}
	if (__predict_false(pkt->rm_datalen == 0)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
		return;
	}

	/*
	 * Check offsets.
	 */
#define IS_OFFSET_INVALID(ofs)			\
	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

	/* XXX Hyper-V does not meet data offset alignment requirement */
	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data offset %u\n", pkt->rm_dataoffset);
		return;
	}
	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "oob offset %u\n", pkt->rm_oobdataoffset);
		return;
	}
	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
		return;
	}

#undef IS_OFFSET_INVALID

	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
	data_len = pkt->rm_datalen;
	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
	pktinfo_len = pkt->rm_pktinfolen;

	/*
	 * Check OOB coverage.
	 */
	if (__predict_false(pkt->rm_oobdatalen != 0)) {
		int oob_off, oob_len;

		if_printf(rxr->hn_ifp, "got oobdata\n");
		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
		oob_len = pkt->rm_oobdatalen;

		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overflow, msglen %u, oob abs %d len %d\n",
			    pkt->rm_len, oob_off, oob_len);
			return;
		}

		/*
		 * Check against data.
		 */
		if (hn_rndis_check_overlap(oob_off, oob_len,
		    data_off, data_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps data, oob abs %d len %d, "
			    "data abs %d len %d\n",
			    oob_off, oob_len, data_off, data_len);
			return;
		}

		/*
		 * Check against pktinfo.
		 */
		if (pktinfo_len != 0 &&
		    hn_rndis_check_overlap(oob_off, oob_len,
		    pktinfo_off, pktinfo_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps pktinfo, oob abs %d len %d, "
			    "pktinfo abs %d len %d\n",
			    oob_off, oob_len, pktinfo_off, pktinfo_len);
			return;
		}
	}

	/*
	 * Check per-packet-info coverage and find useful per-packet-info.
	 */
	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
	if (__predict_true(pktinfo_len != 0)) {
		bool overlap;
		int error;

		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overflow, msglen %u, "
			    "pktinfo abs %d len %d\n",
			    pkt->rm_len, pktinfo_off, pktinfo_len);
			return;
		}

		/*
		 * Check packet info coverage.
		 */
		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
		    data_off, data_len);
		if (__predict_false(overlap)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overlaps data, pktinfo abs %d len %d, "
			    "data abs %d len %d\n",
			    pktinfo_off, pktinfo_len, data_off, data_len);
			return;
		}

		/*
		 * Find useful per-packet-info.
		 */
		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
		    pktinfo_len, &info);
		if (__predict_false(error)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
			    "pktinfo\n");
			return;
		}
	}

	if (__predict_false(data_off + data_len > pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data overflow, msglen %u, data abs %d len %d\n",
		    pkt->rm_len, data_off, data_len);
		return;
	}

	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
}
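
/*
 * Dispatch an inbound RNDIS message: data messages take the hot RX
 * path; status and control messages go to their respective handlers.
 */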
static __inline void
hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_msghdr *hdr;

	if (__predict_false(dlen < sizeof(*hdr))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
		return;
	}
	hdr = data;

	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
		/* Hot data path. */
		hn_rndis_rx_data(rxr, data, dlen);
		/* Done! */
		return;
	}

	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
	else
		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
}
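
/*
 * Handle an inband NVS notification.  Send-table notes are ignored;
 * any other type is merely logged.
 */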
static void
hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
{
	const struct hn_nvs_hdr *hdr;

	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
		if_printf(sc->hn_ifp, "invalid nvs notify\n");
		return;
	}
	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);

	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
		/* Useless; ignore */
		return;
	}
	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
}
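
/*
 * Handle an NVS completion packet by invoking the send context's
 * callback; the send context is carried in the transaction id.
 */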
static void
hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt)
{
	struct hn_nvs_sendctx *sndc;

	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
	    VMBUS_CHANPKT_DATALEN(pkt));
	/*
	 * NOTE:
	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
	 * its callback.
	 */
}
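
/*
 * Handle an NVS RXBUF packet: validate the headers, feed each RNDIS
 * message carried in the RXBUF to the RNDIS layer, then ack the
 * RXBUF so the hypervisor can recycle it.
 */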
static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

	/* Make sure that this is a RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}

	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}

	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}
		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
	}

	/*
	 * Ack the consumed RXBUF associated w/ this channel packet,
	 * so that this RXBUF can be recycled by the hypervisor.
	 */
	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}
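
/*
 * Ack a consumed RXBUF back to the host.  EAGAIN from
 * vmbus_chan_send() means the channel's TX bufring is full; the ack
 * is retried a bounded number of times before the RXBUF is leaked.
 */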
static void
hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    uint64_t tid)
{
	struct hn_nvs_rndis_ack ack;
	int retries, error;

	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
	ack.nvs_status = HN_NVS_STATUS_OK;

	retries = 0;
again:
	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
	if (__predict_false(error == EAGAIN)) {
		/*
		 * NOTE:
		 * This should _not_ happen in real world, since the
		 * consumption of the TX bufring from the TX path is
		 * controlled.
		 */
		if (rxr->hn_ack_failed == 0)
			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
		rxr->hn_ack_failed++;
		retries++;
		if (retries < 10) {
			DELAY(100);
			goto again;
		}
		/* RXBUF leaks! */
		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
	}
}
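
/*
 * Per-channel callback: receive channel packets, expanding the
 * packet buffer on ENOBUFS, and dispatch them according to their
 * type.
 */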
static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
	struct hn_rx_ring *rxr = xrxr;
	struct hn_softc *sc = rxr->hn_ifp->if_softc;

	for (;;) {
		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
		int error, pktlen;

		pktlen = rxr->hn_pktbuf_len;
		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
		if (__predict_false(error == ENOBUFS)) {
			void *nbuf;
			int nlen;

			/*
			 * Expand channel packet buffer.
			 *
			 * XXX
			 * Use M_WAITOK here, since allocation failure
			 * is fatal.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);

			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;

		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}
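
/*
 * Sanitize the TX taskqueue tunables and, when running on Hyper-V
 * with the global taskqueue mode selected, create the global TX
 * taskqueues.
 */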
static void
hn_tx_taskq_create(void *arg __unused)
{
	int i;

	/*
	 * Fix the # of TX taskqueues.
	 */
	if (hn_tx_taskq_cnt <= 0)
		hn_tx_taskq_cnt = 1;
	else if (hn_tx_taskq_cnt > mp_ncpus)
		hn_tx_taskq_cnt = mp_ncpus;

	/*
	 * Fix the TX taskqueue mode.
	 */
	switch (hn_tx_taskq_mode) {
	case HN_TX_TASKQ_M_INDEP:
	case HN_TX_TASKQ_M_GLOBAL:
	case HN_TX_TASKQ_M_EVTTQ:
		break;
	default:
		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
		break;
	}

	if (vm_guest != VM_GUEST_HV)
		return;

	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
		return;

	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
	    M_DEVBUF, M_WAITOK);
	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
		    "hn tx%d", i);
	}
}
SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_create, NULL);

static void
hn_tx_taskq_destroy(void *arg __unused)
{

	if (hn_tx_taskque != NULL) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(hn_tx_taskque[i]);
		free(hn_tx_taskque, M_DEVBUF);
	}
}
SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_destroy, NULL);