/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/ethernet.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"

#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

#define HN_VFMAP_SIZE_DEF		8

#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512
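
/*
 * Space reserved in front of each transmitted frame for the RNDIS packet
 * message plus the largest per-packet metadata that may be attached to it:
 * hash value, VLAN info, LSOv2 info and TX checksum info.
 */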
#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)		\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
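
/* Distribute ring indices round-robin over CPUs, starting at the leader CPU. */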
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004
struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
static int hn_probe(device_t);
static int hn_attach(device_t);
static int hn_detach(device_t);
static int hn_shutdown(device_t);
static void hn_chan_callback(struct vmbus_channel *, void *);

static void hn_init(void *);
static int hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void hn_start(struct ifnet *);
#endif
static int hn_transmit(struct ifnet *, struct mbuf *);
static void hn_xmit_qflush(struct ifnet *);
static int hn_ifmedia_upd(struct ifnet *);
static void hn_ifmedia_sts(struct ifnet *, struct ifmediareq *);

static void hn_ifnet_event(void *, struct ifnet *, int);
static void hn_ifaddr_event(void *, struct ifnet *);
static void hn_ifnet_attevent(void *, struct ifnet *);
static void hn_ifnet_detevent(void *, struct ifnet *);
static void hn_ifnet_lnkevent(void *, struct ifnet *, int);

static bool hn_ismyvf(const struct hn_softc *, const struct ifnet *);
static void hn_rxvf_change(struct hn_softc *, struct ifnet *, bool);
static void hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void hn_rxvf_set_task(void *, int);
static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
static int hn_xpnt_vf_iocsetflags(struct hn_softc *);
static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, struct ifreq *);
static void hn_xpnt_vf_saveifflags(struct hn_softc *);
static bool hn_xpnt_vf_isready(struct hn_softc *);
static void hn_xpnt_vf_setready(struct hn_softc *);
static void hn_xpnt_vf_init_taskfunc(void *, int);
static void hn_xpnt_vf_init(struct hn_softc *);
static void hn_xpnt_vf_setenable(struct hn_softc *);
static void hn_xpnt_vf_setdisable(struct hn_softc *, bool);
static void hn_vf_rss_fixup(struct hn_softc *, bool);
static void hn_vf_rss_restore(struct hn_softc *);

static int hn_rndis_rxinfo(const void *, int, struct hn_rxinfo *);
static void hn_rndis_rx_data(struct hn_rx_ring *, const void *, int);
static void hn_rndis_rx_status(struct hn_softc *, const void *, int);
static void hn_rndis_init_fixat(struct hn_softc *, int);

static void hn_nvs_handle_notify(struct hn_softc *,
    const struct vmbus_chanpkt_hdr *);
static void hn_nvs_handle_comp(struct hn_softc *, struct vmbus_channel *,
    const struct vmbus_chanpkt_hdr *);
static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, struct vmbus_channel *,
    const struct vmbus_chanpkt_hdr *);
static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, struct vmbus_channel *,
    uint64_t);

#if __FreeBSD_version >= 1100099
static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);

static void hn_stop(struct hn_softc *, bool);
static void hn_init_locked(struct hn_softc *);
static int hn_chan_attach(struct hn_softc *, struct vmbus_channel *);
static void hn_chan_detach(struct hn_softc *, struct vmbus_channel *);
static int hn_attach_subchans(struct hn_softc *);
static void hn_detach_allchans(struct hn_softc *);
static void hn_chan_rollup(struct hn_rx_ring *, struct hn_tx_ring *);
static void hn_set_ring_inuse(struct hn_softc *, int);
static int hn_synth_attach(struct hn_softc *, int);
static void hn_synth_detach(struct hn_softc *);
static int hn_synth_alloc_subchans(struct hn_softc *, int *);
static bool hn_synth_attachable(const struct hn_softc *);
static void hn_suspend(struct hn_softc *);
static void hn_suspend_data(struct hn_softc *);
static void hn_suspend_mgmt(struct hn_softc *);
static void hn_resume(struct hn_softc *);
static void hn_resume_data(struct hn_softc *);
static void hn_resume_mgmt(struct hn_softc *);
static void hn_suspend_mgmt_taskfunc(void *, int);
static void hn_chan_drain(struct hn_softc *, struct vmbus_channel *);
static void hn_disable_rx(struct hn_softc *);
static void hn_drain_rxtx(struct hn_softc *, int);
static void hn_polling(struct hn_softc *, u_int);
static void hn_chan_polling(struct vmbus_channel *, u_int);
static void hn_mtu_change_fixup(struct hn_softc *);

static void hn_update_link_status(struct hn_softc *);
static void hn_change_network(struct hn_softc *);
static void hn_link_taskfunc(void *, int);
static void hn_netchg_init_taskfunc(void *, int);
static void hn_netchg_status_taskfunc(void *, int);
static void hn_link_status(struct hn_softc *);

static int hn_create_rx_data(struct hn_softc *, int);
static void hn_destroy_rx_data(struct hn_softc *);
static int hn_check_iplen(const struct mbuf *, int);
static int hn_set_rxfilter(struct hn_softc *, uint32_t);
static int hn_rxfilter_config(struct hn_softc *);
static int hn_rss_reconfig(struct hn_softc *);
static void hn_rss_ind_fixup(struct hn_softc *);
static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
static int hn_rxpkt(struct hn_rx_ring *, const void *, int,
    const struct hn_rxinfo *);
static uint32_t hn_rss_type_fromndis(uint32_t);
static uint32_t hn_rss_type_tondis(uint32_t);

static int hn_tx_ring_create(struct hn_softc *, int);
static void hn_tx_ring_destroy(struct hn_tx_ring *);
static int hn_create_tx_data(struct hn_softc *, int);
static void hn_fixup_tx_data(struct hn_softc *);
static void hn_destroy_tx_data(struct hn_softc *);
static void hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void hn_txdesc_gc(struct hn_tx_ring *, struct hn_txdesc *);
static int hn_encap(struct ifnet *, struct hn_tx_ring *,
    struct hn_txdesc *, struct mbuf **);
static int hn_txpkt(struct ifnet *, struct hn_tx_ring *,
    struct hn_txdesc *);
static void hn_set_chim_size(struct hn_softc *, int);
static void hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool hn_tx_ring_pending(struct hn_tx_ring *);
static void hn_tx_ring_qflush(struct hn_tx_ring *);
static void hn_resume_tx(struct hn_softc *, int);
static void hn_set_txagg(struct hn_softc *);
static void *hn_try_txagg(struct ifnet *, struct hn_tx_ring *,
    struct hn_txdesc *, int);
static int hn_get_txswq_depth(const struct hn_tx_ring *);
static void hn_txpkt_done(struct hn_nvs_sendctx *, struct hn_softc *,
    struct vmbus_channel *, const void *, int);
static int hn_txpkt_sglist(struct hn_tx_ring *, struct hn_txdesc *);
static int hn_txpkt_chim(struct hn_tx_ring *, struct hn_txdesc *);
static int hn_xmit(struct hn_tx_ring *, int);
static void hn_xmit_taskfunc(void *, int);
static void hn_xmit_txeof(struct hn_tx_ring *);
static void hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int hn_start_locked(struct hn_tx_ring *, int);
static void hn_start_taskfunc(void *, int);
static void hn_start_txeof(struct hn_tx_ring *);
static void hn_start_txeof_taskfunc(void *, int);
#endif
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust tcp segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust udp datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vflist_sysctl, "A", "VF list");

SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vfmap_sysctl, "A", "VF mapping");

static int hn_xpnt_vf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;	/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;

static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),

	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
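
/*
 * Allocate one chimney sending buffer: scan the allocation bitmap for a
 * clear bit and claim it with an atomic test-and-set, so no lock is
 * required.  Returns HN_NVS_CHIM_IDX_INVALID if all buffers are busy.
 */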
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}
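
/*
 * Free a chimney sending buffer: assert that its index is within the
 * bitmap and currently marked busy, then clear the bit atomically.
 */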
static void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
#if defined(INET6) || defined(INET)

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
}
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_set_hlen(struct mbuf *m_head)
{
	const struct ether_vlan_header *evl;
	int ehlen;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
	}
#endif
	return (m_head);
}
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct tcphdr *th;
	int ehlen, iphlen;

	*tcpsyn = 0;
	ehlen = m_head->m_pkthdr.l2hlen;
	iphlen = m_head->m_pkthdr.l3hlen;

	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
	th = mtodo(m_head, ehlen + iphlen);
	if (th->th_flags & TH_SYN)
		*tcpsyn = 1;
	return (m_head);
}

#endif /* INET6 || INET */
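
/*
 * Update the RNDIS RX filter only when it differs from the cached value,
 * and cache the new filter on success.
 */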
static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}
static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * promiscuous mode.
	 */
	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}
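
/*
 * Compute the effective transmit aggregation limits: the size limit is
 * clamped by the user setting, the limit offered by RNDIS and the
 * chimney buffer size, while the packet-count limit is clamped by the
 * user setting and the RNDIS-offered count.  The results are then
 * propagated to every TX ring under its lock.
 */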
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable. */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable. */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable. */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}
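
/*
 * Reprogram RSS on the synthetic device.  Updating the key or the
 * indirect table in place with the UNCHG flags does not work, so RSS
 * is disabled first and then re-enabled with the updated settings.
 */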
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}

static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return (EOPNOTSUPP);
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}
static void
hn_rxvf_set_task(void *xarg, int pending __unused)
{
	struct hn_rxvf_setarg *arg = xarg;

	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
}

static void
hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
{
	struct hn_rx_ring *rxr;
	struct hn_rxvf_setarg arg;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			arg.rxr = rxr;
			arg.vf_ifp = vf_ifp;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_rxvf_ifp = vf_ifp;
		}
	}
}
static bool
hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
{
	const struct ifnet *hn_ifp;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		return (false);

	if (ifp->if_alloctype != IFT_ETHER)
		return (false);

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		return (false);

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		return (false);

	return (true);
}
static void
hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	if (!hn_ismyvf(sc, ifp))
		goto out;
	hn_ifp = sc->hn_ifp;

	if (rxvf) {
		if (sc->hn_flags & HN_FLAG_RXVF)
			goto out;

		sc->hn_flags |= HN_FLAG_RXVF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_RXVF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_RXVF;
		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);

	hn_rxvf_set(sc, rxvf ? ifp : NULL);

	if (rxvf) {
		hn_vf_rss_fixup(sc, true);
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_vf_rss_restore(sc);
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
	    rxvf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "datapath is switched %s %s\n",
		    rxvf ? "to" : "from", ifp->if_xname);
	}
out:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
{

	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;
	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
}

static void
hn_ifaddr_event(void *arg, struct ifnet *ifp)
{

	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
}
static int
hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
{
	struct ifnet *ifp, *vf_ifp;
	uint64_t tmp;
	int error;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Fix up requested capabilities w/ supported capabilities,
	 * since the supported capabilities could have been changed.
	 */
	ifr->ifr_reqcap &= ifp->if_capabilities;
	/* Pass SIOCSIFCAP to VF. */
	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);

	/*
	 * NOTE:
	 * The error will be propagated to the callers, however, it
	 * is _not_ useful here.
	 */

	/*
	 * Merge VF's enabled capabilities.
	 */
	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
	if (ifp->if_capenable & IFCAP_TSO4)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
	if (ifp->if_capenable & IFCAP_TSO6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	return (error);
}
static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	struct ifnet *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
}
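
/*
 * Mirror hn(4)'s if_flags into the VF's if_flags, forcing IFF_ALLMULTI
 * if hn(4) has any multicast address configured.
 */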
static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
}
static void
hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	struct ifnet *hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (vf_ifp->if_index < hn_vfmap_size)
		hn_ifp = hn_vfmap[vf_ifp->if_index];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/*
			 * Allow tapping on the VF.
			 */
			ETHER_BPF_MTAP(vf_ifp, mn);

			/*
			 * Update VF stats.
			 */
			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
				    mn->m_pkthdr.len);
			}
			/*
			 * XXX IFCOUNTER_IMCAST
			 * This stat updating is kinda invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check.  As of this write,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF a try in vain.
			 */

			/*
			 * Fix up rcvif and increase hn(4)'s ipackets.
			 */
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		/*
		 * Go through hn(4)'s if_input.
		 */
		hn_ifp->if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * mbuf chain.
		 */
		while (m != NULL) {
			mn = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
			m = mn;
		}
	}
}
static void
hn_mtu_change_fixup(struct hn_softc *sc)
{
	struct ifnet *ifp;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;

	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
#endif
}
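
/*
 * Translate between the NDIS hash-type bits advertised by the host and
 * the kernel's software RSS_TYPE_ bits.  The UDP hash types have no
 * NDIS counterpart here.
 */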
static uint32_t
hn_rss_type_fromndis(uint32_t rss_hash)
{
	uint32_t types = 0;

	if (rss_hash & NDIS_HASH_IPV4)
		types |= RSS_TYPE_IPV4;
	if (rss_hash & NDIS_HASH_TCP_IPV4)
		types |= RSS_TYPE_TCP_IPV4;
	if (rss_hash & NDIS_HASH_IPV6)
		types |= RSS_TYPE_IPV6;
	if (rss_hash & NDIS_HASH_IPV6_EX)
		types |= RSS_TYPE_IPV6_EX;
	if (rss_hash & NDIS_HASH_TCP_IPV6)
		types |= RSS_TYPE_TCP_IPV6;
	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
		types |= RSS_TYPE_TCP_IPV6_EX;
	return (types);
}

static uint32_t
hn_rss_type_tondis(uint32_t types)
{
	uint32_t rss_hash = 0;

	KASSERT((types &
	    (RSS_TYPE_UDP_IPV4 | RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
	    ("UDP4, UDP6 and UDP6EX are not supported"));

	if (types & RSS_TYPE_IPV4)
		rss_hash |= NDIS_HASH_IPV4;
	if (types & RSS_TYPE_TCP_IPV4)
		rss_hash |= NDIS_HASH_TCP_IPV4;
	if (types & RSS_TYPE_IPV6)
		rss_hash |= NDIS_HASH_IPV6;
	if (types & RSS_TYPE_IPV6_EX)
		rss_hash |= NDIS_HASH_IPV6_EX;
	if (types & RSS_TYPE_TCP_IPV6)
		rss_hash |= NDIS_HASH_TCP_IPV6;
	if (types & RSS_TYPE_TCP_IPV6_EX)
		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
	return (rss_hash);
}
static void
hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
{
	int i;

	HN_LOCK_ASSERT(sc);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
}
static void
hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifrsshash ifrh;
	struct ifrsskey ifrk;
	int error;
	uint32_t my_types, diff_types, mbuf_types = 0;

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1) {
		/* No RSS on synthetic parts; done. */
		return;
	}
	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
		/* Synthetic parts do not support Toeplitz; done. */
		return;
	}

	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
	 * supported.
	 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
	if (error) {
		if_printf(ifp, "%s SIOCGRSSKEY failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrk.ifrk_func);
		goto done;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    vf_ifp->if_xname, ifrk.ifrk_keylen);
		goto done;
	}

	/*
	 * Extract VF's RSS hash.  Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
	if (error) {
		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrh.ifrh_func);
		goto done;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed. "
		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
		    ifrh.ifrh_types, my_types);
		goto done;
	}

	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type confliction.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on RX path.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
	}

	/*
	 * NOTE:
	 * Indirect table does not matter.
	 */

	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
	    hn_rss_type_tondis(my_types);
	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
	if (reconf) {
		error = hn_rss_reconfig(sc);
		if (error) {
			/* XXX roll-back? */
			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
			/* XXX keep going. */
		}
	}
done:
	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
}
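
/*
 * Restore the synthetic device's own RSS hash types once the VF is
 * gone, and re-enable hash value/type delivery for all mbufs.
 */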
static void
hn_vf_rss_restore(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1)
		goto done;

	/*
	 * Restore hash types.  Key does _not_ matter.
	 */
	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
		int error;

		sc->hn_rss_hash = sc->hn_rss_hcap;
		error = hn_rss_reconfig(sc);
		if (error) {
			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
			    error);
			/* XXX keep going. */
		}
	}
done:
	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
}
static void
hn_xpnt_vf_setready(struct hn_softc *sc)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Mark the VF ready.
	 */
	sc->hn_vf_rdytick = 0;

	/*
	 * Save information for restoration.
	 */
	sc->hn_saved_caps = ifp->if_capabilities;
	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;

	/*
	 * Intersect supported/enabled capabilities.
	 *
	 * NOTE:
	 * if_hwassist is not changed here.
	 */
	ifp->if_capabilities &= vf_ifp->if_capabilities;
	ifp->if_capenable &= ifp->if_capabilities;

	/*
	 * Fix TSO settings.
	 */
	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;

	/*
	 * Change VF's enabled capabilities.
	 */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_reqcap = ifp->if_capenable;
	hn_xpnt_vf_iocsetcaps(sc, &ifr);

	if (ifp->if_mtu != ETHERMTU) {
		int error;

		/*
		 * Change VF's MTU.
		 */
		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
		ifr.ifr_mtu = ifp->if_mtu;
		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
		if (error) {
			if_printf(ifp, "%s SIOCSIFMTU %lu failed\n",
			    vf_ifp->if_xname, ifp->if_mtu);
			if (ifp->if_mtu > ETHERMTU) {
				if_printf(ifp, "change MTU to %d\n", ETHERMTU);

				/*
				 * XXX
				 * No need to adjust the synthetic parts' MTU;
				 * failure of the adjustment will cause us
				 * infinite headache.
				 */
				ifp->if_mtu = ETHERMTU;
				hn_mtu_change_fixup(sc);
			}
		}
	}
}
static bool
hn_xpnt_vf_isready(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);

	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
		return (false);

	if (sc->hn_vf_rdytick == 0)
		return (true);

	if (sc->hn_vf_rdytick > ticks)
		return (false);

	/* Mark VF as ready. */
	hn_xpnt_vf_setready(sc);
	return (true);
}
static void
hn_xpnt_vf_setenable(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
}

static void
hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
	if (clear_vf)
		sc->hn_vf_ifp = NULL;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
}
static void
hn_xpnt_vf_init(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));

	if (bootverbose) {
		if_printf(sc->hn_ifp, "try bringing up %s\n",
		    sc->hn_vf_ifp->if_xname);
	}

	/*
	 * Bring the VF up.
	 */
	hn_xpnt_vf_saveifflags(sc);
	sc->hn_vf_ifp->if_flags |= IFF_UP;
	error = hn_xpnt_vf_iocsetflags(sc);
	if (error) {
		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
		    sc->hn_vf_ifp->if_xname, error);
		return;
	}

	/*
	 * NOTE:
	 * Datapath setting must happen _after_ bringing the VF up.
	 */
	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);

	/*
	 * NOTE:
	 * Fixup RSS related bits _after_ the VF is brought up, since
	 * many VFs generate the RSS key during their initialization.
	 */
	hn_vf_rss_fixup(sc, true);

	/* Mark transparent mode VF as enabled. */
	hn_xpnt_vf_setenable(sc);
}
static void
hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		goto done;
	if (sc->hn_vf_ifp == NULL)
		goto done;
	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
		goto done;

	if (sc->hn_vf_rdytick != 0) {
		/* Mark VF as ready. */
		hn_xpnt_vf_setready(sc);
	}

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
		/*
		 * Delayed VF initialization.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "delayed initialize %s\n",
			    sc->hn_vf_ifp->if_xname);
		}
		hn_xpnt_vf_init(sc);
	}
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    sc->hn_vf_ifp->if_xname);
		goto done;
	}

	if (hn_xpnt_vf && ifp->if_start != NULL) {
		/*
		 * ifnet.if_start is _not_ supported by transparent
		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
		 */
		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
		    "in transparent VF mode.\n", ifp->if_xname);
		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	if (ifp->if_index >= hn_vfmap_size) {
		struct ifnet **newmap;
		int newsize;

		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);

		memcpy(newmap, hn_vfmap,
		    sizeof(struct ifnet *) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);

		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}
	KASSERT(hn_vfmap[ifp->if_index] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
	hn_vfmap[ifp->if_index] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
	sc->hn_vf_ifp = ifp;
	rm_wunlock(&sc->hn_vf_lock);

	if (hn_xpnt_vf) {
		int wait_ticks;

		/*
		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
		 * Save vf_ifp's current if_input for later restoration.
		 */
		sc->hn_vf_input = ifp->if_input;
		ifp->if_input = hn_xpnt_vf_input;

		/*
		 * Stop link status management; use the VF's.
		 */
		hn_suspend_mgmt(sc);

		/*
		 * Give the VF some time to complete its attach routine.
		 */
		wait_ticks = hn_xpnt_vf_attwait * hz;
		sc->hn_vf_rdytick = ticks + wait_ticks;

		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
		    wait_ticks);
	}
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (sc->hn_vf_ifp == NULL)
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (hn_xpnt_vf) {
		/*
		 * Make sure that the delayed initialization is not running.
		 *
		 * NOTE:
		 * - This lock _must_ be released, since the hn_vf_init task
		 *   will try holding this lock.
		 * - It is safe to release this lock here, since the
		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
		 *
		 * XXX racy, if hn(4) ever detached.
		 */
		HN_UNLOCK(sc);
		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
		HN_LOCK(sc);

		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
		    sc->hn_ifp->if_xname));
		ifp->if_input = sc->hn_vf_input;
		sc->hn_vf_input = NULL;

		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);

		if (sc->hn_vf_rdytick == 0) {
			/*
			 * The VF was ready; restore some settings.
			 */
			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
			/*
			 * NOTE:
			 * There is _no_ need to fixup if_capenable and
			 * if_hwassist, since the if_capabilities before
			 * restoration was an intersection of the VF's
			 * if_capabilities and the synthetic device's
			 * if_capabilities.
			 */
			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
			sc->hn_ifp->if_hw_tsomaxsegcount =
			    sc->hn_saved_tsosegcnt;
			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
		}

		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			/*
			 * Restore RSS settings.
			 */
			hn_vf_rss_restore(sc);

			/*
			 * Resume link status management, which was suspended
			 * by hn_ifnet_attevent().
			 */
			hn_resume_mgmt(sc);
		}
	}

	/* Mark transparent mode VF as disabled. */
	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);

	rm_wlock(&hn_vfmap_lock);

	KASSERT(ifp->if_index < hn_vfmap_size,
	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
	if (hn_vfmap[ifp->if_index] != NULL) {
		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
		    ("%s: ifindex %d was mapped to %s",
		     ifp->if_xname, ifp->if_index,
		     hn_vfmap[ifp->if_index]->if_xname));
		hn_vfmap[ifp->if_index] = NULL;
	}

	rm_wunlock(&hn_vfmap_lock);
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_vf_ifp == ifp)
		if_link_state_change(sc->hn_ifp, link_state);
}
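
/* Match the Hyper-V network device GUID advertised on the vmbus. */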
static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}
static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;
	uint32_t mtu;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);
	rm_init(&sc->hn_vf_lock, "hnvf");
	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	if (hn_xpnt_vf) {
		/*
		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
		 */
		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
		    device_get_nameunit(dev));
		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
		    hn_xpnt_vf_init_taskfunc, sc);
	}

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions, which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if error happened later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

	error = hn_rndis_get_mtu(sc, &mtu);
	if (error)
		mtu = ETHERMTU;
	else if (bootverbose)
		device_printf(dev, "RNDIS mtu %u\n", mtu);

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fixup TX stuffs after synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);

	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
	    "max # of TSO segments");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
	    "max size of TSO segment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disable polling");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_vf_sysctl, "A", "Virtual Function's name");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_xpnt_vf_enabled_sysctl, "I",
	    "Transparent VF enabled");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_xpnt_vf_accbpf_sysctl, "I",
	    "Accurate BPF for transparent VF");
2263 * Set up the ifmedia, which has been initialized earlier.
2265 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2266 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2267 /* XXX ifmedia_set really should do this for us */
2268 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2271 * Set up the ifnet for this interface.
2275 ifp->if_baudrate = IF_Gbps(10);
2277 /* if_baudrate is 32bits on 32bit system. */
2278 ifp->if_baudrate = IF_Gbps(1);
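/*
 * Worked example of the overflow avoided above: IF_Gbps(10) is
 * 10 * 10^9 == 1e10, which exceeds UINT32_MAX (~4.29e9) and thus
 * cannot be stored in a 32-bit if_baudrate; 1 Gbps (1e9) fits.
 */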
2280 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2281 ifp->if_ioctl = hn_ioctl;
2282 ifp->if_init = hn_init;
2283 #ifdef HN_IFSTART_SUPPORT
2284 if (hn_use_if_start) {
2285 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2287 ifp->if_start = hn_start;
2288 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2289 ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2290 IFQ_SET_READY(&ifp->if_snd);
2294 ifp->if_transmit = hn_transmit;
2295 ifp->if_qflush = hn_xmit_qflush;
2298 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2300 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
2301 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2303 if (sc->hn_caps & HN_CAP_VLAN) {
2304 /* XXX not sure about VLAN_MTU. */
2305 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2308 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2309 if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2310 ifp->if_capabilities |= IFCAP_TXCSUM;
2311 if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2312 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2313 if (sc->hn_caps & HN_CAP_TSO4) {
2314 ifp->if_capabilities |= IFCAP_TSO4;
2315 ifp->if_hwassist |= CSUM_IP_TSO;
2317 if (sc->hn_caps & HN_CAP_TSO6) {
2318 ifp->if_capabilities |= IFCAP_TSO6;
2319 ifp->if_hwassist |= CSUM_IP6_TSO;
2322 /* Enable all available capabilities by default. */
2323 ifp->if_capenable = ifp->if_capabilities;
2326 * Disable IPv6 TSO and TXCSUM by default; they can still
2327 * be enabled through SIOCSIFCAP.
2329 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2330 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2332 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2334 * Lock hn_set_tso_maxsize() to simplify its callers.
2338 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2340 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2341 ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2344 ether_ifattach(ifp, eaddr);
2346 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2347 if_printf(ifp, "TSO segcnt %u segsz %u\n",
2348 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2350 if (mtu < ETHERMTU) {
2351 if_printf(ifp, "fixup mtu %lu -> %u\n", ifp->if_mtu, mtu);
2355 /* Inform the upper layer about the long frame support. */
2356 ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2359 * Kick off link status check.
2361 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2362 hn_update_link_status(sc);
2365 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2366 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2367 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2368 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2370 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2371 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2376 * Subscribe to the ether_ifattach event, instead of the ifnet_arrival
2377 * event, since the interface's LLADDR is needed; the interface LLADDR
2378 * is not available when the ifnet_arrival event is triggered.
2380 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2381 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2382 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2383 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2387 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2388 hn_synth_detach(sc);
2394 hn_detach(device_t dev)
2396 struct hn_softc *sc = device_get_softc(dev);
2397 struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2399 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2401 * In case the vmbus missed the orphan handler installation.
2404 vmbus_xact_ctx_orphan(sc->hn_xact);
2407 if (sc->hn_ifaddr_evthand != NULL)
2408 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2409 if (sc->hn_ifnet_evthand != NULL)
2410 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2411 if (sc->hn_ifnet_atthand != NULL) {
2412 EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2413 sc->hn_ifnet_atthand);
2415 if (sc->hn_ifnet_dethand != NULL) {
2416 EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2417 sc->hn_ifnet_dethand);
2419 if (sc->hn_ifnet_lnkhand != NULL)
2420 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2422 vf_ifp = sc->hn_vf_ifp;
2423 __compiler_membar();
2425 hn_ifnet_detevent(sc, vf_ifp);
2427 if (device_is_attached(dev)) {
2429 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2430 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2434 * hn_stop() only suspends data, so management
2435 * tasks have to be suspended manually here.
2437 hn_suspend_mgmt(sc);
2438 hn_synth_detach(sc);
2441 ether_ifdetach(ifp);
2444 ifmedia_removeall(&sc->hn_media);
2445 hn_destroy_rx_data(sc);
2446 hn_destroy_tx_data(sc);
2448 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2451 for (i = 0; i < hn_tx_taskq_cnt; ++i)
2452 taskqueue_free(sc->hn_tx_taskqs[i]);
2453 free(sc->hn_tx_taskqs, M_DEVBUF);
2455 taskqueue_free(sc->hn_mgmt_taskq0);
2456 if (sc->hn_vf_taskq != NULL)
2457 taskqueue_free(sc->hn_vf_taskq);
2459 if (sc->hn_xact != NULL) {
2461 * Uninstall the orphan handler _before_ the xact is destroyed.
2464 vmbus_chan_unset_orphan(sc->hn_prichan);
2465 vmbus_xact_ctx_destroy(sc->hn_xact);
2470 HN_LOCK_DESTROY(sc);
2471 rm_destroy(&sc->hn_vf_lock);
2476 hn_shutdown(device_t dev)
2483 hn_link_status(struct hn_softc *sc)
2485 uint32_t link_status;
2488 error = hn_rndis_get_linkstatus(sc, &link_status);
2490 /* XXX what to do? */
2494 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2495 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2497 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2498 if_link_state_change(sc->hn_ifp,
2499 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2500 LINK_STATE_UP : LINK_STATE_DOWN);
2504 hn_link_taskfunc(void *xsc, int pending __unused)
2506 struct hn_softc *sc = xsc;
2508 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2514 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2516 struct hn_softc *sc = xsc;
2518 /* Prevent any link status checks from running. */
2519 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2522 * Fake up a [link down --> link up] state change; a 5 second
2523 * delay is used, which closely simulates the miibus reaction
2524 * to a link down event.
2526 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2527 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2528 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2529 &sc->hn_netchg_status, 5 * hz);
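/*
 * NOTE: hz is the kernel tick rate (ticks per second), so the
 * "5 * hz" tick count above is a 5 second delay regardless of the
 * kern.hz setting.
 */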
2533 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2535 struct hn_softc *sc = xsc;
2537 /* Re-allow link status checks. */
2538 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2543 hn_update_link_status(struct hn_softc *sc)
2546 if (sc->hn_mgmt_taskq != NULL)
2547 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2551 hn_change_network(struct hn_softc *sc)
2554 if (sc->hn_mgmt_taskq != NULL)
2555 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2559 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2560 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2562 struct mbuf *m = *m_head;
2565 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2567 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2568 m, segs, nsegs, BUS_DMA_NOWAIT);
2569 if (error == EFBIG) {
2572 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2576 *m_head = m = m_new;
2577 txr->hn_tx_collapsed++;
2579 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2580 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2583 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2584 BUS_DMASYNC_PREWRITE);
2585 txd->flags |= HN_TXD_FLAG_DMAMAP;
2591 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2594 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2595 ("put an onlist txd %#x", txd->flags));
2596 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2597 ("put an onagg txd %#x", txd->flags));
2599 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2600 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2603 if (!STAILQ_EMPTY(&txd->agg_list)) {
2604 struct hn_txdesc *tmp_txd;
2606 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2609 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2610 ("resursive aggregation on aggregated txdesc"));
2611 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2612 ("not aggregated txdesc"));
2613 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2614 ("aggregated txdesc uses dmamap"));
2615 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2616 ("aggregated txdesc consumes "
2617 "chimney sending buffer"));
2618 KASSERT(tmp_txd->chim_size == 0,
2619 ("aggregated txdesc has non-zero "
2620 "chimney sending size"));
2622 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2623 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2624 freed = hn_txdesc_put(txr, tmp_txd);
2625 KASSERT(freed, ("failed to free aggregated txdesc"));
2629 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2630 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2631 ("chim txd uses dmamap"));
2632 hn_chim_free(txr->hn_sc, txd->chim_index);
2633 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2635 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2636 bus_dmamap_sync(txr->hn_tx_data_dtag,
2637 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2638 bus_dmamap_unload(txr->hn_tx_data_dtag,
2640 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2643 if (txd->m != NULL) {
2648 txd->flags |= HN_TXD_FLAG_ONLIST;
2649 #ifndef HN_USE_TXDESC_BUFRING
2650 mtx_lock_spin(&txr->hn_txlist_spin);
2651 KASSERT(txr->hn_txdesc_avail >= 0 &&
2652 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2653 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2654 txr->hn_txdesc_avail++;
2655 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2656 mtx_unlock_spin(&txr->hn_txlist_spin);
2657 #else /* HN_USE_TXDESC_BUFRING */
2659 atomic_add_int(&txr->hn_txdesc_avail, 1);
2661 buf_ring_enqueue(txr->hn_txdesc_br, txd);
2662 #endif /* !HN_USE_TXDESC_BUFRING */
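/*
 * Design note: the HN_USE_TXDESC_BUFRING variant recycles the txd
 * through buf_ring(9), a lock-less multi-producer ring, instead of a
 * spinlock-protected SLIST freelist; the buf_ring_enqueue() above
 * pairs with the single-consumer buf_ring_dequeue_sc() in
 * hn_txdesc_get() below.
 */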
2667 static __inline struct hn_txdesc *
2668 hn_txdesc_get(struct hn_tx_ring *txr)
2670 struct hn_txdesc *txd;
2672 #ifndef HN_USE_TXDESC_BUFRING
2673 mtx_lock_spin(&txr->hn_txlist_spin);
2674 txd = SLIST_FIRST(&txr->hn_txlist);
2676 KASSERT(txr->hn_txdesc_avail > 0,
2677 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2678 txr->hn_txdesc_avail--;
2679 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2681 mtx_unlock_spin(&txr->hn_txlist_spin);
2683 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2687 #ifdef HN_USE_TXDESC_BUFRING
2689 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2691 #endif /* HN_USE_TXDESC_BUFRING */
2692 KASSERT(txd->m == NULL && txd->refs == 0 &&
2693 STAILQ_EMPTY(&txd->agg_list) &&
2694 txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2695 txd->chim_size == 0 &&
2696 (txd->flags & HN_TXD_FLAG_ONLIST) &&
2697 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2698 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2699 txd->flags &= ~HN_TXD_FLAG_ONLIST;
2705 static __inline void
2706 hn_txdesc_hold(struct hn_txdesc *txd)
2709 /* 0->1 transition will never work */
2710 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2711 atomic_add_int(&txd->refs, 1);
2714 static __inline void
2715 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2718 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2719 ("recursive aggregation on aggregating txdesc"));
2721 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2722 ("already aggregated"));
2723 KASSERT(STAILQ_EMPTY(&txd->agg_list),
2724 ("recursive aggregation on to-be-aggregated txdesc"));
2726 txd->flags |= HN_TXD_FLAG_ONAGG;
2727 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2731 hn_tx_ring_pending(struct hn_tx_ring *txr)
2733 bool pending = false;
2735 #ifndef HN_USE_TXDESC_BUFRING
2736 mtx_lock_spin(&txr->hn_txlist_spin);
2737 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2739 mtx_unlock_spin(&txr->hn_txlist_spin);
2741 if (!buf_ring_full(txr->hn_txdesc_br))
2747 static __inline void
2748 hn_txeof(struct hn_tx_ring *txr)
2750 txr->hn_has_txeof = 0;
2755 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2756 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2758 struct hn_txdesc *txd = sndc->hn_cbarg;
2759 struct hn_tx_ring *txr;
2762 KASSERT(txr->hn_chan == chan,
2763 ("channel mismatch, on chan%u, should be chan%u",
2764 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2766 txr->hn_has_txeof = 1;
2767 hn_txdesc_put(txr, txd);
2769 ++txr->hn_txdone_cnt;
2770 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2771 txr->hn_txdone_cnt = 0;
2772 if (txr->hn_oactive)
2778 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2780 #if defined(INET) || defined(INET6)
2781 struct lro_ctrl *lro = &rxr->hn_lro;
2782 struct lro_entry *queued;
2784 while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
2785 SLIST_REMOVE_HEAD(&lro->lro_active, next);
2786 tcp_lro_flush(lro, queued);
2792 * 'txr' could be NULL if multiple channels and the
2793 * ifnet.if_start method are enabled.
2795 if (txr == NULL || !txr->hn_has_txeof)
2798 txr->hn_txdone_cnt = 0;
2802 static __inline uint32_t
2803 hn_rndis_pktmsg_offset(uint32_t ofs)
2806 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2807 ("invalid RNDIS packet msg offset %u", ofs));
2808 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
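/*
 * A sketch of the conversion above, assuming the standard layout of
 * struct rndis_packet_msg (11 32-bit fields): on-wire RNDIS offsets
 * count from rm_dataoffset, i.e. they skip the 8 bytes occupied by
 * rm_type and rm_len, so:
 *
 *	hn_rndis_pktmsg_offset(sizeof(struct rndis_packet_msg))
 *	    == 44 - 8 == 36
 */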
2811 static __inline void *
2812 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2813 size_t pi_dlen, uint32_t pi_type)
2815 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2816 struct rndis_pktinfo *pi;
2818 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2819 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2822 * Per-packet-info does not move; it only grows.
2825 * rm_pktinfooffset in this phase counts from the beginning
2826 * of rndis_packet_msg.
2828 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2829 ("%u pktinfo overflows RNDIS packet msg", pi_type));
2830 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2831 pkt->rm_pktinfolen);
2832 pkt->rm_pktinfolen += pi_size;
2834 pi->rm_size = pi_size;
2835 pi->rm_type = pi_type;
2836 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2838 return (pi->rm_data);
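/*
 * Example usage, mirroring the hash-value pktinfo setup in hn_encap()
 * below; the returned pointer addresses the pktinfo data area:
 *
 *	uint32_t *pi_data;
 *
 *	pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
 *	    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
 *	*pi_data = txr->hn_tx_idx;
 */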
2842 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2844 struct hn_txdesc *txd;
2848 txd = txr->hn_agg_txd;
2849 KASSERT(txd != NULL, ("no aggregate txdesc"));
2852 * Since hn_txpkt() will reset this temporary stat, save
2853 * it now, so that oerrors can be updated properly if
2854 * hn_txpkt() ever fails.
2856 pkts = txr->hn_stat_pkts;
2859 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2860 * failure, save it for later freeing if hn_txpkt() ever fails.
2864 error = hn_txpkt(ifp, txr, txd);
2865 if (__predict_false(error)) {
2866 /* txd is freed, but m is not. */
2869 txr->hn_flush_failed++;
2870 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2873 /* Reset all aggregation states. */
2874 txr->hn_agg_txd = NULL;
2875 txr->hn_agg_szleft = 0;
2876 txr->hn_agg_pktleft = 0;
2877 txr->hn_agg_prevpkt = NULL;
2883 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2888 if (txr->hn_agg_txd != NULL) {
2889 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2890 struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2891 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2895 * Update the previous RNDIS packet's total length;
2896 * it can be increased due to the mandatory alignment
2897 * padding for this RNDIS packet. And update the
2898 * aggregating txdesc's chimney sending buffer size accordingly.
2902 * Zero-out the padding, as required by the RNDIS spec.
2905 pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2906 agg_txd->chim_size += pkt->rm_len - olen;
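/*
 * Illustrative numbers: with olen == 122 and hn_agg_align == 8,
 * roundup2() yields rm_len == 128, so chim_size grows by the 6
 * padding bytes zeroed above.
 */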
2908 /* Link this txdesc to the parent. */
2909 hn_txdesc_agg(agg_txd, txd);
2911 chim = (uint8_t *)pkt + pkt->rm_len;
2912 /* Save the current packet for later fixup. */
2913 txr->hn_agg_prevpkt = chim;
2915 txr->hn_agg_pktleft--;
2916 txr->hn_agg_szleft -= pktsize;
2917 if (txr->hn_agg_szleft <=
2918 HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2920 * Probably can't aggregate more packets;
2921 * flush this aggregating txdesc proactively.
2923 txr->hn_agg_pktleft = 0;
2928 hn_flush_txagg(ifp, txr);
2930 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
2932 txr->hn_tx_chimney_tried++;
2933 txd->chim_index = hn_chim_alloc(txr->hn_sc);
2934 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
2936 txr->hn_tx_chimney++;
2938 chim = txr->hn_sc->hn_chim +
2939 (txd->chim_index * txr->hn_sc->hn_chim_szmax);
2941 if (txr->hn_agg_pktmax > 1 &&
2942 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2943 txr->hn_agg_txd = txd;
2944 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
2945 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
2946 txr->hn_agg_prevpkt = chim;
2953 * If this function fails, then both txd and m_head0 will be freed.
2956 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2957 struct mbuf **m_head0)
2959 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
2960 int error, nsegs, i;
2961 struct mbuf *m_head = *m_head0;
2962 struct rndis_packet_msg *pkt;
2965 int pkt_hlen, pkt_size;
2967 pkt = txd->rndis_pkt;
2968 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
2969 if (pkt_size < txr->hn_chim_size) {
2970 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
2974 if (txr->hn_agg_txd != NULL)
2975 hn_flush_txagg(ifp, txr);
2978 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
2979 pkt->rm_len = m_head->m_pkthdr.len;
2980 pkt->rm_dataoffset = 0;
2981 pkt->rm_datalen = m_head->m_pkthdr.len;
2982 pkt->rm_oobdataoffset = 0;
2983 pkt->rm_oobdatalen = 0;
2984 pkt->rm_oobdataelements = 0;
2985 pkt->rm_pktinfooffset = sizeof(*pkt);
2986 pkt->rm_pktinfolen = 0;
2987 pkt->rm_vchandle = 0;
2988 pkt->rm_reserved = 0;
2990 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
2992 * Set the hash value for this packet, so that the host could
2993 * dispatch the TX done event for this packet back to this TX ring's channel.
2996 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2997 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
2998 *pi_data = txr->hn_tx_idx;
3001 if (m_head->m_flags & M_VLANTAG) {
3002 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3003 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3004 *pi_data = NDIS_VLAN_INFO_MAKE(
3005 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3006 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3007 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3010 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3011 #if defined(INET6) || defined(INET)
3012 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3013 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3015 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3016 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3017 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3018 m_head->m_pkthdr.tso_segsz);
3021 #if defined(INET6) && defined(INET)
3026 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3027 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3028 m_head->m_pkthdr.tso_segsz);
3031 #endif /* INET6 || INET */
3032 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3033 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3034 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3035 if (m_head->m_pkthdr.csum_flags &
3036 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3037 *pi_data = NDIS_TXCSUM_INFO_IPV6;
3039 *pi_data = NDIS_TXCSUM_INFO_IPV4;
3040 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3041 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
3044 if (m_head->m_pkthdr.csum_flags &
3045 (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3046 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3047 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3048 } else if (m_head->m_pkthdr.csum_flags &
3049 (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3050 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3051 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3055 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3056 /* Fixup RNDIS packet message total length */
3057 pkt->rm_len += pkt_hlen;
3058 /* Convert RNDIS packet message offsets */
3059 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3060 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3063 * Fast path: Chimney sending.
3066 struct hn_txdesc *tgt_txd = txd;
3068 if (txr->hn_agg_txd != NULL) {
3069 tgt_txd = txr->hn_agg_txd;
3075 KASSERT(pkt == chim,
3076 ("RNDIS pkt not in chimney sending buffer"));
3077 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3078 ("chimney sending buffer is not used"));
3079 tgt_txd->chim_size += pkt->rm_len;
3081 m_copydata(m_head, 0, m_head->m_pkthdr.len,
3082 ((uint8_t *)chim) + pkt_hlen);
3084 txr->hn_gpa_cnt = 0;
3085 txr->hn_sendpkt = hn_txpkt_chim;
3089 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3090 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3091 ("chimney buffer is used"));
3092 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3094 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3095 if (__predict_false(error)) {
3099 * This mbuf is not linked w/ the txd yet, so free it now.
3104 freed = hn_txdesc_put(txr, txd);
3106 ("fail to free txd upon txdma error"));
3108 txr->hn_txdma_failed++;
3109 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3114 /* +1 RNDIS packet message */
3115 txr->hn_gpa_cnt = nsegs + 1;
3117 /* send packet with page buffer */
3118 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3119 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3120 txr->hn_gpa[0].gpa_len = pkt_hlen;
3123 * Fill the page buffers with mbuf info after the page
3124 * buffer for RNDIS packet message.
3126 for (i = 0; i < nsegs; ++i) {
3127 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3129 gpa->gpa_page = atop(segs[i].ds_addr);
3130 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3131 gpa->gpa_len = segs[i].ds_len;
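/*
 * Worked example of the page/offset split above, assuming 4 KiB
 * pages: a segment at bus address 0x12345678 yields
 * gpa_page == atop(0x12345678) == 0x12345 and
 * gpa_ofs == (0x12345678 & PAGE_MASK) == 0x678.
 */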
3134 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3136 txr->hn_sendpkt = hn_txpkt_sglist;
3140 /* Set the completion routine */
3141 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3143 /* Update temporary stats for later use. */
3144 txr->hn_stat_pkts++;
3145 txr->hn_stat_size += m_head->m_pkthdr.len;
3146 if (m_head->m_flags & M_MCAST)
3147 txr->hn_stat_mcasts++;
3154 * If this function fails, then txd will be freed, but the mbuf
3155 * associated w/ the txd will _not_ be freed.
3158 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3160 int error, send_failed = 0, has_bpf;
3163 has_bpf = bpf_peers_present(ifp->if_bpf);
3166 * Make sure that this txd and any aggregated txds are not
3167 * freed before ETHER_BPF_MTAP.
3169 hn_txdesc_hold(txd);
3171 error = txr->hn_sendpkt(txr, txd);
3174 const struct hn_txdesc *tmp_txd;
3176 ETHER_BPF_MTAP(ifp, txd->m);
3177 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3178 ETHER_BPF_MTAP(ifp, tmp_txd->m);
3181 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3182 #ifdef HN_IFSTART_SUPPORT
3183 if (!hn_use_if_start)
3186 if_inc_counter(ifp, IFCOUNTER_OBYTES,
3188 if (txr->hn_stat_mcasts != 0) {
3189 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3190 txr->hn_stat_mcasts);
3193 txr->hn_pkts += txr->hn_stat_pkts;
3197 hn_txdesc_put(txr, txd);
3199 if (__predict_false(error)) {
3203 * This should "really rarely" happen.
3205 * XXX Too many RX to be acked or too many sideband
3206 * commands to run? Ask netvsc_channel_rollup()
3207 * to kick start later.
3209 txr->hn_has_txeof = 1;
3211 txr->hn_send_failed++;
3214 * Try sending again after setting hn_has_txeof,
3215 * in case we missed the last
3216 * netvsc_channel_rollup().
3220 if_printf(ifp, "send failed\n");
3223 * Caller will perform further processing on the
3224 * associated mbuf, so don't free it in hn_txdesc_put();
3225 * only unload it from the DMA map in hn_txdesc_put(), if it was loaded.
3229 freed = hn_txdesc_put(txr, txd);
3231 ("fail to free txd upon send error"));
3233 txr->hn_send_failed++;
3236 /* Reset temporary stats, after this sending is done. */
3237 txr->hn_stat_size = 0;
3238 txr->hn_stat_pkts = 0;
3239 txr->hn_stat_mcasts = 0;
3245 * Append the specified data to the indicated mbuf chain.
3246 * Extend the mbuf chain if the new data does not fit in existing space.
3249 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3250 * There should be an equivalent in the kernel mbuf code,
3251 * but there does not appear to be one yet.
3253 * Differs from m_append() in that additional mbufs are
3254 * allocated with cluster size MJUMPAGESIZE, and filled accordingly.
3257 * Return 1 if able to complete the job; otherwise 0.
3260 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3263 int remainder, space;
3265 for (m = m0; m->m_next != NULL; m = m->m_next)
3268 space = M_TRAILINGSPACE(m);
3271 * Copy into available space.
3273 if (space > remainder)
3275 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3280 while (remainder > 0) {
3282 * Allocate a new mbuf; could check space
3283 * and allocate a cluster instead.
3285 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3288 n->m_len = min(MJUMPAGESIZE, remainder);
3289 bcopy(cp, mtod(n, caddr_t), n->m_len);
3291 remainder -= n->m_len;
3295 if (m0->m_flags & M_PKTHDR)
3296 m0->m_pkthdr.len += len - remainder;
3298 return (remainder == 0);
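/*
 * Example usage, as on the RX path in hn_rxpkt() below: append dlen
 * bytes of channel data to a freshly allocated mbuf, growing the
 * chain with MJUMPAGESIZE clusters as needed:
 *
 *	hv_m_append(m_new, dlen, data);
 */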
3301 #if defined(INET) || defined(INET6)
3303 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3305 #if __FreeBSD_version >= 1100095
3306 if (hn_lro_mbufq_depth) {
3307 tcp_lro_queue_mbuf(lc, m);
3311 return tcp_lro_rx(lc, m, 0);
3316 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3317 const struct hn_rxinfo *info)
3319 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3321 int size, do_lro = 0, do_csum = 1, is_vf = 0;
3322 int hash_type = M_HASHTYPE_NONE;
3325 if (rxr->hn_rxvf_ifp != NULL) {
3327 * Non-transparent mode VF; pretend this packet is from the VF.
3330 ifp = rxr->hn_rxvf_ifp;
3332 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3333 /* Transparent mode VF. */
3337 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3340 * See the NOTE of hn_rndis_init_fixat(). This
3341 * function can be reached immediately after the
3342 * RNDIS is initialized but before the ifnet is
3343 * set up on the hn_attach() path; drop the unexpected packet.
3349 if (__predict_false(dlen < ETHER_HDR_LEN)) {
3350 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3354 if (dlen <= MHLEN) {
3355 m_new = m_gethdr(M_NOWAIT, MT_DATA);
3356 if (m_new == NULL) {
3357 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3360 memcpy(mtod(m_new, void *), data, dlen);
3361 m_new->m_pkthdr.len = m_new->m_len = dlen;
3362 rxr->hn_small_pkts++;
3365 * Get an mbuf with a cluster. For packets 2K or less,
3366 * get a standard 2K cluster. For anything larger, get a
3367 * 4K cluster. Any buffers larger than 4K can cause problems
3368 * if looped around to the Hyper-V TX channel, so avoid them.
3371 if (dlen > MCLBYTES) {
3373 size = MJUMPAGESIZE;
3376 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3377 if (m_new == NULL) {
3378 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3382 hv_m_append(m_new, dlen, data);
3384 m_new->m_pkthdr.rcvif = ifp;
3386 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3389 /* receive side checksum offload */
3390 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3391 /* IP csum offload */
3392 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3393 m_new->m_pkthdr.csum_flags |=
3394 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3398 /* TCP/UDP csum offload */
3399 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3400 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3401 m_new->m_pkthdr.csum_flags |=
3402 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3403 m_new->m_pkthdr.csum_data = 0xffff;
3404 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3412 * As of this writing (Oct 28th, 2016), the host side turns
3413 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3414 * the do_lro setting here is actually _not_ accurate. We
3415 * depend on the RSS hash type check to reset do_lro.
3417 if ((info->csum_info &
3418 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3419 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3422 const struct ether_header *eh;
3427 /* Checked at the beginning of this function. */
3428 KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
3430 eh = mtod(m_new, struct ether_header *);
3431 etype = ntohs(eh->ether_type);
3432 if (etype == ETHERTYPE_VLAN) {
3433 const struct ether_vlan_header *evl;
3435 hoff = sizeof(*evl);
3436 if (m_new->m_len < hoff)
3438 evl = mtod(m_new, struct ether_vlan_header *);
3439 etype = ntohs(evl->evl_proto);
3442 if (etype == ETHERTYPE_IP) {
3445 pr = hn_check_iplen(m_new, hoff);
3446 if (pr == IPPROTO_TCP) {
3448 (rxr->hn_trust_hcsum &
3449 HN_TRUST_HCSUM_TCP)) {
3450 rxr->hn_csum_trusted++;
3451 m_new->m_pkthdr.csum_flags |=
3452 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3453 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3454 m_new->m_pkthdr.csum_data = 0xffff;
3457 } else if (pr == IPPROTO_UDP) {
3459 (rxr->hn_trust_hcsum &
3460 HN_TRUST_HCSUM_UDP)) {
3461 rxr->hn_csum_trusted++;
3462 m_new->m_pkthdr.csum_flags |=
3463 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3464 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3465 m_new->m_pkthdr.csum_data = 0xffff;
3467 } else if (pr != IPPROTO_DONE && do_csum &&
3468 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3469 rxr->hn_csum_trusted++;
3470 m_new->m_pkthdr.csum_flags |=
3471 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3476 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3477 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3478 NDIS_VLAN_INFO_ID(info->vlan_info),
3479 NDIS_VLAN_INFO_PRI(info->vlan_info),
3480 NDIS_VLAN_INFO_CFI(info->vlan_info));
3481 m_new->m_flags |= M_VLANTAG;
3485 * If VF is activated (transparent/non-transparent mode does not
3486 * matter here), do _not_ perform LRO:
3490 * In transparent mode, hn(4) will only receive broadcast packets,
3491 * multicast packets, and TCP SYN and SYN|ACK (in Azure); LRO is
3492 * useless for these packet types.
3494 * For non-transparent mode, we definitely _cannot_ enable LRO at
3495 * all, since the LRO flush will use hn(4) as the receiving
3496 * interface; i.e. hn_ifp->if_input(hn_ifp, m).
3502 * If VF is activated (transparent/non-transparent mode does not
3503 * matter here), do _not_ mess with unsupported hash types or functions.
3506 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3508 m_new->m_pkthdr.flowid = info->hash_value;
3510 hash_type = M_HASHTYPE_OPAQUE;
3511 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3512 NDIS_HASH_FUNCTION_TOEPLITZ) {
3513 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3518 * do_lro is reset if the hash types are not TCP
3519 * related. See the comment in the above csum_flags setup.
3523 case NDIS_HASH_IPV4:
3524 hash_type = M_HASHTYPE_RSS_IPV4;
3528 case NDIS_HASH_TCP_IPV4:
3529 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3532 case NDIS_HASH_IPV6:
3533 hash_type = M_HASHTYPE_RSS_IPV6;
3537 case NDIS_HASH_IPV6_EX:
3538 hash_type = M_HASHTYPE_RSS_IPV6_EX;
3542 case NDIS_HASH_TCP_IPV6:
3543 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3546 case NDIS_HASH_TCP_IPV6_EX:
3547 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3551 } else if (!is_vf) {
3552 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3554 M_HASHTYPE_SET(m_new, hash_type);
3556 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3557 if (hn_ifp != ifp) {
3558 const struct ether_header *eh;
3561 * Non-transparent mode VF is activated.
3565 * Allow tapping on hn(4).
3567 ETHER_BPF_MTAP(hn_ifp, m_new);
3570 * Update hn(4)'s stats.
3572 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3573 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3574 /* Checked at the beginning of this function. */
3575 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3576 eh = mtod(m_new, struct ether_header *);
3577 if (ETHER_IS_MULTICAST(eh->ether_dhost))
3578 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3582 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3583 #if defined(INET) || defined(INET6)
3584 struct lro_ctrl *lro = &rxr->hn_lro;
3587 rxr->hn_lro_tried++;
3588 if (hn_lro_rx(lro, m_new) == 0) {
3595 ifp->if_input(ifp, m_new);
3601 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3603 struct hn_softc *sc = ifp->if_softc;
3604 struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3605 struct ifnet *vf_ifp;
3606 int mask, error = 0;
3607 struct ifrsskey *ifrk;
3608 struct ifrsshash *ifrh;
3613 if (ifr->ifr_mtu > HN_MTU_MAX) {
3620 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3625 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3626 /* Can't change MTU */
3632 if (ifp->if_mtu == ifr->ifr_mtu) {
3637 if (hn_xpnt_vf_isready(sc)) {
3638 vf_ifp = sc->hn_vf_ifp;
3640 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3641 sizeof(ifr_vf.ifr_name));
3642 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3646 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3647 vf_ifp->if_xname, ifr->ifr_mtu, error);
3653 * Suspend this interface before the synthetic parts are torn down.
3659 * Detach the synthetic parts, i.e. NVS and RNDIS.
3661 hn_synth_detach(sc);
3664 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3665 * with the new MTU setting.
3667 error = hn_synth_attach(sc, ifr->ifr_mtu);
3673 error = hn_rndis_get_mtu(sc, &mtu);
3676 else if (bootverbose)
3677 if_printf(ifp, "RNDIS mtu %u\n", mtu);
3680 * Commit the requested MTU, after the synthetic parts
3681 * have been successfully attached.
3683 if (mtu >= ifr->ifr_mtu) {
3686 if_printf(ifp, "fixup mtu %d -> %u\n",
3692 * Synthetic parts' reattach may change the chimney
3693 * sending size; update it.
3695 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3696 hn_set_chim_size(sc, sc->hn_chim_szmax);
3699 * Make sure that various parameters based on MTU are
3700 * still valid, after the MTU change.
3702 hn_mtu_change_fixup(sc);
3705 * All done! Resume the interface now.
3709 if ((sc->hn_flags & HN_FLAG_RXVF) ||
3710 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3712 * Since we have reattached the NVS part,
3713 * change the datapath to VF again, in case
3714 * it was lost when the NVS was detached.
3716 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3725 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3730 if (hn_xpnt_vf_isready(sc))
3731 hn_xpnt_vf_saveifflags(sc);
3733 if (ifp->if_flags & IFF_UP) {
3734 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3736 * Caller might hold a mutex, e.g.
3737 * bpf; use busy-wait for the RNDIS reply.
3741 hn_rxfilter_config(sc);
3744 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3745 error = hn_xpnt_vf_iocsetflags(sc);
3750 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3753 sc->hn_if_flags = ifp->if_flags;
3761 if (hn_xpnt_vf_isready(sc)) {
3763 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3764 sizeof(ifr_vf.ifr_name));
3765 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3771 * Fix up requested capabilities w/ supported capabilities,
3772 * since the supported capabilities could have been changed.
3774 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3777 if (mask & IFCAP_TXCSUM) {
3778 ifp->if_capenable ^= IFCAP_TXCSUM;
3779 if (ifp->if_capenable & IFCAP_TXCSUM)
3780 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3782 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3784 if (mask & IFCAP_TXCSUM_IPV6) {
3785 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3786 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3787 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3789 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3792 /* TODO: flip RNDIS offload parameters for RXCSUM. */
3793 if (mask & IFCAP_RXCSUM)
3794 ifp->if_capenable ^= IFCAP_RXCSUM;
3796 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
3797 if (mask & IFCAP_RXCSUM_IPV6)
3798 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3801 if (mask & IFCAP_LRO)
3802 ifp->if_capenable ^= IFCAP_LRO;
3804 if (mask & IFCAP_TSO4) {
3805 ifp->if_capenable ^= IFCAP_TSO4;
3806 if (ifp->if_capenable & IFCAP_TSO4)
3807 ifp->if_hwassist |= CSUM_IP_TSO;
3809 ifp->if_hwassist &= ~CSUM_IP_TSO;
3811 if (mask & IFCAP_TSO6) {
3812 ifp->if_capenable ^= IFCAP_TSO6;
3813 if (ifp->if_capenable & IFCAP_TSO6)
3814 ifp->if_hwassist |= CSUM_IP6_TSO;
3816 ifp->if_hwassist &= ~CSUM_IP6_TSO;
3826 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3830 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3832 * Multicast uses a mutex; use busy-wait for the RNDIS reply.
3836 hn_rxfilter_config(sc);
3840 /* XXX vlan(4) style mcast addr maintenance */
3841 if (hn_xpnt_vf_isready(sc)) {
3844 old_if_flags = sc->hn_vf_ifp->if_flags;
3845 hn_xpnt_vf_saveifflags(sc);
3847 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3848 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3850 error = hn_xpnt_vf_iocsetflags(sc);
3859 if (hn_xpnt_vf_isready(sc)) {
3861 * SIOCGIFMEDIA expects ifmediareq, so don't
3862 * create and pass ifr_vf to the VF here; just
3863 * replace the ifr_name.
3865 vf_ifp = sc->hn_vf_ifp;
3866 strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3867 sizeof(ifr->ifr_name));
3868 error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3869 /* Restore the ifr_name. */
3870 strlcpy(ifr->ifr_name, ifp->if_xname,
3871 sizeof(ifr->ifr_name));
3876 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3879 case SIOCGIFRSSHASH:
3880 ifrh = (struct ifrsshash *)data;
3882 if (sc->hn_rx_ring_inuse == 1) {
3884 ifrh->ifrh_func = RSS_FUNC_NONE;
3885 ifrh->ifrh_types = 0;
3889 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3890 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3892 ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3893 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3898 ifrk = (struct ifrsskey *)data;
3900 if (sc->hn_rx_ring_inuse == 1) {
3902 ifrk->ifrk_func = RSS_FUNC_NONE;
3903 ifrk->ifrk_keylen = 0;
3906 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3907 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3909 ifrk->ifrk_func = RSS_FUNC_PRIVATE;
3910 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
3911 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
3912 NDIS_HASH_KEYSIZE_TOEPLITZ);
3917 error = ether_ioctl(ifp, cmd, data);
3924 hn_stop(struct hn_softc *sc, bool detaching)
3926 struct ifnet *ifp = sc->hn_ifp;
3931 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
3932 ("synthetic parts were not attached"));
3934 /* Clear RUNNING bit ASAP. */
3935 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3937 /* Disable polling. */
3940 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
3941 KASSERT(sc->hn_vf_ifp != NULL,
3942 ("%s: VF is not attached", ifp->if_xname));
3944 /* Mark transparent mode VF as disabled. */
3945 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
3949 * Datapath setting must happen _before_ bringing the VF down.
3952 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
3955 * Bring the VF down.
3957 hn_xpnt_vf_saveifflags(sc);
3958 sc->hn_vf_ifp->if_flags &= ~IFF_UP;
3959 hn_xpnt_vf_iocsetflags(sc);
3962 /* Suspend data transfers. */
3963 hn_suspend_data(sc);
3965 /* Clear OACTIVE bit. */
3966 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3967 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3968 sc->hn_tx_ring[i].hn_oactive = 0;
3971 * If the non-transparent mode VF is active, make sure
3972 * that the RX filter still allows packet reception.
3974 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
3975 hn_rxfilter_config(sc);
3979 hn_init_locked(struct hn_softc *sc)
3981 struct ifnet *ifp = sc->hn_ifp;
3986 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
3989 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3992 /* Configure RX filter */
3993 hn_rxfilter_config(sc);
3995 /* Clear OACTIVE bit. */
3996 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3997 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3998 sc->hn_tx_ring[i].hn_oactive = 0;
4000 /* Clear TX 'suspended' bit. */
4001 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4003 if (hn_xpnt_vf_isready(sc)) {
4004 /* Initialize transparent VF. */
4005 hn_xpnt_vf_init(sc);
4008 /* Everything is ready; unleash! */
4009 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4011 /* Re-enable polling if requested. */
4012 if (sc->hn_pollhz > 0)
4013 hn_polling(sc, sc->hn_pollhz);
4019 struct hn_softc *sc = xsc;
4026 #if __FreeBSD_version >= 1100099
4029 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4031 struct hn_softc *sc = arg1;
4032 unsigned int lenlim;
4035 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4036 error = sysctl_handle_int(oidp, &lenlim, 0, req);
4037 if (error || req->newptr == NULL)
4041 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4042 lenlim > TCP_LRO_LENGTH_MAX) {
4046 hn_set_lro_lenlim(sc, lenlim);
4053 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4055 struct hn_softc *sc = arg1;
4056 int ackcnt, error, i;
4059 * lro_ackcnt_lim is the append count limit;
4060 * +1 turns it into an aggregation limit.
4062 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4063 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4064 if (error || req->newptr == NULL)
4067 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4071 * Convert the aggregation limit back to an append count limit.
4076 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4077 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4085 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4087 struct hn_softc *sc = arg1;
4092 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4095 error = sysctl_handle_int(oidp, &on, 0, req);
4096 if (error || req->newptr == NULL)
4100 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4101 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4104 rxr->hn_trust_hcsum |= hcsum;
4106 rxr->hn_trust_hcsum &= ~hcsum;
4113 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4115 struct hn_softc *sc = arg1;
4116 int chim_size, error;
4118 chim_size = sc->hn_tx_ring[0].hn_chim_size;
4119 error = sysctl_handle_int(oidp, &chim_size, 0, req);
4120 if (error || req->newptr == NULL)
4123 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4127 hn_set_chim_size(sc, chim_size);
4132 #if __FreeBSD_version < 1100095
4134 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4136 struct hn_softc *sc = arg1;
4137 int ofs = arg2, i, error;
4138 struct hn_rx_ring *rxr;
4142 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
4143 rxr = &sc->hn_rx_ring[i];
4144 stat += *((int *)((uint8_t *)rxr + ofs));
4147 error = sysctl_handle_64(oidp, &stat, 0, req);
4148 if (error || req->newptr == NULL)
4151 /* Zero out this stat. */
4152 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
4153 rxr = &sc->hn_rx_ring[i];
4154 *((int *)((uint8_t *)rxr + ofs)) = 0;
4160 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4162 struct hn_softc *sc = arg1;
4163 int ofs = arg2, i, error;
4164 struct hn_rx_ring *rxr;
4168 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4169 rxr = &sc->hn_rx_ring[i];
4170 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4173 error = sysctl_handle_64(oidp, &stat, 0, req);
4174 if (error || req->newptr == NULL)
4177 /* Zero out this stat. */
4178 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4179 rxr = &sc->hn_rx_ring[i];
4180 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4188 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4190 struct hn_softc *sc = arg1;
4191 int ofs = arg2, i, error;
4192 struct hn_rx_ring *rxr;
4196 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4197 rxr = &sc->hn_rx_ring[i];
4198 stat += *((u_long *)((uint8_t *)rxr + ofs));
4201 error = sysctl_handle_long(oidp, &stat, 0, req);
4202 if (error || req->newptr == NULL)
4205 /* Zero out this stat. */
4206 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4207 rxr = &sc->hn_rx_ring[i];
4208 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
4214 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4216 struct hn_softc *sc = arg1;
4217 int ofs = arg2, i, error;
4218 struct hn_tx_ring *txr;
4222 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4223 txr = &sc->hn_tx_ring[i];
4224 stat += *((u_long *)((uint8_t *)txr + ofs));
4227 error = sysctl_handle_long(oidp, &stat, 0, req);
4228 if (error || req->newptr == NULL)
4231 /* Zero out this stat. */
4232 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4233 txr = &sc->hn_tx_ring[i];
4234 *((u_long *)((uint8_t *)txr + ofs)) = 0;
4240 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4242 struct hn_softc *sc = arg1;
4243 int ofs = arg2, i, error, conf;
4244 struct hn_tx_ring *txr;
4246 txr = &sc->hn_tx_ring[0];
4247 conf = *((int *)((uint8_t *)txr + ofs));
4249 error = sysctl_handle_int(oidp, &conf, 0, req);
4250 if (error || req->newptr == NULL)
4254 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4255 txr = &sc->hn_tx_ring[i];
4256 *((int *)((uint8_t *)txr + ofs)) = conf;
4264 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4266 struct hn_softc *sc = arg1;
4269 size = sc->hn_agg_size;
4270 error = sysctl_handle_int(oidp, &size, 0, req);
4271 if (error || req->newptr == NULL)
4275 sc->hn_agg_size = size;
4283 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4285 struct hn_softc *sc = arg1;
4288 pkts = sc->hn_agg_pkts;
4289 error = sysctl_handle_int(oidp, &pkts, 0, req);
4290 if (error || req->newptr == NULL)
4294 sc->hn_agg_pkts = pkts;
4302 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4304 struct hn_softc *sc = arg1;
4307 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4308 return (sysctl_handle_int(oidp, &pkts, 0, req));
4312 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4314 struct hn_softc *sc = arg1;
4317 align = sc->hn_tx_ring[0].hn_agg_align;
4318 return (sysctl_handle_int(oidp, &align, 0, req));
4322 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4325 vmbus_chan_poll_disable(chan);
4327 vmbus_chan_poll_enable(chan, pollhz);
4331 hn_polling(struct hn_softc *sc, u_int pollhz)
4333 int nsubch = sc->hn_rx_ring_inuse - 1;
4338 struct vmbus_channel **subch;
4341 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4342 for (i = 0; i < nsubch; ++i)
4343 hn_chan_polling(subch[i], pollhz);
4344 vmbus_subchan_rel(subch, nsubch);
4346 hn_chan_polling(sc->hn_prichan, pollhz);
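/*
 * Example, assuming unit 0: the handler below backs the
 * dev.hn.0.polling sysctl created in hn_attach(); writing a value in
 * [100, 1000000] switches the channels to polling at that frequency,
 * while writing 0 reverts to interrupt-driven operation.
 */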
4350 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4352 struct hn_softc *sc = arg1;
4355 pollhz = sc->hn_pollhz;
4356 error = sysctl_handle_int(oidp, &pollhz, 0, req);
4357 if (error || req->newptr == NULL)
4361 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4365 if (sc->hn_pollhz != pollhz) {
4366 sc->hn_pollhz = pollhz;
4367 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4368 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4369 hn_polling(sc, sc->hn_pollhz);
4377 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4379 struct hn_softc *sc = arg1;
4382 snprintf(verstr, sizeof(verstr), "%u.%u",
4383 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4384 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4385 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4389 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4391 struct hn_softc *sc = arg1;
4398 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4399 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4403 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4405 struct hn_softc *sc = arg1;
4406 char assist_str[128];
4410 hwassist = sc->hn_ifp->if_hwassist;
4412 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4413 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4417 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4419 struct hn_softc *sc = arg1;
4420 char filter_str[128];
4424 filter = sc->hn_rx_filter;
4426 snprintf(filter_str, sizeof(filter_str), "%b", filter,
4428 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4432 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4434 struct hn_softc *sc = arg1;
4439 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4440 if (error || req->newptr == NULL)
4443 if ((sc->hn_flags & HN_FLAG_RXVF) ||
4444 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4446 * RSS key is synchronized w/ the VF's; don't allow users to change it.
4453 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4456 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4458 if (sc->hn_rx_ring_inuse > 1) {
4459 error = hn_rss_reconfig(sc);
4461 /* Not RSS capable, at least for now; just save the RSS key. */
4470 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4472 struct hn_softc *sc = arg1;
4477 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4478 if (error || req->newptr == NULL)
4482 * Don't allow RSS indirect table changes if this interface is
4483 * not currently RSS capable.
4485 if (sc->hn_rx_ring_inuse == 1) {
4490 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4493 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4495 hn_rss_ind_fixup(sc);
4496 error = hn_rss_reconfig(sc);
4503 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4505 struct hn_softc *sc = arg1;
4510 hash = sc->hn_rss_hash;
4512 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4513 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4517 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4519 struct hn_softc *sc = arg1;
4524 hash = sc->hn_rss_hcap;
4526 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4527 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4531 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4533 struct hn_softc *sc = arg1;
4538 hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4540 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4541 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4545 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4547 struct hn_softc *sc = arg1;
4548 char vf_name[IFNAMSIZ + 1];
4549 struct ifnet *vf_ifp;
4553 vf_ifp = sc->hn_vf_ifp;
4555 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4557 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4561 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4563 struct hn_softc *sc = arg1;
4564 char vf_name[IFNAMSIZ + 1];
4565 struct ifnet *vf_ifp;
4569 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4571 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4573 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4577 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4579 struct rm_priotracker pt;
4584 error = sysctl_wire_old_buffer(req, 0);
4588 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4592 rm_rlock(&hn_vfmap_lock, &pt);
4595 for (i = 0; i < hn_vfmap_size; ++i) {
4598 if (hn_vfmap[i] == NULL)
4601 ifp = ifnet_byindex(i);
4604 sbuf_printf(sb, "%s", ifp->if_xname);
4606 sbuf_printf(sb, " %s", ifp->if_xname);
4611 rm_runlock(&hn_vfmap_lock, &pt);
4613 error = sbuf_finish(sb);
4619 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4621 struct rm_priotracker pt;
4626 error = sysctl_wire_old_buffer(req, 0);
4630 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4634 rm_rlock(&hn_vfmap_lock, &pt);
4637 for (i = 0; i < hn_vfmap_size; ++i) {
4638 struct ifnet *ifp, *hn_ifp;
4640 hn_ifp = hn_vfmap[i];
4644 ifp = ifnet_byindex(i);
4647 sbuf_printf(sb, "%s:%s", ifp->if_xname,
4650 sbuf_printf(sb, " %s:%s", ifp->if_xname,
4657 rm_runlock(&hn_vfmap_lock, &pt);
4659 error = sbuf_finish(sb);
4665 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4667 struct hn_softc *sc = arg1;
4668 int error, onoff = 0;
4670 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4672 error = sysctl_handle_int(oidp, &onoff, 0, req);
4673 if (error || req->newptr == NULL)
4677 /* NOTE: hn_vf_lock for hn_transmit() */
4678 rm_wlock(&sc->hn_vf_lock);
4680 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4682 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4683 rm_wunlock(&sc->hn_vf_lock);
4690 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4692 struct hn_softc *sc = arg1;
4695 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4697 return (sysctl_handle_int(oidp, &enabled, 0, req));
4701 hn_check_iplen(const struct mbuf *m, int hoff)
4703 const struct ip *ip;
4704 int len, iphlen, iplen;
4705 const struct tcphdr *th;
4706 int thoff; /* TCP data offset */
4708 len = hoff + sizeof(struct ip);
4710 /* The packet must be at least the size of an IP header. */
4711 if (m->m_pkthdr.len < len)
4712 return IPPROTO_DONE;
4714 /* The fixed IP header must reside completely in the first mbuf. */
4716 return IPPROTO_DONE;
4718 ip = mtodo(m, hoff);
4720 /* Bound check the packet's stated IP header length. */
4721 iphlen = ip->ip_hl << 2;
4722 if (iphlen < sizeof(struct ip)) /* minimum header length */
4723 return IPPROTO_DONE;
4725 /* The full IP header must reside completely in the one mbuf. */
4726 if (m->m_len < hoff + iphlen)
4727 return IPPROTO_DONE;
4729 iplen = ntohs(ip->ip_len);
4732 * Check that the amount of data in the buffers is at
4733 * least as much as the IP header would have us expect.
4735 if (m->m_pkthdr.len < hoff + iplen)
4736 return IPPROTO_DONE;
4739 * Ignore IP fragments.
4741 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4742 return IPPROTO_DONE;
4745 * The TCP/IP or UDP/IP header must be entirely contained within
4746 * the first fragment of a packet.
4750 if (iplen < iphlen + sizeof(struct tcphdr))
4751 return IPPROTO_DONE;
4752 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4753 return IPPROTO_DONE;
4754 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4755 thoff = th->th_off << 2;
4756 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4757 return IPPROTO_DONE;
4758 if (m->m_len < hoff + iphlen + thoff)
4759 return IPPROTO_DONE;
4762 if (iplen < iphlen + sizeof(struct udphdr))
4763 return IPPROTO_DONE;
4764 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4765 return IPPROTO_DONE;
4769 return IPPROTO_DONE;
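/*
 * Example usage, mirroring the RX checksum-trust path in hn_rxpkt()
 * above; hoff is the Ethernet/VLAN header length, and IPPROTO_DONE
 * means the headers could not be validated:
 *
 *	pr = hn_check_iplen(m_new, hoff);
 *	if (pr == IPPROTO_TCP) {
 *		... trust the host-verified TCP checksum ...
 *	}
 */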
4776 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4778 struct sysctl_oid_list *child;
4779 struct sysctl_ctx_list *ctx;
4780 device_t dev = sc->hn_dev;
4781 #if defined(INET) || defined(INET6)
4782 #if __FreeBSD_version >= 1100095
4789 * Create RXBUF for reception.
4792 * - It is shared by all channels.
4793 * - A large enough buffer is allocated; certain versions of NVS
4794 * may further limit the usable space.
4796 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4797 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4798 BUS_DMA_WAITOK | BUS_DMA_ZERO);
4799 if (sc->hn_rxbuf == NULL) {
4800 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4804 sc->hn_rx_ring_cnt = ring_cnt;
4805 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4807 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4808 M_DEVBUF, M_WAITOK | M_ZERO);
4810 #if defined(INET) || defined(INET6)
4811 #if __FreeBSD_version >= 1100095
4812 lroent_cnt = hn_lro_entry_count;
4813 if (lroent_cnt < TCP_LRO_ENTRIES)
4814 lroent_cnt = TCP_LRO_ENTRIES;
4816 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4818 #endif /* INET || INET6 */
4820 ctx = device_get_sysctl_ctx(dev);
4821 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4823 /* Create dev.hn.UNIT.rx sysctl tree */
4824 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4825 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4827 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4828 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4830 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4831 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4832 &rxr->hn_br_dma, BUS_DMA_WAITOK);
4833 if (rxr->hn_br == NULL) {
4834 device_printf(dev, "allocate bufring failed\n");
4838 if (hn_trust_hosttcp)
4839 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4840 if (hn_trust_hostudp)
4841 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4842 if (hn_trust_hostip)
4843 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4844 rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4845 rxr->hn_ifp = sc->hn_ifp;
4846 if (i < sc->hn_tx_ring_cnt)
4847 rxr->hn_txr = &sc->hn_tx_ring[i];
4848 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4849 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4851 rxr->hn_rxbuf = sc->hn_rxbuf;
4856 #if defined(INET) || defined(INET6)
4857 #if __FreeBSD_version >= 1100095
4858 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4859 hn_lro_mbufq_depth);
4860 #else
4861 tcp_lro_init(&rxr->hn_lro);
4862 rxr->hn_lro.ifp = sc->hn_ifp;
4863 #endif
4864 #if __FreeBSD_version >= 1100099
4865 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4866 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4867 #endif
4868 #endif /* INET || INET6 */
4870 if (sc->hn_rx_sysctl_tree != NULL) {
4874 * Create per RX ring sysctl tree:
4875 * dev.hn.UNIT.rx.RINGID
4877 snprintf(name, sizeof(name), "%d", i);
4878 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
4879 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
4880 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4882 if (rxr->hn_rx_sysctl_tree != NULL) {
4883 SYSCTL_ADD_ULONG(ctx,
4884 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4885 OID_AUTO, "packets", CTLFLAG_RW,
4886 &rxr->hn_pkts, "# of packets received");
4887 SYSCTL_ADD_ULONG(ctx,
4888 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4889 OID_AUTO, "rss_pkts", CTLFLAG_RW,
4891 "# of packets w/ RSS info received");
4893 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4894 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
4895 &rxr->hn_pktbuf_len, 0,
4896 "Temporary channel packet buffer length");
4901 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
4902 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4903 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
4904 #if __FreeBSD_version < 1100095
4905 hn_rx_stat_int_sysctl,
4906 #else
4907 hn_rx_stat_u64_sysctl,
4908 #endif
4909 "LU", "LRO queued");
4910 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
4911 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4912 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
4913 #if __FreeBSD_version < 1100095
4914 hn_rx_stat_int_sysctl,
4915 #else
4916 hn_rx_stat_u64_sysctl,
4917 #endif
4918 "LU", "LRO flushed");
4919 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
4920 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4921 __offsetof(struct hn_rx_ring, hn_lro_tried),
4922 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
4923 #if __FreeBSD_version >= 1100099
4924 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
4925 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4926 hn_lro_lenlim_sysctl, "IU",
4927 "Max # of data bytes to be aggregated by LRO");
4928 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
4929 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4930 hn_lro_ackcnt_sysctl, "I",
4931 "Max # of ACKs to be aggregated by LRO");
4933 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
4934 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
4935 hn_trust_hcsum_sysctl, "I",
4936 "Trust tcp segement verification on host side, "
4937 "when csum info is missing");
4938 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
4939 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
4940 hn_trust_hcsum_sysctl, "I",
4941 "Trust udp datagram verification on host side, "
4942 "when csum info is missing");
4943 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
4944 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
4945 hn_trust_hcsum_sysctl, "I",
4946 "Trust ip packet verification on host side, "
4947 "when csum info is missing");
4948 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
4949 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4950 __offsetof(struct hn_rx_ring, hn_csum_ip),
4951 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
4952 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
4953 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4954 __offsetof(struct hn_rx_ring, hn_csum_tcp),
4955 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
4956 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
4957 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4958 __offsetof(struct hn_rx_ring, hn_csum_udp),
4959 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
4960 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
4961 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4962 __offsetof(struct hn_rx_ring, hn_csum_trusted),
4963 hn_rx_stat_ulong_sysctl, "LU",
4964 "# of packets that we trust host's csum verification");
4965 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
4966 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4967 __offsetof(struct hn_rx_ring, hn_small_pkts),
4968 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
4969 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
4970 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4971 __offsetof(struct hn_rx_ring, hn_ack_failed),
4972 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
4973 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
4974 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
4975 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
4976 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
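/*
 * Minimal userland sketch (example only, not driver code) showing how the
 * per-ring statistics exported above can be read with sysctl(3); the node
 * name assumes unit 0, ring 0.
 */
#if 0	/* userland example */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	unsigned long pkts;
	size_t len = sizeof(pkts);

	/* dev.hn.UNIT.rx.RINGID.packets, created by hn_create_rx_data(). */
	if (sysctlbyname("dev.hn.0.rx.0.packets", &pkts, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("hn0 RX ring0 packets: %lu\n", pkts);
	return (0);
}
#endif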
4982 hn_destroy_rx_data(struct hn_softc *sc)
4986 if (sc->hn_rxbuf != NULL) {
4987 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
4988 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
4990 device_printf(sc->hn_dev, "RXBUF is referenced\n");
4991 sc->hn_rxbuf = NULL;
4994 if (sc->hn_rx_ring_cnt == 0)
4997 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4998 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5000 if (rxr->hn_br == NULL)
5001 continue;
5002 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5003 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5005 device_printf(sc->hn_dev,
5006 "%dth channel bufring is referenced", i);
5010 #if defined(INET) || defined(INET6)
5011 tcp_lro_free(&rxr->hn_lro);
5013 free(rxr->hn_pktbuf, M_DEVBUF);
5015 free(sc->hn_rx_ring, M_DEVBUF);
5016 sc->hn_rx_ring = NULL;
5018 sc->hn_rx_ring_cnt = 0;
5019 sc->hn_rx_ring_inuse = 0;
5023 hn_tx_ring_create(struct hn_softc *sc, int id)
5025 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5026 device_t dev = sc->hn_dev;
5027 bus_dma_tag_t parent_dtag;
5031 txr->hn_tx_idx = id;
5033 #ifndef HN_USE_TXDESC_BUFRING
5034 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5036 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5038 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5039 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5040 M_DEVBUF, M_WAITOK | M_ZERO);
5041 #ifndef HN_USE_TXDESC_BUFRING
5042 SLIST_INIT(&txr->hn_txlist);
5044 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5045 M_WAITOK, &txr->hn_tx_lock);
5048 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5049 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5050 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5051 } else {
5052 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5053 }
5055 #ifdef HN_IFSTART_SUPPORT
5056 if (hn_use_if_start) {
5057 txr->hn_txeof = hn_start_txeof;
5058 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5059 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5065 txr->hn_txeof = hn_xmit_txeof;
5066 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5067 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5069 br_depth = hn_get_txswq_depth(txr);
5070 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5071 M_WAITOK, &txr->hn_tx_lock);
5074 txr->hn_direct_tx_size = hn_direct_tx_size;
5077 * Always schedule transmission instead of trying to do direct
5078 * transmission. This one gives the best performance so far.
5080 txr->hn_sched_tx = 1;
5082 parent_dtag = bus_get_dma_tag(dev);
5084 /* DMA tag for RNDIS packet messages. */
5085 error = bus_dma_tag_create(parent_dtag, /* parent */
5086 HN_RNDIS_PKT_ALIGN, /* alignment */
5087 HN_RNDIS_PKT_BOUNDARY, /* boundary */
5088 BUS_SPACE_MAXADDR, /* lowaddr */
5089 BUS_SPACE_MAXADDR, /* highaddr */
5090 NULL, NULL, /* filter, filterarg */
5091 HN_RNDIS_PKT_LEN, /* maxsize */
5093 HN_RNDIS_PKT_LEN, /* maxsegsize */
5095 NULL, /* lockfunc */
5096 NULL, /* lockfuncarg */
5097 &txr->hn_tx_rndis_dtag);
5099 device_printf(dev, "failed to create rndis dmatag\n");
5103 /* DMA tag for data. */
5104 error = bus_dma_tag_create(parent_dtag, /* parent */
5106 HN_TX_DATA_BOUNDARY, /* boundary */
5107 BUS_SPACE_MAXADDR, /* lowaddr */
5108 BUS_SPACE_MAXADDR, /* highaddr */
5109 NULL, NULL, /* filter, filterarg */
5110 HN_TX_DATA_MAXSIZE, /* maxsize */
5111 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
5112 HN_TX_DATA_SEGSIZE, /* maxsegsize */
5114 NULL, /* lockfunc */
5115 NULL, /* lockfuncarg */
5116 &txr->hn_tx_data_dtag);
5118 device_printf(dev, "failed to create data dmatag\n");
5122 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5123 struct hn_txdesc *txd = &txr->hn_txdesc[i];
5126 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5127 STAILQ_INIT(&txd->agg_list);
5130 * Allocate and load RNDIS packet message.
5132 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5133 (void **)&txd->rndis_pkt,
5134 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5135 &txd->rndis_pkt_dmap);
5138 "failed to allocate rndis_packet_msg, %d\n", i);
5142 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5143 txd->rndis_pkt_dmap,
5144 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5145 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5149 "failed to load rndis_packet_msg, %d\n", i);
5150 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5151 txd->rndis_pkt, txd->rndis_pkt_dmap);
5155 /* DMA map for TX data. */
5156 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5160 "failed to allocate tx data dmamap\n");
5161 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5162 txd->rndis_pkt_dmap);
5163 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5164 txd->rndis_pkt, txd->rndis_pkt_dmap);
5168 /* All set, put it to list */
5169 txd->flags |= HN_TXD_FLAG_ONLIST;
5170 #ifndef HN_USE_TXDESC_BUFRING
5171 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5173 buf_ring_enqueue(txr->hn_txdesc_br, txd);
5176 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5178 if (sc->hn_tx_sysctl_tree != NULL) {
5179 struct sysctl_oid_list *child;
5180 struct sysctl_ctx_list *ctx;
5184 * Create per TX ring sysctl tree:
5185 * dev.hn.UNIT.tx.RINGID
5187 ctx = device_get_sysctl_ctx(dev);
5188 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5190 snprintf(name, sizeof(name), "%d", id);
5191 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5192 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5194 if (txr->hn_tx_sysctl_tree != NULL) {
5195 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5198 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5199 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5200 "# of available TX descs");
5202 #ifdef HN_IFSTART_SUPPORT
5203 if (!hn_use_if_start)
5206 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5207 CTLFLAG_RD, &txr->hn_oactive, 0,
5210 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5211 CTLFLAG_RW, &txr->hn_pkts,
5212 "# of packets transmitted");
5213 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5214 CTLFLAG_RW, &txr->hn_sends, "# of sends");
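/*
 * Illustrative sketch (example only): roughly what the hn_txdesc_get()
 * helper referenced by the transmit paths below does when
 * HN_USE_TXDESC_BUFRING is defined -- pop a prepared descriptor off the
 * free-descriptor bufring populated above.  The real helper additionally
 * maintains hn_txdesc_avail and asserts the descriptor state.
 */
#if 0	/* example only */
static struct hn_txdesc *
hn_txdesc_get_sketch(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
	if (txd != NULL) {
		/* Off the free list until the send-done completion. */
		txd->flags &= ~HN_TXD_FLAG_ONLIST;
		txd->refs = 1;
	}
	return (txd);
}
#endif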
5222 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5224 struct hn_tx_ring *txr = txd->txr;
5226 KASSERT(txd->m == NULL, ("still has mbuf installed"));
5227 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5229 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5230 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5231 txd->rndis_pkt_dmap);
5232 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5236 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5239 KASSERT(txd->refs == 0 || txd->refs == 1,
5240 ("invalid txd refs %d", txd->refs));
5242 /* Aggregated txds will be freed by their aggregating txd. */
5243 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5246 freed = hn_txdesc_put(txr, txd);
5247 KASSERT(freed, ("can't free txdesc"));
5252 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5256 if (txr->hn_txdesc == NULL)
5261 * Because the freeing of aggregated txds will be deferred
5262 * to the aggregating txd, two passes are used here:
5263 * - The first pass GCes any pending txds. This GC is necessary,
5264 * since if the channels are revoked, hypervisor will not
5265 * deliver send-done for all pending txds.
5266 * - The second pass frees the busdma resources, i.e. after all txds
5269 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5270 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5271 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5272 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5274 if (txr->hn_tx_data_dtag != NULL)
5275 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5276 if (txr->hn_tx_rndis_dtag != NULL)
5277 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5279 #ifdef HN_USE_TXDESC_BUFRING
5280 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5283 free(txr->hn_txdesc, M_DEVBUF);
5284 txr->hn_txdesc = NULL;
5286 if (txr->hn_mbuf_br != NULL)
5287 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5289 #ifndef HN_USE_TXDESC_BUFRING
5290 mtx_destroy(&txr->hn_txlist_spin);
5292 mtx_destroy(&txr->hn_tx_lock);
5296 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5298 struct sysctl_oid_list *child;
5299 struct sysctl_ctx_list *ctx;
5303 * Create TXBUF for chimney sending.
5305 * NOTE: It is shared by all channels.
5307 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5308 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5309 BUS_DMA_WAITOK | BUS_DMA_ZERO);
5310 if (sc->hn_chim == NULL) {
5311 device_printf(sc->hn_dev, "allocate txbuf failed\n");
5315 sc->hn_tx_ring_cnt = ring_cnt;
5316 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5318 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5319 M_DEVBUF, M_WAITOK | M_ZERO);
5321 ctx = device_get_sysctl_ctx(sc->hn_dev);
5322 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5324 /* Create dev.hn.UNIT.tx sysctl tree */
5325 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5326 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5328 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5331 error = hn_tx_ring_create(sc, i);
5336 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5337 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5338 __offsetof(struct hn_tx_ring, hn_no_txdescs),
5339 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5340 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5341 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5342 __offsetof(struct hn_tx_ring, hn_send_failed),
5343 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
5344 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5345 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5346 __offsetof(struct hn_tx_ring, hn_txdma_failed),
5347 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
5348 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5349 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5350 __offsetof(struct hn_tx_ring, hn_flush_failed),
5351 hn_tx_stat_ulong_sysctl, "LU",
5352 "# of packet transmission aggregation flush failure");
5353 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5354 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5355 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5356 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbufs collapsed");
5357 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5358 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5359 __offsetof(struct hn_tx_ring, hn_tx_chimney),
5360 hn_tx_stat_ulong_sysctl, "LU", "# of chimney sends");
5361 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5362 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5363 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5364 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5365 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5366 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5367 "# of total TX descs");
5368 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5369 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5370 "Chimney send packet size upper boundary");
5371 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5372 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5373 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5374 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5375 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5376 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5377 hn_tx_conf_int_sysctl, "I",
5378 "Size of the packet for direct transmission");
5379 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5380 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5381 __offsetof(struct hn_tx_ring, hn_sched_tx),
5382 hn_tx_conf_int_sysctl, "I",
5383 "Always schedule transmission "
5384 "instead of doing direct transmission");
5385 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5386 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5387 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5388 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5389 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5390 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5391 "Applied packet transmission aggregation size");
5392 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5393 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5394 hn_txagg_pktmax_sysctl, "I",
5395 "Applied packet transmission aggregation packets");
5396 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5397 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5398 hn_txagg_align_sysctl, "I",
5399 "Applied packet transmission aggregation alignment");
5405 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5409 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5410 sc->hn_tx_ring[i].hn_chim_size = chim_size;
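/*
 * Sketch (example only) of the decision hn_chim_size configures: packets
 * no larger than the per-ring chimney size limit are copied into the
 * shared chimney TXBUF instead of being DMA-mapped for scatter/gather
 * transmission.  The helper name is hypothetical.
 */
#if 0	/* example only */
static bool
hn_pkt_fits_chimney_sketch(const struct hn_tx_ring *txr, const struct mbuf *m)
{
	return (txr->hn_chim_size > 0 &&
	    m->m_pkthdr.len <= txr->hn_chim_size);
}
#endif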
5414 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5416 struct ifnet *ifp = sc->hn_ifp;
5422 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5425 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5426 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5427 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5429 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5430 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5431 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5433 if (tso_maxlen < tso_minlen)
5434 tso_maxlen = tso_minlen;
5435 else if (tso_maxlen > IP_MAXPACKET)
5436 tso_maxlen = IP_MAXPACKET;
5437 if (tso_maxlen > sc->hn_ndis_tso_szmax)
5438 tso_maxlen = sc->hn_ndis_tso_szmax;
5439 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5441 if (hn_xpnt_vf_isready(sc)) {
5442 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5443 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5445 ifp->if_hw_tsomax = hw_tsomax;
5447 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
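/*
 * Worked example of the clamping above (illustrative numbers): with
 * mtu = 1500 and hn_ndis_tso_sgmin = 2, tso_minlen = 3000.  A requested
 * tso_maxlen of 70000 is clamped to IP_MAXPACKET (65535), then capped by
 * hn_ndis_tso_szmax, and finally reduced by ETHER_HDR_LEN +
 * ETHER_VLAN_ENCAP_LEN (14 + 4 = 18 bytes) to form if_hw_tsomax.
 */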
5451 hn_fixup_tx_data(struct hn_softc *sc)
5453 uint64_t csum_assist;
5456 hn_set_chim_size(sc, sc->hn_chim_szmax);
5457 if (hn_tx_chimney_size > 0 &&
5458 hn_tx_chimney_size < sc->hn_chim_szmax)
5459 hn_set_chim_size(sc, hn_tx_chimney_size);
5462 if (sc->hn_caps & HN_CAP_IPCS)
5463 csum_assist |= CSUM_IP;
5464 if (sc->hn_caps & HN_CAP_TCP4CS)
5465 csum_assist |= CSUM_IP_TCP;
5466 if (sc->hn_caps & HN_CAP_UDP4CS)
5467 csum_assist |= CSUM_IP_UDP;
5468 if (sc->hn_caps & HN_CAP_TCP6CS)
5469 csum_assist |= CSUM_IP6_TCP;
5470 if (sc->hn_caps & HN_CAP_UDP6CS)
5471 csum_assist |= CSUM_IP6_UDP;
5472 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5473 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5475 if (sc->hn_caps & HN_CAP_HASHVAL) {
5477 * Support HASHVAL pktinfo on TX path.
5480 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5481 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5482 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5487 hn_destroy_tx_data(struct hn_softc *sc)
5491 if (sc->hn_chim != NULL) {
5492 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5493 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5495 device_printf(sc->hn_dev,
5496 "chimney sending buffer is referenced");
5501 if (sc->hn_tx_ring_cnt == 0)
5504 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5505 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5507 free(sc->hn_tx_ring, M_DEVBUF);
5508 sc->hn_tx_ring = NULL;
5510 sc->hn_tx_ring_cnt = 0;
5511 sc->hn_tx_ring_inuse = 0;
5514 #ifdef HN_IFSTART_SUPPORT
5517 hn_start_taskfunc(void *xtxr, int pending __unused)
5519 struct hn_tx_ring *txr = xtxr;
5521 mtx_lock(&txr->hn_tx_lock);
5522 hn_start_locked(txr, 0);
5523 mtx_unlock(&txr->hn_tx_lock);
5527 hn_start_locked(struct hn_tx_ring *txr, int len)
5529 struct hn_softc *sc = txr->hn_sc;
5530 struct ifnet *ifp = sc->hn_ifp;
5533 KASSERT(hn_use_if_start,
5534 ("hn_start_locked is called, when if_start is disabled"));
5535 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5536 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5537 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5539 if (__predict_false(txr->hn_suspended))
5542 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5546 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5547 struct hn_txdesc *txd;
5548 struct mbuf *m_head;
5551 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5555 if (len > 0 && m_head->m_pkthdr.len > len) {
5557 * This send could be time-consuming; let callers
5558 * dispatch this packet (and any follow-up packets)
5559 * to the tx taskqueue.
5561 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5566 #if defined(INET6) || defined(INET)
5567 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5568 m_head = hn_tso_fixup(m_head);
5569 if (__predict_false(m_head == NULL)) {
5570 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5573 } else if (m_head->m_pkthdr.csum_flags &
5574 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5575 m_head = hn_set_hlen(m_head);
5576 if (__predict_false(m_head == NULL)) {
5577 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5583 txd = hn_txdesc_get(txr);
5585 txr->hn_no_txdescs++;
5586 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5587 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5591 error = hn_encap(ifp, txr, txd, &m_head);
5593 /* Both txd and m_head are freed */
5594 KASSERT(txr->hn_agg_txd == NULL,
5595 ("encap failed w/ pending aggregating txdesc"));
5599 if (txr->hn_agg_pktleft == 0) {
5600 if (txr->hn_agg_txd != NULL) {
5601 KASSERT(m_head == NULL,
5602 ("pending mbuf for aggregating txdesc"));
5603 error = hn_flush_txagg(ifp, txr);
5604 if (__predict_false(error)) {
5605 atomic_set_int(&ifp->if_drv_flags,
5610 KASSERT(m_head != NULL, ("mbuf was freed"));
5611 error = hn_txpkt(ifp, txr, txd);
5612 if (__predict_false(error)) {
5613 /* txd is freed, but m_head is not */
5614 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5615 atomic_set_int(&ifp->if_drv_flags,
5623 KASSERT(txr->hn_agg_txd != NULL,
5624 ("no aggregating txdesc"));
5625 KASSERT(m_head == NULL,
5626 ("pending mbuf for aggregating txdesc"));
5631 /* Flush pending aggregated transmission. */
5632 if (txr->hn_agg_txd != NULL)
5633 hn_flush_txagg(ifp, txr);
5638 hn_start(struct ifnet *ifp)
5640 struct hn_softc *sc = ifp->if_softc;
5641 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5643 if (txr->hn_sched_tx)
5646 if (mtx_trylock(&txr->hn_tx_lock)) {
5649 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5650 mtx_unlock(&txr->hn_tx_lock);
5655 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5659 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5661 struct hn_tx_ring *txr = xtxr;
5663 mtx_lock(&txr->hn_tx_lock);
5664 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5665 hn_start_locked(txr, 0);
5666 mtx_unlock(&txr->hn_tx_lock);
5670 hn_start_txeof(struct hn_tx_ring *txr)
5672 struct hn_softc *sc = txr->hn_sc;
5673 struct ifnet *ifp = sc->hn_ifp;
5675 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5677 if (txr->hn_sched_tx)
5680 if (mtx_trylock(&txr->hn_tx_lock)) {
5683 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5684 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5685 mtx_unlock(&txr->hn_tx_lock);
5687 taskqueue_enqueue(txr->hn_tx_taskq,
5693 * Release the OACTIVE flag earlier, in the hope that
5694 * others can catch up. The task will clear the
5695 * flag again with the hn_tx_lock to avoid possible
5698 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5699 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5703 #endif /* HN_IFSTART_SUPPORT */
5706 hn_xmit(struct hn_tx_ring *txr, int len)
5708 struct hn_softc *sc = txr->hn_sc;
5709 struct ifnet *ifp = sc->hn_ifp;
5710 struct mbuf *m_head;
5713 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5714 #ifdef HN_IFSTART_SUPPORT
5715 KASSERT(hn_use_if_start == 0,
5716 ("hn_xmit is called, when if_start is enabled"));
5718 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5720 if (__predict_false(txr->hn_suspended))
5723 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5726 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5727 struct hn_txdesc *txd;
5730 if (len > 0 && m_head->m_pkthdr.len > len) {
5732 * This send could be time-consuming; let callers
5733 * dispatch this packet (and any follow-up packets)
5734 * to the tx taskqueue.
5736 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5741 txd = hn_txdesc_get(txr);
5743 txr->hn_no_txdescs++;
5744 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5745 txr->hn_oactive = 1;
5749 error = hn_encap(ifp, txr, txd, &m_head);
5751 /* Both txd and m_head are freed; discard */
5752 KASSERT(txr->hn_agg_txd == NULL,
5753 ("encap failed w/ pending aggregating txdesc"));
5754 drbr_advance(ifp, txr->hn_mbuf_br);
5758 if (txr->hn_agg_pktleft == 0) {
5759 if (txr->hn_agg_txd != NULL) {
5760 KASSERT(m_head == NULL,
5761 ("pending mbuf for aggregating txdesc"));
5762 error = hn_flush_txagg(ifp, txr);
5763 if (__predict_false(error)) {
5764 txr->hn_oactive = 1;
5768 KASSERT(m_head != NULL, ("mbuf was freed"));
5769 error = hn_txpkt(ifp, txr, txd);
5770 if (__predict_false(error)) {
5771 /* txd is freed, but m_head is not */
5772 drbr_putback(ifp, txr->hn_mbuf_br,
5774 txr->hn_oactive = 1;
5781 KASSERT(txr->hn_agg_txd != NULL,
5782 ("no aggregating txdesc"));
5783 KASSERT(m_head == NULL,
5784 ("pending mbuf for aggregating txdesc"));
5789 drbr_advance(ifp, txr->hn_mbuf_br);
5792 /* Flush pending aggregated transmission. */
5793 if (txr->hn_agg_txd != NULL)
5794 hn_flush_txagg(ifp, txr);
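/*
 * Summary of the aggregation invariants asserted in hn_start_locked() and
 * hn_xmit() above: hn_encap() may fold a packet into the aggregating
 * descriptor txr->hn_agg_txd (consuming m_head) instead of preparing a
 * standalone send; txr->hn_agg_pktleft counts how many more packets the
 * current aggregate can take.  When it reaches 0, either the aggregate is
 * flushed via hn_flush_txagg(), or -- if no aggregate is pending -- the
 * packet is sent on its own via hn_txpkt().
 */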
5799 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5801 struct hn_softc *sc = ifp->if_softc;
5802 struct hn_tx_ring *txr;
5805 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5806 struct rm_priotracker pt;
5808 rm_rlock(&sc->hn_vf_lock, &pt);
5809 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5810 struct mbuf *m_bpf = NULL;
5813 obytes = m->m_pkthdr.len;
5814 if (m->m_flags & M_MCAST)
5815 omcast = 1;
5817 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5818 if (bpf_peers_present(ifp->if_bpf)) {
5819 m_bpf = m_copypacket(m, M_NOWAIT);
5820 if (m_bpf == NULL) {
5822 * Failed to grab a shallow
5825 ETHER_BPF_MTAP(ifp, m);
5829 ETHER_BPF_MTAP(ifp, m);
5832 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5833 rm_runlock(&sc->hn_vf_lock, &pt);
5835 if (m_bpf != NULL) {
5837 ETHER_BPF_MTAP(ifp, m_bpf);
5841 if (error == ENOBUFS) {
5842 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5844 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5846 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5847 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5849 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5855 rm_runlock(&sc->hn_vf_lock, &pt);
5858 #if defined(INET6) || defined(INET)
5860 * Perform TSO packet header fixup or get l2/l3 header length now,
5861 * since packet headers should be cache-hot.
5863 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5864 m = hn_tso_fixup(m);
5865 if (__predict_false(m == NULL)) {
5866 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5869 } else if (m->m_pkthdr.csum_flags &
5870 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5871 m = hn_set_hlen(m);
5872 if (__predict_false(m == NULL)) {
5873 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5880 * Select the TX ring based on flowid
5882 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
5883 #if defined(INET6) || defined(INET)
5886 if (m->m_pkthdr.len < 128 &&
5887 (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
5888 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
5889 m = hn_check_tcpsyn(m, &tcpsyn);
5890 if (__predict_false(m == NULL)) {
5892 IFCOUNTER_OERRORS, 1);
5896 #else
5897 const int tcpsyn = 0;
5898 #endif
5899 if (tcpsyn)
5900 idx = 0;
5901 else
5902 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
5904 txr = &sc->hn_tx_ring[idx];
5906 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
5908 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5912 if (txr->hn_oactive)
5915 if (txr->hn_sched_tx)
5918 if (mtx_trylock(&txr->hn_tx_lock)) {
5921 sched = hn_xmit(txr, txr->hn_direct_tx_size);
5922 mtx_unlock(&txr->hn_tx_lock);
5927 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5932 hn_tx_ring_qflush(struct hn_tx_ring *txr)
5936 mtx_lock(&txr->hn_tx_lock);
5937 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
5938 m_freem(m);
5939 mtx_unlock(&txr->hn_tx_lock);
5943 hn_xmit_qflush(struct ifnet *ifp)
5945 struct hn_softc *sc = ifp->if_softc;
5946 struct rm_priotracker pt;
5949 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
5950 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5953 rm_rlock(&sc->hn_vf_lock, &pt);
5954 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
5955 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
5956 rm_runlock(&sc->hn_vf_lock, &pt);
5960 hn_xmit_txeof(struct hn_tx_ring *txr)
5963 if (txr->hn_sched_tx)
5966 if (mtx_trylock(&txr->hn_tx_lock)) {
5969 txr->hn_oactive = 0;
5970 sched = hn_xmit(txr, txr->hn_direct_tx_size);
5971 mtx_unlock(&txr->hn_tx_lock);
5973 taskqueue_enqueue(txr->hn_tx_taskq,
5979 * Release the oactive flag earlier, in the hope that
5980 * others can catch up. The task will clear the
5981 * oactive again with the hn_tx_lock to avoid possible
5984 txr->hn_oactive = 0;
5985 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5990 hn_xmit_taskfunc(void *xtxr, int pending __unused)
5992 struct hn_tx_ring *txr = xtxr;
5994 mtx_lock(&txr->hn_tx_lock);
5996 mtx_unlock(&txr->hn_tx_lock);
6000 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6002 struct hn_tx_ring *txr = xtxr;
6004 mtx_lock(&txr->hn_tx_lock);
6005 txr->hn_oactive = 0;
6007 mtx_unlock(&txr->hn_tx_lock);
6011 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6013 struct vmbus_chan_br cbr;
6014 struct hn_rx_ring *rxr;
6015 struct hn_tx_ring *txr = NULL;
6018 idx = vmbus_chan_subidx(chan);
6021 * Link this channel to RX/TX ring.
6023 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6024 ("invalid channel index %d, should > 0 && < %d",
6025 idx, sc->hn_rx_ring_inuse));
6026 rxr = &sc->hn_rx_ring[idx];
6027 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6028 ("RX ring %d already attached", idx));
6029 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6030 rxr->hn_chan = chan;
6033 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6034 idx, vmbus_chan_id(chan));
6037 if (idx < sc->hn_tx_ring_inuse) {
6038 txr = &sc->hn_tx_ring[idx];
6039 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6040 ("TX ring %d already attached", idx));
6041 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6043 txr->hn_chan = chan;
6045 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6046 idx, vmbus_chan_id(chan));
6050 /* Bind this channel to a proper CPU. */
6051 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6056 cbr.cbr = rxr->hn_br;
6057 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6058 cbr.cbr_txsz = HN_TXBR_SIZE;
6059 cbr.cbr_rxsz = HN_RXBR_SIZE;
6060 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6062 if (error == EISCONN) {
6063 if_printf(sc->hn_ifp, "bufring is connected after "
6064 "chan%u open failure\n", vmbus_chan_id(chan));
6065 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6067 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6068 vmbus_chan_id(chan), error);
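/*
 * Layout note on the channel open above: each channel's hn_br is a single
 * contiguous DMA allocation (see hn_create_rx_data()), carved into the
 * guest-to-host TX bufring (first HN_TXBR_SIZE bytes) followed by the
 * host-to-guest RX bufring (HN_RXBR_SIZE bytes), which is exactly how
 * vmbus_chan_br describes it to vmbus_chan_open_br().
 */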
6075 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6077 struct hn_rx_ring *rxr;
6080 idx = vmbus_chan_subidx(chan);
6083 * Link this channel to RX/TX ring.
6085 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6086 ("invalid channel index %d, should > 0 && < %d",
6087 idx, sc->hn_rx_ring_inuse));
6088 rxr = &sc->hn_rx_ring[idx];
6089 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6090 ("RX ring %d is not attached", idx));
6091 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6093 if (idx < sc->hn_tx_ring_inuse) {
6094 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6096 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6097 ("TX ring %d is not attached attached", idx));
6098 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6102 * Close this channel.
6105 * Channel closing does _not_ destroy the target channel.
6107 error = vmbus_chan_close_direct(chan);
6108 if (error == EISCONN) {
6109 if_printf(sc->hn_ifp, "chan%u bufring is connected "
6110 "after being closed\n", vmbus_chan_id(chan));
6111 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6113 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6114 vmbus_chan_id(chan), error);
6119 hn_attach_subchans(struct hn_softc *sc)
6121 struct vmbus_channel **subchans;
6122 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6125 KASSERT(subchan_cnt > 0, ("no sub-channels"));
6127 /* Attach the sub-channels. */
6128 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6129 for (i = 0; i < subchan_cnt; ++i) {
6132 error1 = hn_chan_attach(sc, subchans[i]);
6135 /* Move on; all channels will be detached later. */
6138 vmbus_subchan_rel(subchans, subchan_cnt);
6141 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6144 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6152 hn_detach_allchans(struct hn_softc *sc)
6154 struct vmbus_channel **subchans;
6155 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6158 if (subchan_cnt == 0)
6161 /* Detach the sub-channels. */
6162 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6163 for (i = 0; i < subchan_cnt; ++i)
6164 hn_chan_detach(sc, subchans[i]);
6165 vmbus_subchan_rel(subchans, subchan_cnt);
6169 * Detach the primary channel, _after_ all sub-channels
6172 hn_chan_detach(sc, sc->hn_prichan);
6174 /* Wait for sub-channels to be destroyed, if any. */
6175 vmbus_subchan_drain(sc->hn_prichan);
6178 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6179 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6180 HN_RX_FLAG_ATTACHED) == 0,
6181 ("%dth RX ring is still attached", i));
6183 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6184 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6185 HN_TX_FLAG_ATTACHED) == 0,
6186 ("%dth TX ring is still attached", i));
6192 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6194 struct vmbus_channel **subchans;
6195 int nchan, rxr_cnt, error;
6197 nchan = *nsubch + 1;
6200 * Multiple RX/TX rings are not requested.
6207 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6210 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6212 /* No RSS; this is benign. */
6217 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6221 if (nchan > rxr_cnt)
6224 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6230 * Allocate sub-channels from NVS.
6232 *nsubch = nchan - 1;
6233 error = hn_nvs_alloc_subchans(sc, nsubch);
6234 if (error || *nsubch == 0) {
6235 /* Failed to allocate sub-channels. */
6241 * Wait for all sub-channels to become ready before moving on.
6243 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6244 vmbus_subchan_rel(subchans, *nsubch);
6249 hn_synth_attachable(const struct hn_softc *sc)
6253 if (sc->hn_flags & HN_FLAG_ERRORS)
6256 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6257 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6259 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6266 * Make sure that the RX filter is zero after the successful
6267 * RNDIS initialization.
6270 * Under certain conditions on certain versions of Hyper-V,
6271 * the RNDIS rxfilter is _not_ zero on the hypervisor side
6272 * after the successful RNDIS initialization, which breaks
6273 * the assumption of any following code (well, it breaks the
6274 * RNDIS API contract actually). Clear the RNDIS rxfilter
6275 * explicitly, drain packets sneaking through, and drain the
6276 * interrupt taskqueues scheduled due to the stealth packets.
6279 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6282 hn_rndis_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
6283 hn_drain_rxtx(sc, nchan);
6287 hn_synth_attach(struct hn_softc *sc, int mtu)
6289 #define ATTACHED_NVS 0x0002
6290 #define ATTACHED_RNDIS 0x0004
6292 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6293 int error, nsubch, nchan = 1, i, rndis_inited;
6294 uint32_t old_caps, attached = 0;
6296 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6297 ("synthetic parts were attached"));
6299 if (!hn_synth_attachable(sc))
6302 /* Save capabilities for later verification. */
6303 old_caps = sc->hn_caps;
6306 /* Clear RSS stuffs. */
6307 sc->hn_rss_ind_size = 0;
6308 sc->hn_rss_hash = 0;
6309 sc->hn_rss_hcap = 0;
6312 * Attach the primary channel _before_ attaching NVS and RNDIS.
6314 error = hn_chan_attach(sc, sc->hn_prichan);
6321 error = hn_nvs_attach(sc, mtu);
6324 attached |= ATTACHED_NVS;
6327 * Attach RNDIS _after_ NVS is attached.
6329 error = hn_rndis_attach(sc, mtu, &rndis_inited);
6331 attached |= ATTACHED_RNDIS;
6336 * Make sure capabilities are not changed.
6338 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6339 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6340 old_caps, sc->hn_caps);
6346 * Allocate sub-channels for multi-TX/RX rings.
6349 * The # of RX rings that can be used is equivalent to the # of
6350 * channels to be requested.
6352 nsubch = sc->hn_rx_ring_cnt - 1;
6353 error = hn_synth_alloc_subchans(sc, &nsubch);
6356 /* NOTE: _Full_ synthetic parts detach is required now. */
6357 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6360 * Set the # of TX/RX rings that could be used according to
6361 * the # of channels that NVS offered.
6364 hn_set_ring_inuse(sc, nchan);
6366 /* Only the primary channel can be used; done */
6371 * Attach the sub-channels.
6373 * NOTE: hn_set_ring_inuse() _must_ have been called.
6375 error = hn_attach_subchans(sc);
6380 * Configure RSS key and indirect table _after_ all sub-channels
6383 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6385 * RSS key is not set yet; set it to the default RSS key.
6388 if_printf(sc->hn_ifp, "setup default RSS key\n");
6389 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6390 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6393 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6395 * RSS indirect table is not set yet; set it up in round-
6399 if_printf(sc->hn_ifp, "setup default RSS indirect "
6402 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
6403 rss->rss_ind[i] = i % nchan;
6404 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6407 * # of usable channels may be changed, so we have to
6408 * make sure that all entries in RSS indirect table
6411 * NOTE: hn_set_ring_inuse() _must_ have been called.
6413 hn_rss_ind_fixup(sc);
6416 sc->hn_rss_hash = sc->hn_rss_hcap;
6417 if ((sc->hn_flags & HN_FLAG_RXVF) ||
6418 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6419 /* NOTE: Don't reconfigure RSS; will do immediately. */
6420 hn_vf_rss_fixup(sc, false);
6422 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6427 * Fixup transmission aggregation setup.
6430 hn_rndis_init_fixat(sc, nchan);
6434 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6435 hn_rndis_init_fixat(sc, nchan);
6436 hn_synth_detach(sc);
6438 if (attached & ATTACHED_RNDIS) {
6439 hn_rndis_init_fixat(sc, nchan);
6440 hn_rndis_detach(sc);
6442 if (attached & ATTACHED_NVS)
6444 hn_chan_detach(sc, sc->hn_prichan);
6445 /* Restore old capabilities. */
6446 sc->hn_caps = old_caps;
6450 #undef ATTACHED_RNDIS
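/*
 * Worked example of the default RSS indirect table set up above
 * (illustrative): with nchan = 4, rss_ind[] is filled round-robin as
 * 0,1,2,3,0,1,2,3,... across all NDIS_HASH_INDCNT entries, spreading
 * hashed flows evenly over the usable channels; hn_rss_ind_fixup() then
 * rewrites any entry that points at a ring beyond hn_rx_ring_inuse.
 */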
6456 * The interface must have been suspended through hn_suspend(), before
6457 * this function gets called.
6460 hn_synth_detach(struct hn_softc *sc)
6463 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6464 ("synthetic parts were not attached"));
6466 /* Detach the RNDIS first. */
6467 hn_rndis_detach(sc);
6472 /* Detach all of the channels. */
6473 hn_detach_allchans(sc);
6475 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6479 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6481 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6482 ("invalid ring count %d", ring_cnt));
6484 if (sc->hn_tx_ring_cnt > ring_cnt)
6485 sc->hn_tx_ring_inuse = ring_cnt;
6487 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6488 sc->hn_rx_ring_inuse = ring_cnt;
6491 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6492 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6497 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6502 * The TX bufring will not be drained by the hypervisor,
6503 * if the primary channel is revoked.
6505 while (!vmbus_chan_rx_empty(chan) ||
6506 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6507 !vmbus_chan_tx_empty(chan)))
6509 vmbus_chan_intr_drain(chan);
6513 hn_disable_rx(struct hn_softc *sc)
6517 * Disable RX by clearing RX filter forcefully.
6519 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6520 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6523 * Give RNDIS enough time to flush all pending data packets.
6525 pause("waitrx", (200 * hz) / 1000);
6530 * RX/TX _must_ have been suspended/disabled, before this function
6534 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6536 struct vmbus_channel **subch = NULL;
6540 * Drain RX/TX bufrings and interrupts.
6544 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6546 if (subch != NULL) {
6549 for (i = 0; i < nsubch; ++i)
6550 hn_chan_drain(sc, subch[i]);
6552 hn_chan_drain(sc, sc->hn_prichan);
6555 vmbus_subchan_rel(subch, nsubch);
6559 hn_suspend_data(struct hn_softc *sc)
6561 struct hn_tx_ring *txr;
6569 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6570 txr = &sc->hn_tx_ring[i];
6572 mtx_lock(&txr->hn_tx_lock);
6573 txr->hn_suspended = 1;
6574 mtx_unlock(&txr->hn_tx_lock);
6575 /* No one is able to send more packets now. */
6578 * Wait for all pending sends to finish.
6581 * We will _not_ receive all pending send-done, if the
6582 * primary channel is revoked.
6584 while (hn_tx_ring_pending(txr) &&
6585 !vmbus_chan_is_revoked(sc->hn_prichan))
6586 pause("hnwtx", 1 /* 1 tick */);
6597 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6600 * Drain any pending TX tasks.
6603 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6604 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6606 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6607 txr = &sc->hn_tx_ring[i];
6609 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6610 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6615 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6618 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6622 hn_suspend_mgmt(struct hn_softc *sc)
6629 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6630 * through hn_mgmt_taskq.
6632 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6633 vmbus_chan_run_task(sc->hn_prichan, &task);
6636 * Make sure that all pending management tasks are completed.
6638 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6639 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6640 taskqueue_drain_all(sc->hn_mgmt_taskq0);
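/*
 * Note on the synchronization trick above: running
 * hn_suspend_mgmt_taskfunc() through vmbus_chan_run_task() serializes the
 * clearing of hn_mgmt_taskq with the primary channel's own task
 * execution, so once the task has run, no channel callback can still
 * observe the stale taskqueue pointer; only then is draining
 * hn_mgmt_taskq0 safe.
 */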
6644 hn_suspend(struct hn_softc *sc)
6647 /* Disable polling. */
6651 * If the non-transparent mode VF is activated, the synthetic
6652 * device is receiving packets, so the data path of the
6653 * synthetic device must be suspended.
6655 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6656 (sc->hn_flags & HN_FLAG_RXVF))
6657 hn_suspend_data(sc);
6658 hn_suspend_mgmt(sc);
6662 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6666 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6667 ("invalid TX ring count %d", tx_ring_cnt));
6669 for (i = 0; i < tx_ring_cnt; ++i) {
6670 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6672 mtx_lock(&txr->hn_tx_lock);
6673 txr->hn_suspended = 0;
6674 mtx_unlock(&txr->hn_tx_lock);
6679 hn_resume_data(struct hn_softc *sc)
6688 hn_rxfilter_config(sc);
6691 * Make sure to clear suspend status on "all" TX rings,
6692 * since hn_tx_ring_inuse can be changed after
6693 * hn_suspend_data().
6695 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6697 #ifdef HN_IFSTART_SUPPORT
6698 if (!hn_use_if_start)
6702 * Flush unused drbrs, since hn_tx_ring_inuse may be
6705 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6706 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6712 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6713 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6716 * Use txeof task, so that any pending oactive can be
6719 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6724 hn_resume_mgmt(struct hn_softc *sc)
6727 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6730 * Kick off network change detection, if it was pending.
6731 * If no network change was pending, start link status
6732 * checks, which is more lightweight than network change
6735 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6736 hn_change_network(sc);
6738 hn_update_link_status(sc);
6742 hn_resume(struct hn_softc *sc)
6746 * If the non-transparent mode VF is activated, the synthetic
6747 * device has to receive packets, so the data path of the
6748 * synthetic device must be resumed.
6750 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6751 (sc->hn_flags & HN_FLAG_RXVF))
6755 * Don't resume link status change if VF is attached/activated.
6756 * - In the non-transparent VF mode, the synthetic device marks
6757 * link down until the VF is deactivated; i.e. VF is down.
6758 * - In transparent VF mode, VF's media status is used until
6759 * the VF is detached.
6761 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6762 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6766 * Re-enable polling if this interface is running and
6767 * the polling is requested.
6769 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6770 hn_polling(sc, sc->hn_pollhz);
6774 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6776 const struct rndis_status_msg *msg;
6779 if (dlen < sizeof(*msg)) {
6780 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6785 switch (msg->rm_status) {
6786 case RNDIS_STATUS_MEDIA_CONNECT:
6787 case RNDIS_STATUS_MEDIA_DISCONNECT:
6788 hn_update_link_status(sc);
6791 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6792 case RNDIS_STATUS_LINK_SPEED_CHANGE:
6793 /* Not really useful; ignore. */
6796 case RNDIS_STATUS_NETWORK_CHANGE:
6797 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6798 if (dlen < ofs + msg->rm_stbuflen ||
6799 msg->rm_stbuflen < sizeof(uint32_t)) {
6800 if_printf(sc->hn_ifp, "network changed\n");
6804 memcpy(&change, ((const uint8_t *)msg) + ofs,
6806 if_printf(sc->hn_ifp, "network changed, change %u\n",
6809 hn_change_network(sc);
6813 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6820 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6822 const struct rndis_pktinfo *pi = info_data;
6825 while (info_dlen != 0) {
6829 if (__predict_false(info_dlen < sizeof(*pi)))
6831 if (__predict_false(info_dlen < pi->rm_size))
6833 info_dlen -= pi->rm_size;
6835 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
6837 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
6839 dlen = pi->rm_size - pi->rm_pktinfooffset;
6842 switch (pi->rm_type) {
6843 case NDIS_PKTINFO_TYPE_VLAN:
6844 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
6846 info->vlan_info = *((const uint32_t *)data);
6847 mask |= HN_RXINFO_VLAN;
6850 case NDIS_PKTINFO_TYPE_CSUM:
6851 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
6853 info->csum_info = *((const uint32_t *)data);
6854 mask |= HN_RXINFO_CSUM;
6857 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
6858 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
6860 info->hash_value = *((const uint32_t *)data);
6861 mask |= HN_RXINFO_HASHVAL;
6864 case HN_NDIS_PKTINFO_TYPE_HASHINF:
6865 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
6867 info->hash_info = *((const uint32_t *)data);
6868 mask |= HN_RXINFO_HASHINF;
6875 if (mask == HN_RXINFO_ALL) {
6876 /* All found; done */
6880 pi = (const struct rndis_pktinfo *)
6881 ((const uint8_t *)pi + pi->rm_size);
6886 * - If there is no hash value, invalidate the hash info.
6888 if ((mask & HN_RXINFO_HASHVAL) == 0)
6889 info->hash_info = HN_NDIS_HASH_INFO_INVALID;
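/*
 * Layout sketch (informational) of the per-packet-info records walked by
 * hn_rndis_rxinfo() above: each record is rm_size bytes long, with the
 * payload starting rm_pktinfooffset bytes from the record start, and
 * rm_size must satisfy the RNDIS_PKTINFO_SIZE_ALIGNMASK alignment check:
 *
 *	+---------+---------+------------------+----------------------+
 *	| rm_size | rm_type | rm_pktinfooffset | payload (dlen bytes) |
 *	+---------+---------+------------------+----------------------+
 *	|<------------------------- rm_size ------------------------->|
 */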
6893 static __inline bool
6894 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
6895 {
6896
6897 if (off < check_off) {
6898 if (__predict_true(off + len <= check_off))
6899 return (false);
6900 } else if (off > check_off) {
6901 if (__predict_true(check_off + check_len <= off))
6902 return (false);
6903 }
6904 return (true);
6905 }
6908 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
6910 const struct rndis_packet_msg *pkt;
6911 struct hn_rxinfo info;
6912 int data_off, pktinfo_off, data_len, pktinfo_len;
6917 if (__predict_false(dlen < sizeof(*pkt))) {
6918 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
6923 if (__predict_false(dlen < pkt->rm_len)) {
6924 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
6925 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
6928 if (__predict_false(pkt->rm_len <
6929 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
6930 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
6931 "msglen %u, data %u, oob %u, pktinfo %u\n",
6932 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
6933 pkt->rm_pktinfolen);
6936 if (__predict_false(pkt->rm_datalen == 0)) {
6937 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
6944 #define IS_OFFSET_INVALID(ofs) \
6945 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
6946 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
6948 /* XXX Hyper-V does not meet data offset alignment requirement */
6949 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
6950 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6951 "data offset %u\n", pkt->rm_dataoffset);
6954 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
6955 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
6956 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6957 "oob offset %u\n", pkt->rm_oobdataoffset);
6960 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
6961 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
6962 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6963 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
6967 #undef IS_OFFSET_INVALID
6969 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
6970 data_len = pkt->rm_datalen;
6971 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
6972 pktinfo_len = pkt->rm_pktinfolen;
6975 * Check OOB coverage.
6977 if (__predict_false(pkt->rm_oobdatalen != 0)) {
6978 int oob_off, oob_len;
6980 if_printf(rxr->hn_ifp, "got oobdata\n");
6981 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
6982 oob_len = pkt->rm_oobdatalen;
6984 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
6985 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6986 "oob overflow, msglen %u, oob abs %d len %d\n",
6987 pkt->rm_len, oob_off, oob_len);
6992 * Check against data.
6994 if (hn_rndis_check_overlap(oob_off, oob_len,
6995 data_off, data_len)) {
6996 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6997 "oob overlaps data, oob abs %d len %d, "
6998 "data abs %d len %d\n",
6999 oob_off, oob_len, data_off, data_len);
7004 * Check against pktinfo.
7006 if (pktinfo_len != 0 &&
7007 hn_rndis_check_overlap(oob_off, oob_len,
7008 pktinfo_off, pktinfo_len)) {
7009 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7010 "oob overlaps pktinfo, oob abs %d len %d, "
7011 "pktinfo abs %d len %d\n",
7012 oob_off, oob_len, pktinfo_off, pktinfo_len);
7018 * Check per-packet-info coverage and find useful per-packet-info.
7020 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7021 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7022 info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7023 if (__predict_true(pktinfo_len != 0)) {
7027 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7028 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7029 "pktinfo overflow, msglen %u, "
7030 "pktinfo abs %d len %d\n",
7031 pkt->rm_len, pktinfo_off, pktinfo_len);
7036 * Check packet info coverage.
7038 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7039 data_off, data_len);
7040 if (__predict_false(overlap)) {
7041 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7042 "pktinfo overlap data, pktinfo abs %d len %d, "
7043 "data abs %d len %d\n",
7044 pktinfo_off, pktinfo_len, data_off, data_len);
7049 * Find useful per-packet-info.
7051 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7052 pktinfo_len, &info);
7053 if (__predict_false(error)) {
7054 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7060 if (__predict_false(data_off + data_len > pkt->rm_len)) {
7061 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7062 "data overflow, msglen %u, data abs %d len %d\n",
7063 pkt->rm_len, data_off, data_len);
7066 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
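/*
 * Layout sketch (informational) of the RNDIS packet message validated
 * above.  The rm_*offset fields are converted to absolute offsets with
 * RNDIS_PACKET_MSG_OFFSET_ABS(); the checks only enforce that the data,
 * OOB and pktinfo regions each fit within rm_len and do not overlap
 * pairwise -- their order is not fixed:
 *
 *	+--------+--------------+-------------+--------------+
 *	| header | data         | oobdata     | pktinfo      |
 *	+--------+--------------+-------------+--------------+
 *	|<--------------------- rm_len ---------------------->|
 */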
7069 static __inline void
7070 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7072 const struct rndis_msghdr *hdr;
7074 if (__predict_false(dlen < sizeof(*hdr))) {
7075 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7080 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7081 /* Hot data path. */
7082 hn_rndis_rx_data(rxr, data, dlen);
7087 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7088 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7090 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7094 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7096 const struct hn_nvs_hdr *hdr;
7098 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7099 if_printf(sc->hn_ifp, "invalid nvs notify\n");
7102 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7104 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7105 /* Useless; ignore */
7108 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7112 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7113 const struct vmbus_chanpkt_hdr *pkt)
7115 struct hn_nvs_sendctx *sndc;
7117 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7118 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7119 VMBUS_CHANPKT_DATALEN(pkt));
7122 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7128 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7129 const struct vmbus_chanpkt_hdr *pkthdr)
7131 const struct vmbus_chanpkt_rxbuf *pkt;
7132 const struct hn_nvs_hdr *nvs_hdr;
7135 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7136 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7139 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7141 /* Make sure that this is a RNDIS message. */
7142 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7143 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7148 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7149 if (__predict_false(hlen < sizeof(*pkt))) {
7150 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7153 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7155 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7156 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7161 count = pkt->cp_rxbuf_cnt;
7162 if (__predict_false(hlen <
7163 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7164 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7168 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7169 for (i = 0; i < count; ++i) {
7172 ofs = pkt->cp_rxbuf[i].rb_ofs;
7173 len = pkt->cp_rxbuf[i].rb_len;
7174 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7175 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflows rxbuf, "
7176 "ofs %d, len %d\n", i, ofs, len);
7179 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7183 * Ack the consumed RXBUF associated w/ this channel packet,
7184 * so that this RXBUF can be recycled by the hypervisor.
7186 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7190 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7193 struct hn_nvs_rndis_ack ack;
7196 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7197 ack.nvs_status = HN_NVS_STATUS_OK;
7201 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7202 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7203 if (__predict_false(error == EAGAIN)) {
7206 * This should _not_ happen in real world, since the
7207 * consumption of the TX bufring from the TX path is
7210 if (rxr->hn_ack_failed == 0)
7211 if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7212 rxr->hn_ack_failed++;
7219 if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7224 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7226 struct hn_rx_ring *rxr = xrxr;
7227 struct hn_softc *sc = rxr->hn_ifp->if_softc;
7230 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7233 pktlen = rxr->hn_pktbuf_len;
7234 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7235 if (__predict_false(error == ENOBUFS)) {
7240 * Expand channel packet buffer.
7243 * Use M_WAITOK here, since allocation failure
7246 nlen = rxr->hn_pktbuf_len * 2;
7247 while (nlen < pktlen)
7249 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7251 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7252 rxr->hn_pktbuf_len, nlen);
7254 free(rxr->hn_pktbuf, M_DEVBUF);
7255 rxr->hn_pktbuf = nbuf;
7256 rxr->hn_pktbuf_len = nlen;
7259 } else if (__predict_false(error == EAGAIN)) {
7260 /* No more channel packets; done! */
7263 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7265 switch (pkt->cph_type) {
7266 case VMBUS_CHANPKT_TYPE_COMP:
7267 hn_nvs_handle_comp(sc, chan, pkt);
7270 case VMBUS_CHANPKT_TYPE_RXBUF:
7271 hn_nvs_handle_rxbuf(rxr, chan, pkt);
7274 case VMBUS_CHANPKT_TYPE_INBAND:
7275 hn_nvs_handle_notify(sc, pkt);
7279 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7284 hn_chan_rollup(rxr, rxr->hn_txr);
7288 hn_sysinit(void *arg __unused)
7292 #ifdef HN_IFSTART_SUPPORT
7294 * Don't use ifnet.if_start if transparent VF mode is requested;
7295 * mainly due to the IFF_DRV_OACTIVE flag.
7297 if (hn_xpnt_vf && hn_use_if_start) {
7298 hn_use_if_start = 0;
7299 printf("hn: tranparent VF mode, if_transmit will be used, "
7300 "instead of if_start\n");
7303 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7304 printf("hn: invalid transparent VF attach wait "
7305 "timeout %d, reset to %d\n",
7306 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7307 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7311 * Initialize VF map.
7313 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7314 hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7315 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7319 * Fix the # of TX taskqueues.
7321 if (hn_tx_taskq_cnt <= 0)
7322 hn_tx_taskq_cnt = 1;
7323 else if (hn_tx_taskq_cnt > mp_ncpus)
7324 hn_tx_taskq_cnt = mp_ncpus;
7327 * Fix the TX taskqueue mode.
7329 switch (hn_tx_taskq_mode) {
7330 case HN_TX_TASKQ_M_INDEP:
7331 case HN_TX_TASKQ_M_GLOBAL:
7332 case HN_TX_TASKQ_M_EVTTQ:
7335 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7339 if (vm_guest != VM_GUEST_HV)
7342 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7345 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7346 M_DEVBUF, M_WAITOK);
7347 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7348 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7349 taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7350 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7354 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7357 hn_sysuninit(void *arg __unused)
7360 if (hn_tx_taskque != NULL) {
7363 for (i = 0; i < hn_tx_taskq_cnt; ++i)
7364 taskqueue_free(hn_tx_taskque[i]);
7365 free(hn_tx_taskque, M_DEVBUF);
7368 if (hn_vfmap != NULL)
7369 free(hn_vfmap, M_DEVBUF);
7370 rm_destroy(&hn_vfmap_lock);
7372 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);