/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>
#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/rndis.h>
#include <net/rss_config.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"
#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

#define HN_VFMAP_SIZE_DEF		8

#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1
#define HN_LOCK_INIT(sc)					\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)	sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)	sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)						\
do {								\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)		\
		DELAY(1000);					\
} while (0)
#define HN_UNLOCK(sc)		sx_xunlock(&(sc)->hn_lock)
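
/*
 * Illustrative usage sketch (not part of the driver): HN_LOCK() spins
 * on sx_try_xlock() with a short DELAY() between attempts rather than
 * blocking outright, and the typical pattern in the ioctl/config paths
 * below is
 *
 *	HN_LOCK(sc);
 *	... reconfigure softc state ...
 *	HN_UNLOCK(sc);
 *
 * The body shown between the lock/unlock calls is a placeholder, not
 * code from this file.
 */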
#define HN_CSUM_IP_MASK		(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK	(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
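
/*
 * Worked example (illustrative; the concrete numbers are assumptions,
 * not taken from this file): with align = CACHE_LINE_SIZE = 64 and
 * HN_RNDIS_PKT_LEN evaluating to, say, 166 bytes, a 1514-byte frame
 * costs HN_PKTSIZE(m, 64) = roundup2(1514 + 166, 64) = 1728 bytes of
 * chimney/aggregation space: the RNDIS packet message is reserved up
 * front and the total is padded to the alignment boundary.
 */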
#ifdef RSS
#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
#else
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
#endif
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004
struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
static int	hn_probe(device_t);
static int	hn_attach(device_t);
static int	hn_detach(device_t);
static int	hn_shutdown(device_t);
static void	hn_chan_callback(struct vmbus_channel *,
		    void *);

static void	hn_init(void *);
static int	hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void	hn_start(struct ifnet *);
#endif
static int	hn_transmit(struct ifnet *, struct mbuf *);
static void	hn_xmit_qflush(struct ifnet *);
static int	hn_ifmedia_upd(struct ifnet *);
static void	hn_ifmedia_sts(struct ifnet *,
		    struct ifmediareq *);

static void	hn_ifnet_event(void *, struct ifnet *, int);
static void	hn_ifaddr_event(void *, struct ifnet *);
static void	hn_ifnet_attevent(void *, struct ifnet *);
static void	hn_ifnet_detevent(void *, struct ifnet *);
static void	hn_ifnet_lnkevent(void *, struct ifnet *, int);

static bool	hn_ismyvf(const struct hn_softc *,
		    const struct ifnet *);
static void	hn_rxvf_change(struct hn_softc *,
		    struct ifnet *, bool);
static void	hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void	hn_rxvf_set_task(void *, int);
static void	hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
static int	hn_xpnt_vf_iocsetflags(struct hn_softc *);
static int	hn_xpnt_vf_iocsetcaps(struct hn_softc *,
		    struct ifreq *);
static void	hn_xpnt_vf_saveifflags(struct hn_softc *);
static bool	hn_xpnt_vf_isready(struct hn_softc *);
static void	hn_xpnt_vf_setready(struct hn_softc *);
static void	hn_xpnt_vf_init_taskfunc(void *, int);
static void	hn_xpnt_vf_init(struct hn_softc *);
static void	hn_xpnt_vf_setenable(struct hn_softc *);
static void	hn_xpnt_vf_setdisable(struct hn_softc *, bool);
static void	hn_vf_rss_fixup(struct hn_softc *, bool);
static void	hn_vf_rss_restore(struct hn_softc *);

static int	hn_rndis_rxinfo(const void *, int,
		    struct hn_rxinfo *);
static void	hn_rndis_rx_data(struct hn_rx_ring *,
		    const void *, int);
static void	hn_rndis_rx_status(struct hn_softc *,
		    const void *, int);
static void	hn_rndis_init_fixat(struct hn_softc *, int);

static void	hn_nvs_handle_notify(struct hn_softc *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_handle_comp(struct hn_softc *,
		    struct vmbus_channel *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_handle_rxbuf(struct hn_rx_ring *,
		    struct vmbus_channel *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_ack_rxbuf(struct hn_rx_ring *,
		    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int	hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int	hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int	hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
#ifndef RSS
static int	hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);

static void	hn_stop(struct hn_softc *, bool);
static void	hn_init_locked(struct hn_softc *);
static int	hn_chan_attach(struct hn_softc *,
		    struct vmbus_channel *);
static void	hn_chan_detach(struct hn_softc *,
		    struct vmbus_channel *);
static int	hn_attach_subchans(struct hn_softc *);
static void	hn_detach_allchans(struct hn_softc *);
static void	hn_chan_rollup(struct hn_rx_ring *,
		    struct hn_tx_ring *);
static void	hn_set_ring_inuse(struct hn_softc *, int);
static int	hn_synth_attach(struct hn_softc *, int);
static void	hn_synth_detach(struct hn_softc *);
static int	hn_synth_alloc_subchans(struct hn_softc *,
		    int *);
static bool	hn_synth_attachable(const struct hn_softc *);
static void	hn_suspend(struct hn_softc *);
static void	hn_suspend_data(struct hn_softc *);
static void	hn_suspend_mgmt(struct hn_softc *);
static void	hn_resume(struct hn_softc *);
static void	hn_resume_data(struct hn_softc *);
static void	hn_resume_mgmt(struct hn_softc *);
static void	hn_suspend_mgmt_taskfunc(void *, int);
static void	hn_chan_drain(struct hn_softc *,
		    struct vmbus_channel *);
static void	hn_disable_rx(struct hn_softc *);
static void	hn_drain_rxtx(struct hn_softc *, int);
static void	hn_polling(struct hn_softc *, u_int);
static void	hn_chan_polling(struct vmbus_channel *, u_int);
static void	hn_mtu_change_fixup(struct hn_softc *);

static void	hn_update_link_status(struct hn_softc *);
static void	hn_change_network(struct hn_softc *);
static void	hn_link_taskfunc(void *, int);
static void	hn_netchg_init_taskfunc(void *, int);
static void	hn_netchg_status_taskfunc(void *, int);
static void	hn_link_status(struct hn_softc *);

static int	hn_create_rx_data(struct hn_softc *, int);
static void	hn_destroy_rx_data(struct hn_softc *);
static int	hn_check_iplen(const struct mbuf *, int);
static int	hn_set_rxfilter(struct hn_softc *, uint32_t);
static int	hn_rxfilter_config(struct hn_softc *);
static int	hn_rss_reconfig(struct hn_softc *);
static void	hn_rss_ind_fixup(struct hn_softc *);
static void	hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
static int	hn_rxpkt(struct hn_rx_ring *, const void *,
		    int, const struct hn_rxinfo *);
static uint32_t	hn_rss_type_fromndis(uint32_t);
static uint32_t	hn_rss_type_tondis(uint32_t);

static int	hn_tx_ring_create(struct hn_softc *, int);
static void	hn_tx_ring_destroy(struct hn_tx_ring *);
static int	hn_create_tx_data(struct hn_softc *, int);
static void	hn_fixup_tx_data(struct hn_softc *);
static void	hn_destroy_tx_data(struct hn_softc *);
static void	hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void	hn_txdesc_gc(struct hn_tx_ring *,
		    struct hn_txdesc *);
static int	hn_encap(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *, struct mbuf **);
static int	hn_txpkt(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *);
static void	hn_set_chim_size(struct hn_softc *, int);
static void	hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool	hn_tx_ring_pending(struct hn_tx_ring *);
static void	hn_tx_ring_qflush(struct hn_tx_ring *);
static void	hn_resume_tx(struct hn_softc *, int);
static void	hn_set_txagg(struct hn_softc *);
static void	*hn_try_txagg(struct ifnet *,
		    struct hn_tx_ring *, struct hn_txdesc *,
		    int);
static int	hn_get_txswq_depth(const struct hn_tx_ring *);
static void	hn_txpkt_done(struct hn_nvs_sendctx *,
		    struct hn_softc *, struct vmbus_channel *,
		    const void *, int);
static int	hn_txpkt_sglist(struct hn_tx_ring *,
		    struct hn_txdesc *);
static int	hn_txpkt_chim(struct hn_tx_ring *,
		    struct hn_txdesc *);
static int	hn_xmit(struct hn_tx_ring *, int);
static void	hn_xmit_taskfunc(void *, int);
static void	hn_xmit_txeof(struct hn_tx_ring *);
static void	hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int	hn_start_locked(struct hn_tx_ring *, int);
static void	hn_start_taskfunc(void *, int);
static void	hn_start_txeof(struct hn_tx_ring *);
static void	hn_start_txeof_taskfunc(void *, int);
#endif
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");
/* Trust TCP segment verification on the host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust TCP segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on the host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust UDP datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on the host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust IP packet verification on host side, "
    "when csum info is missing (global setting)");
/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif
static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif
/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vflist_sysctl, "A", "VF list");

SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vfmap_sysctl, "A", "VF mapping");

/* Transparent VF */
static int hn_xpnt_vf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for the transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");
static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;	/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};
static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif
static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}
static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}
static void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
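
/*
 * Illustrative usage sketch (hypothetical caller, not part of the
 * driver): the chimney slot allocator and its release helper pair up
 * as follows, with the SG-list path as the fallback when the bitmap
 * is exhausted:
 *
 *	uint32_t idx = hn_chim_alloc(sc);
 *
 *	if (idx == HN_NVS_CHIM_IDX_INVALID)
 *		return (hn_txpkt_sglist(txr, txd));
 *	... copy the packet into the chimney buffer at index 'idx' ...
 *	hn_chim_free(sc, idx);
 *
 * Since both helpers use atomic bitmap operations, no TX ring lock is
 * needed around the allocation itself.
 */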
#if defined(INET6) || defined(INET)

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)
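
/*
 * Usage note (illustrative): PULLUP_HDR() expands in place and makes
 * the *enclosing* function return NULL when m_pullup() fails, e.g.
 *
 *	PULLUP_HDR(m_head, sizeof(struct ether_vlan_header));
 *	evl = mtod(m_head, struct ether_vlan_header *);
 *
 * m_pullup() frees the chain on failure, which is why the helpers
 * below document that m_head is gone when they return NULL.
 */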
/*
 * NOTE: If this function fails, m_head will have been freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);
		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}
		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));
		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
}
/*
 * NOTE: If this function fails, m_head will have been freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct ether_vlan_header *evl;
	const struct tcphdr *th;
	int ehlen;

	*tcpsyn = 0;
	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TCP) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);
		if (th->th_flags & TH_SYN)
			*tcpsyn = 1;
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP)
			return (m_head);
		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));
		if (th->th_flags & TH_SYN)
			*tcpsyn = 1;
	}
#endif
	return (m_head);
}

#endif	/* INET6 || INET */
static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}
static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * promiscuous mode.
	 */
	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}
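
/*
 * Worked example (illustrative): with HN_TX_DESC_CNT (512) descriptors
 * per TX ring and the hw.hn.tx_swq_depth tunable left at its default
 * of 0, this returns 512; setting the tunable to 2048 deepens the
 * software queue (IFQ or buf_ring) to 2048 instead.
 */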
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}
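
/*
 * Illustrative example (hypothetical numbers): if the indirect table
 * was built for 4 channels, e.g. [0, 1, 2, 3, 0, 1, ...], but only
 * nchan = 2 channels are in use, every entry >= 2 is clamped to
 * nchan - 1, yielding [0, 1, 1, 1, 0, 1, ...].  The resulting spread
 * is skewed toward the last ring, but every referenced RX ring is
 * valid.
 */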
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return (EOPNOTSUPP);
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}
static void
hn_rxvf_set_task(void *xarg, int pending __unused)
{
	struct hn_rxvf_setarg *arg = xarg;

	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
}

static void
hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
{
	struct hn_rx_ring *rxr;
	struct hn_rxvf_setarg arg;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			arg.rxr = rxr;
			arg.vf_ifp = vf_ifp;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_rxvf_ifp = vf_ifp;
		}
	}
}
static bool
hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
{
	const struct ifnet *hn_ifp;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		return (false);

	if (ifp->if_alloctype != IFT_ETHER)
		return (false);

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		return (false);

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		return (false);

	return (true);
}
static void
hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	if (!hn_ismyvf(sc, ifp))
		goto out;
	hn_ifp = sc->hn_ifp;

	if (rxvf) {
		if (sc->hn_flags & HN_FLAG_RXVF)
			goto out;

		sc->hn_flags |= HN_FLAG_RXVF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_RXVF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_RXVF;
		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);

	hn_rxvf_set(sc, rxvf ? ifp : NULL);

	if (rxvf) {
		hn_vf_rss_fixup(sc, true);
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_vf_rss_restore(sc);
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
	    rxvf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "datapath is switched %s %s\n",
		    rxvf ? "to" : "from", ifp->if_xname);
	}
out:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
{

	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;
	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
}

static void
hn_ifaddr_event(void *arg, struct ifnet *ifp)
{

	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
}
static int
hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
{
	struct ifnet *ifp, *vf_ifp;
	uint64_t tmp;
	int error;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Fix up requested capabilities w/ supported capabilities,
	 * since the supported capabilities could have been changed.
	 */
	ifr->ifr_reqcap &= ifp->if_capabilities;
	/* Pass SIOCSIFCAP to VF. */
	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);

	/*
	 * NOTE:
	 * The error will be propagated to the callers, however, it
	 * is _not_ useful here.
	 */

	/*
	 * Merge VF's enabled capabilities.
	 */
	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
	if (ifp->if_capenable & IFCAP_TSO4)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
	if (ifp->if_capenable & IFCAP_TSO6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	return (error);
}
static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	struct ifnet *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
}
static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
}
static void
hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	struct ifnet *hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (vf_ifp->if_index < hn_vfmap_size)
		hn_ifp = hn_vfmap[vf_ifp->if_index];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/*
			 * Allow tapping on the VF.
			 */
			ETHER_BPF_MTAP(vf_ifp, mn);

			/*
			 * Update VF stats.
			 */
			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
				    mn->m_pkthdr.len);
			}
			/*
			 * XXX IFCOUNTER_IMCAST
			 * This stat updating is kinda invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check.  As of this writing,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF a try in vain.
			 */

			/*
			 * Fix up rcvif and increase hn(4)'s ipackets.
			 */
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		/*
		 * Go through hn(4)'s if_input.
		 */
		hn_ifp->if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * mbuf chain.
		 */
		while (m != NULL) {
			mn = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
			m = mn;
		}
	}
}
static void
hn_mtu_change_fixup(struct hn_softc *sc)
{
	struct ifnet *ifp;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;

	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
#endif
}
static uint32_t
hn_rss_type_fromndis(uint32_t rss_hash)
{
	uint32_t types = 0;

	if (rss_hash & NDIS_HASH_IPV4)
		types |= RSS_TYPE_IPV4;
	if (rss_hash & NDIS_HASH_TCP_IPV4)
		types |= RSS_TYPE_TCP_IPV4;
	if (rss_hash & NDIS_HASH_IPV6)
		types |= RSS_TYPE_IPV6;
	if (rss_hash & NDIS_HASH_IPV6_EX)
		types |= RSS_TYPE_IPV6_EX;
	if (rss_hash & NDIS_HASH_TCP_IPV6)
		types |= RSS_TYPE_TCP_IPV6;
	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
		types |= RSS_TYPE_TCP_IPV6_EX;
	return (types);
}
static uint32_t
hn_rss_type_tondis(uint32_t types)
{
	uint32_t rss_hash = 0;

	KASSERT((types &
	    (RSS_TYPE_UDP_IPV4 | RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
	    ("UDP4, UDP6 and UDP6EX are not supported"));

	if (types & RSS_TYPE_IPV4)
		rss_hash |= NDIS_HASH_IPV4;
	if (types & RSS_TYPE_TCP_IPV4)
		rss_hash |= NDIS_HASH_TCP_IPV4;
	if (types & RSS_TYPE_IPV6)
		rss_hash |= NDIS_HASH_IPV6;
	if (types & RSS_TYPE_IPV6_EX)
		rss_hash |= NDIS_HASH_IPV6_EX;
	if (types & RSS_TYPE_TCP_IPV6)
		rss_hash |= NDIS_HASH_TCP_IPV6;
	if (types & RSS_TYPE_TCP_IPV6_EX)
		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
	return (rss_hash);
}
static void
hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
{
	int i;

	HN_LOCK_ASSERT(sc);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
}
static void
hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifrsshash ifrh;
	struct ifrsskey ifrk;
	int error;
	uint32_t my_types, diff_types, mbuf_types = 0;

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1) {
		/* No RSS on synthetic parts; done. */
		return;
	}
	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
		/* Synthetic parts do not support Toeplitz; done. */
		return;
	}

	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Extract the VF's RSS key.  Only the 40-byte Toeplitz key is
	 * supported.
	 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrk.ifrk_func);
		goto done;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    vf_ifp->if_xname, ifrk.ifrk_keylen);
		goto done;
	}

	/*
	 * Extract the VF's RSS hash.  Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrh.ifrh_func);
		goto done;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed.  "
		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
		    ifrh.ifrh_types, my_types);
		goto done;
	}

	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on the RX path.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
	}

	/*
	 * Indirect table does not matter.
	 */
	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
	    hn_rss_type_tondis(my_types);
	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
	if (reconf) {
		error = hn_rss_reconfig(sc);
		if (error) {
			/* XXX roll-back? */
			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
			/* XXX keep going. */
		}
	}
done:
	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
}
static void
hn_vf_rss_restore(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1)
		goto done;

	/*
	 * Restore hash types.  Key does _not_ matter.
	 */
	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
		int error;

		sc->hn_rss_hash = sc->hn_rss_hcap;
		error = hn_rss_reconfig(sc);
		if (error) {
			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
			    error);
			/* XXX keep going. */
		}
	}
done:
	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
}
static void
hn_xpnt_vf_setready(struct hn_softc *sc)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Mark the VF ready.
	 */
	sc->hn_vf_rdytick = 0;

	/*
	 * Save information for restoration.
	 */
	sc->hn_saved_caps = ifp->if_capabilities;
	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;

	/*
	 * Intersect supported/enabled capabilities.
	 *
	 * NOTE:
	 * if_hwassist is not changed here.
	 */
	ifp->if_capabilities &= vf_ifp->if_capabilities;
	ifp->if_capenable &= ifp->if_capabilities;

	/*
	 * Fix TSO settings.
	 */
	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;

	/*
	 * Change VF's enabled capabilities.
	 */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_reqcap = ifp->if_capenable;
	hn_xpnt_vf_iocsetcaps(sc, &ifr);

	if (ifp->if_mtu != ETHERMTU) {
		int error;

		/*
		 * Change VF's MTU.
		 */
		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
		ifr.ifr_mtu = ifp->if_mtu;
		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
		if (error) {
			if_printf(ifp, "%s SIOCSIFMTU %u failed\n",
			    vf_ifp->if_xname, ifp->if_mtu);
			if (ifp->if_mtu > ETHERMTU) {
				if_printf(ifp, "change MTU to %d\n", ETHERMTU);

				/*
				 * XXX
				 * Don't try to adjust the synthetic parts'
				 * MTU; failure of that adjustment would
				 * cause endless headaches.
				 */
				ifp->if_mtu = ETHERMTU;
				hn_mtu_change_fixup(sc);
			}
		}
	}
}
static bool
hn_xpnt_vf_isready(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);

	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
		return (false);

	if (sc->hn_vf_rdytick == 0)
		return (true);

	if (sc->hn_vf_rdytick > ticks)
		return (false);

	/* Mark VF as ready. */
	hn_xpnt_vf_setready(sc);
	return (true);
}
static void
hn_xpnt_vf_setenable(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
}

static void
hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
	if (clear_vf)
		sc->hn_vf_ifp = NULL;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
}
static void
hn_xpnt_vf_init(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));

	if (bootverbose) {
		if_printf(sc->hn_ifp, "try bringing up %s\n",
		    sc->hn_vf_ifp->if_xname);
	}

	/*
	 * Bring the VF up.
	 */
	hn_xpnt_vf_saveifflags(sc);
	sc->hn_vf_ifp->if_flags |= IFF_UP;
	error = hn_xpnt_vf_iocsetflags(sc);
	if (error) {
		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
		    sc->hn_vf_ifp->if_xname, error);
		return;
	}

	/*
	 * NOTE:
	 * Datapath setting must happen _after_ bringing the VF up.
	 */
	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);

	/*
	 * NOTE:
	 * Fix up RSS related bits _after_ the VF is brought up, since
	 * many VFs generate their RSS key during initialization.
	 */
	hn_vf_rss_fixup(sc, true);

	/* Mark transparent mode VF as enabled. */
	hn_xpnt_vf_setenable(sc);
}
static void
hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		goto done;
	if (sc->hn_vf_ifp == NULL)
		goto done;
	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
		goto done;

	if (sc->hn_vf_rdytick != 0) {
		/* Mark VF as ready. */
		hn_xpnt_vf_setready(sc);
	}

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
		/*
		 * Delayed VF initialization.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "delayed initialize %s\n",
			    sc->hn_vf_ifp->if_xname);
		}
		hn_xpnt_vf_init(sc);
	}
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    sc->hn_vf_ifp->if_xname);
		goto done;
	}

	if (hn_xpnt_vf && ifp->if_start != NULL) {
		/*
		 * ifnet.if_start is _not_ supported by transparent
		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
		 */
		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
		    "in transparent VF mode.\n", ifp->if_xname);
		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	if (ifp->if_index >= hn_vfmap_size) {
		struct ifnet **newmap;
		int newsize;

		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);

		memcpy(newmap, hn_vfmap,
		    sizeof(struct ifnet *) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);

		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}
	KASSERT(hn_vfmap[ifp->if_index] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
	hn_vfmap[ifp->if_index] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
	sc->hn_vf_ifp = ifp;
	rm_wunlock(&sc->hn_vf_lock);

	if (hn_xpnt_vf) {
		int wait_ticks;

		/*
		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
		 * Save vf_ifp's current if_input for later restoration.
		 */
		sc->hn_vf_input = ifp->if_input;
		ifp->if_input = hn_xpnt_vf_input;

		/*
		 * Stop link status management; use the VF's.
		 */
		hn_suspend_mgmt(sc);

		/*
		 * Give the VF some time to complete its attach routine.
		 */
		wait_ticks = hn_xpnt_vf_attwait * hz;
		sc->hn_vf_rdytick = ticks + wait_ticks;

		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
		    wait_ticks);
	}
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (sc->hn_vf_ifp == NULL)
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (hn_xpnt_vf) {
		/*
		 * Make sure that the delayed initialization is not running.
		 *
		 * NOTE:
		 * - This lock _must_ be released, since the hn_vf_init task
		 *   will try holding this lock.
		 * - It is safe to release this lock here, since the
		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
		 *
		 * XXX racy, if hn(4) ever detached.
		 */
		HN_UNLOCK(sc);
		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
		HN_LOCK(sc);

		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
		    sc->hn_ifp->if_xname));
		ifp->if_input = sc->hn_vf_input;
		sc->hn_vf_input = NULL;

		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);

		if (sc->hn_vf_rdytick == 0) {
			/*
			 * The VF was ready; restore some settings.
			 */
			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
			/*
			 * NOTE:
			 * There is _no_ need to fixup if_capenable and
			 * if_hwassist, since the if_capabilities before
			 * restoration was an intersection of the VF's
			 * if_capabilities and the synthetic device's
			 * if_capabilities.
			 */
			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
			sc->hn_ifp->if_hw_tsomaxsegcount =
			    sc->hn_saved_tsosegcnt;
			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
		}

		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			/*
			 * Restore RSS settings.
			 */
			hn_vf_rss_restore(sc);

			/*
			 * Resume link status management, which was suspended
			 * by hn_ifnet_attevent().
			 */
			hn_resume_mgmt(sc);
		}
	}

	/* Mark transparent mode VF as disabled. */
	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);

	rm_wlock(&hn_vfmap_lock);

	KASSERT(ifp->if_index < hn_vfmap_size,
	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
	if (hn_vfmap[ifp->if_index] != NULL) {
		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
		    ("%s: ifindex %d was mapped to %s",
		     ifp->if_xname, ifp->if_index,
		     hn_vfmap[ifp->if_index]->if_xname));
		hn_vfmap[ifp->if_index] = NULL;
	}

	rm_wunlock(&hn_vfmap_lock);
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_vf_ifp == ifp)
		if_link_state_change(sc->hn_ifp, link_state);
}
static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}
static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);
	rm_init(&sc->hn_vf_lock, "hnvf");
	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	if (hn_xpnt_vf) {
		/*
		 * Setup taskqueue for VF tasks, e.g. delayed VF bring-up.
		 */
		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
		    device_get_nameunit(dev));
		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
		    hn_xpnt_vf_init_taskfunc, sc);
	}
	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed if an error happens later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}
#ifdef RSS
	if (ring_cnt > rss_getnumbuckets())
		ring_cnt = rss_getnumbuckets();
#endif

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
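
	/*
	 * Illustrative example (hypothetical numbers): on an 8-CPU guest
	 * with ring_cnt = 4, the first hn(4) instance gets hn_cpu = 0 and
	 * binds its channels to CPUs 0-3 via HN_RING_IDX2CPU(); the
	 * atomic_fetchadd_int() above then makes a second instance start
	 * at hn_cpu = 4, spreading channel work across the CPUs.
	 */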
	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;
#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fix up TX-related settings after the synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);
	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
	    "max # of TSO segments");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
	    "max size of TSO segment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
#ifndef RSS
	/*
	 * Don't allow RSS key/indirect table changes when RSS is defined.
	 */
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
#endif
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disables polling");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_vf_sysctl, "A", "Virtual Function's name");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_xpnt_vf_enabled_sysctl, "I",
	    "Transparent VF enabled");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_xpnt_vf_accbpf_sysctl, "I",
	    "Accurate BPF for transparent VF");
2264 /* Setup the ifmedia, which has been initialized earlier. */
2266 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2267 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2268 /* XXX ifmedia_set really should do this for us */
2269 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2272 /* Setup the ifnet for this interface. */
2275 ifp->if_baudrate = IF_Gbps(10);
2276 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2277 ifp->if_ioctl = hn_ioctl;
2278 ifp->if_init = hn_init;
2279 #ifdef HN_IFSTART_SUPPORT
2280 if (hn_use_if_start) {
2281 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2283 ifp->if_start = hn_start;
2284 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2285 ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2286 IFQ_SET_READY(&ifp->if_snd);
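/*
 * A minimal sketch (not the driver's actual hn_start) of how a legacy
 * ifnet.if_start handler typically drains the if_snd queue sized above.
 * "example_start" and "example_send_one" are hypothetical names used
 * purely for illustration.
 */
#if 0	/* illustration only */
static int example_send_one(struct ifnet *, struct mbuf *);

static void
example_start(struct ifnet *ifp)
{
	struct mbuf *m;

	while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
		if (m == NULL)
			break;
		/* Encapsulate and hand the packet to the backend. */
		if (example_send_one(ifp, m) != 0) {
			/* Requeue on transient failure and back off. */
			IFQ_DRV_PREPEND(&ifp->if_snd, m);
			break;
		}
	}
}
#endif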
2290 ifp->if_transmit = hn_transmit;
2291 ifp->if_qflush = hn_xmit_qflush;
2294 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2296 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
2297 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2299 if (sc->hn_caps & HN_CAP_VLAN) {
2300 /* XXX not sure about VLAN_MTU. */
2301 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2304 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2305 if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2306 ifp->if_capabilities |= IFCAP_TXCSUM;
2307 if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2308 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2309 if (sc->hn_caps & HN_CAP_TSO4) {
2310 ifp->if_capabilities |= IFCAP_TSO4;
2311 ifp->if_hwassist |= CSUM_IP_TSO;
2313 if (sc->hn_caps & HN_CAP_TSO6) {
2314 ifp->if_capabilities |= IFCAP_TSO6;
2315 ifp->if_hwassist |= CSUM_IP6_TSO;
2318 /* Enable all available capabilities by default. */
2319 ifp->if_capenable = ifp->if_capabilities;
2322 /* Disable IPv6 TSO and TXCSUM by default; they can still
2323 * be enabled through SIOCSIFCAP. */
2325 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2326 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2328 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2330 /* Lock hn_set_tso_maxsize() to simplify its internal logic. */
2334 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2336 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2337 ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2340 ether_ifattach(ifp, eaddr);
2342 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2343 if_printf(ifp, "TSO segcnt %u segsz %u\n",
2344 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
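/*
 * The capabilities configured above can be toggled from userland
 * through SIOCSIFCAP, e.g. to re-enable the IPv6 TSO/TXCSUM bits that
 * were disabled by default earlier.  A hedged userland sketch
 * (standalone program, not kernel code; "enable_tso6" is a
 * hypothetical helper and error handling is trimmed):
 */
#if 0	/* userland illustration only */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <string.h>
#include <unistd.h>

static int
enable_tso6(const char *ifname)
{
	struct ifreq ifr;
	int s, error = -1;

	s = socket(AF_LOCAL, SOCK_DGRAM, 0);
	if (s < 0)
		return (-1);
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
	/* Fetch the current capability bits first. */
	if (ioctl(s, SIOCGIFCAP, &ifr) == 0) {
		ifr.ifr_reqcap = ifr.ifr_curcap |
		    IFCAP_TSO6 | IFCAP_TXCSUM_IPV6;
		error = ioctl(s, SIOCSIFCAP, &ifr);
	}
	close(s);
	return (error);
}
#endif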
2347 /* Inform the upper layer about the long frame support. */
2348 ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2351 /* Kick off link status check. */
2353 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2354 hn_update_link_status(sc);
2357 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2358 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2359 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2360 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2362 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2363 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2368 /* Subscribe to the ether_ifattach event, instead of the ifnet_arrival
2369 * event, since the interface's LLADDR is needed; the interface LLADDR
2370 * is not available when the ifnet_arrival event is triggered. */
2372 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2373 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2374 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2375 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2379 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2380 hn_synth_detach(sc);
2386 hn_detach(device_t dev)
2388 struct hn_softc *sc = device_get_softc(dev);
2389 struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2391 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2393 /* In case the vmbus missed the orphan handler installation. */
2396 vmbus_xact_ctx_orphan(sc->hn_xact);
2399 if (sc->hn_ifaddr_evthand != NULL)
2400 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2401 if (sc->hn_ifnet_evthand != NULL)
2402 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2403 if (sc->hn_ifnet_atthand != NULL) {
2404 EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2405 sc->hn_ifnet_atthand);
2407 if (sc->hn_ifnet_dethand != NULL) {
2408 EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2409 sc->hn_ifnet_dethand);
2411 if (sc->hn_ifnet_lnkhand != NULL)
2412 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2414 vf_ifp = sc->hn_vf_ifp;
2415 __compiler_membar();
2417 hn_ifnet_detevent(sc, vf_ifp);
2419 if (device_is_attached(dev)) {
2421 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2422 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2426 /* hn_stop() only suspends data, so management
2427 * tasks have to be suspended manually here. */
2429 hn_suspend_mgmt(sc);
2430 hn_synth_detach(sc);
2433 ether_ifdetach(ifp);
2436 ifmedia_removeall(&sc->hn_media);
2437 hn_destroy_rx_data(sc);
2438 hn_destroy_tx_data(sc);
2440 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2443 for (i = 0; i < hn_tx_taskq_cnt; ++i)
2444 taskqueue_free(sc->hn_tx_taskqs[i]);
2445 free(sc->hn_tx_taskqs, M_DEVBUF);
2447 taskqueue_free(sc->hn_mgmt_taskq0);
2448 if (sc->hn_vf_taskq != NULL)
2449 taskqueue_free(sc->hn_vf_taskq);
2451 if (sc->hn_xact != NULL) {
2453 /* Uninstall the orphan handler _before_ the xact is destroyed. */
2456 vmbus_chan_unset_orphan(sc->hn_prichan);
2457 vmbus_xact_ctx_destroy(sc->hn_xact);
2462 HN_LOCK_DESTROY(sc);
2463 rm_destroy(&sc->hn_vf_lock);
2468 hn_shutdown(device_t dev)
2475 hn_link_status(struct hn_softc *sc)
2477 uint32_t link_status;
2480 error = hn_rndis_get_linkstatus(sc, &link_status);
2482 /* XXX what to do? */
2486 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2487 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2489 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2490 if_link_state_change(sc->hn_ifp,
2491 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2492 LINK_STATE_UP : LINK_STATE_DOWN);
2496 hn_link_taskfunc(void *xsc, int pending __unused)
2498 struct hn_softc *sc = xsc;
2500 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2506 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2508 struct hn_softc *sc = xsc;
2510 /* Prevent any link status checks from running. */
2511 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2514 /* Fake up a [link down --> link up] state change; a 5 second
2515 * delay is used, which closely simulates the miibus reaction
2516 * to a link down event. */
2518 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2519 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2520 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2521 &sc->hn_netchg_status, 5 * hz);
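/*
 * The 5 second delay above relies on a timeout task.  A minimal sketch
 * of the general pattern, assuming a taskqueue "tq" and the hypothetical
 * names "example_tt", "example_func" and "example_arm":
 */
#if 0	/* illustration only */
static struct timeout_task example_tt;

static void
example_func(void *arg __unused, int pending __unused)
{
	/* Runs once, roughly 5 seconds after being enqueued. */
}

static void
example_arm(struct taskqueue *tq, void *arg)
{
	TIMEOUT_TASK_INIT(tq, &example_tt, 0, example_func, arg);
	taskqueue_enqueue_timeout(tq, &example_tt, 5 * hz);
}
#endif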
2525 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2527 struct hn_softc *sc = xsc;
2529 /* Re-allow link status checks. */
2530 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2535 hn_update_link_status(struct hn_softc *sc)
2538 if (sc->hn_mgmt_taskq != NULL)
2539 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2543 hn_change_network(struct hn_softc *sc)
2546 if (sc->hn_mgmt_taskq != NULL)
2547 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2551 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2552 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2554 struct mbuf *m = *m_head;
2557 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2559 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2560 m, segs, nsegs, BUS_DMA_NOWAIT);
2561 if (error == EFBIG) {
2564 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2568 *m_head = m = m_new;
2569 txr->hn_tx_collapsed++;
2571 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2572 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2575 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2576 BUS_DMASYNC_PREWRITE);
2577 txd->flags |= HN_TXD_FLAG_DMAMAP;
2583 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2586 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2587 ("put an onlist txd %#x", txd->flags));
2588 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2589 ("put an onagg txd %#x", txd->flags));
2591 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2592 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2595 if (!STAILQ_EMPTY(&txd->agg_list)) {
2596 struct hn_txdesc *tmp_txd;
2598 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2601 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2602 ("resursive aggregation on aggregated txdesc"));
2603 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2604 ("not aggregated txdesc"));
2605 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2606 ("aggregated txdesc uses dmamap"));
2607 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2608 ("aggregated txdesc consumes "
2609 "chimney sending buffer"));
2610 KASSERT(tmp_txd->chim_size == 0,
2611 ("aggregated txdesc has non-zero "
2612 "chimney sending size"));
2614 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2615 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2616 freed = hn_txdesc_put(txr, tmp_txd);
2617 KASSERT(freed, ("failed to free aggregated txdesc"));
2621 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2622 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2623 ("chim txd uses dmamap"));
2624 hn_chim_free(txr->hn_sc, txd->chim_index);
2625 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2627 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2628 bus_dmamap_sync(txr->hn_tx_data_dtag,
2629 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2630 bus_dmamap_unload(txr->hn_tx_data_dtag,
2632 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2635 if (txd->m != NULL) {
2640 txd->flags |= HN_TXD_FLAG_ONLIST;
2641 #ifndef HN_USE_TXDESC_BUFRING
2642 mtx_lock_spin(&txr->hn_txlist_spin);
2643 KASSERT(txr->hn_txdesc_avail >= 0 &&
2644 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2645 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2646 txr->hn_txdesc_avail++;
2647 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2648 mtx_unlock_spin(&txr->hn_txlist_spin);
2649 #else /* HN_USE_TXDESC_BUFRING */
2651 atomic_add_int(&txr->hn_txdesc_avail, 1);
2653 buf_ring_enqueue(txr->hn_txdesc_br, txd);
2654 #endif /* !HN_USE_TXDESC_BUFRING */
2659 static __inline struct hn_txdesc *
2660 hn_txdesc_get(struct hn_tx_ring *txr)
2662 struct hn_txdesc *txd;
2664 #ifndef HN_USE_TXDESC_BUFRING
2665 mtx_lock_spin(&txr->hn_txlist_spin);
2666 txd = SLIST_FIRST(&txr->hn_txlist);
2668 KASSERT(txr->hn_txdesc_avail > 0,
2669 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2670 txr->hn_txdesc_avail--;
2671 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2673 mtx_unlock_spin(&txr->hn_txlist_spin);
2675 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2679 #ifdef HN_USE_TXDESC_BUFRING
2681 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2683 #endif /* HN_USE_TXDESC_BUFRING */
2684 KASSERT(txd->m == NULL && txd->refs == 0 &&
2685 STAILQ_EMPTY(&txd->agg_list) &&
2686 txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2687 txd->chim_size == 0 &&
2688 (txd->flags & HN_TXD_FLAG_ONLIST) &&
2689 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2690 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2691 txd->flags &= ~HN_TXD_FLAG_ONLIST;
2697 static __inline void
2698 hn_txdesc_hold(struct hn_txdesc *txd)
2701 /* 0->1 transition will never work */
2702 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2703 atomic_add_int(&txd->refs, 1);
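/*
 * A standalone sketch of the hold/put refcount discipline used by the
 * txdesc code above: a descriptor starts with one reference, holds are
 * only legal while at least one reference exists (no 0->1 resurrection),
 * and whoever drops the last reference frees.  Plain C11 atomics, for
 * illustration only; "ref_obj" is a hypothetical type.
 */
#if 0	/* illustration only */
#include <stdatomic.h>
#include <assert.h>

struct ref_obj {
	atomic_int refs;	/* set to 1 on creation */
};

static void
ref_hold(struct ref_obj *o)
{
	/* Legal only while refs > 0; a 0->1 transition would race free. */
	int old = atomic_fetch_add(&o->refs, 1);
	assert(old > 0);
}

static int
ref_put(struct ref_obj *o)
{
	/* Returns 1 when the caller dropped the last reference. */
	int old = atomic_fetch_sub(&o->refs, 1);
	assert(old > 0);
	return (old == 1);
}
#endif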
2706 static __inline void
2707 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2710 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2711 ("recursive aggregation on aggregating txdesc"));
2713 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2714 ("already aggregated"));
2715 KASSERT(STAILQ_EMPTY(&txd->agg_list),
2716 ("recursive aggregation on to-be-aggregated txdesc"));
2718 txd->flags |= HN_TXD_FLAG_ONAGG;
2719 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2723 hn_tx_ring_pending(struct hn_tx_ring *txr)
2725 bool pending = false;
2727 #ifndef HN_USE_TXDESC_BUFRING
2728 mtx_lock_spin(&txr->hn_txlist_spin);
2729 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2731 mtx_unlock_spin(&txr->hn_txlist_spin);
2733 if (!buf_ring_full(txr->hn_txdesc_br))
2739 static __inline void
2740 hn_txeof(struct hn_tx_ring *txr)
2742 txr->hn_has_txeof = 0;
2747 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2748 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2750 struct hn_txdesc *txd = sndc->hn_cbarg;
2751 struct hn_tx_ring *txr;
2754 KASSERT(txr->hn_chan == chan,
2755 ("channel mismatch, on chan%u, should be chan%u",
2756 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2758 txr->hn_has_txeof = 1;
2759 hn_txdesc_put(txr, txd);
2761 ++txr->hn_txdone_cnt;
2762 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2763 txr->hn_txdone_cnt = 0;
2764 if (txr->hn_oactive)
2770 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2772 #if defined(INET) || defined(INET6)
2773 tcp_lro_flush_all(&rxr->hn_lro);
2778 /* 'txr' could be NULL, if multiple channels and
2779 * ifnet.if_start method are enabled. */
2781 if (txr == NULL || !txr->hn_has_txeof)
2784 txr->hn_txdone_cnt = 0;
2788 static __inline uint32_t
2789 hn_rndis_pktmsg_offset(uint32_t ofs)
2792 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2793 ("invalid RNDIS packet msg offset %u", ofs));
2794 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
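/*
 * Worked example of the conversion above: RNDIS wants rm_dataoffset and
 * rm_pktinfooffset expressed relative to the rm_dataoffset field rather
 * than the start of the message.  rm_type and rm_len (4 bytes each)
 * precede rm_dataoffset, so that field sits 8 bytes in, and an absolute
 * header offset of, e.g., sizeof(struct rndis_packet_msg) == 44 (11
 * 32-bit fields) converts to 44 - 8 == 36.  Illustration only:
 */
#if 0	/* illustration only */
	uint32_t abs_ofs = sizeof(struct rndis_packet_msg);	/* 44 */
	uint32_t rel_ofs = hn_rndis_pktmsg_offset(abs_ofs);	/* 36 */
#endif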
2797 static __inline void *
2798 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2799 size_t pi_dlen, uint32_t pi_type)
2801 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2802 struct rndis_pktinfo *pi;
2804 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2805 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2808 /* Per-packet-info does not move; it only grows.
2811 * NOTE: rm_pktinfooffset in this phase counts from the beginning
2812 * of rndis_packet_msg. */
2814 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2815 ("%u pktinfo overflows RNDIS packet msg", pi_type));
2816 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2817 pkt->rm_pktinfolen);
2818 pkt->rm_pktinfolen += pi_size;
2820 pi->rm_size = pi_size;
2821 pi->rm_type = pi_type;
2822 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2824 return (pi->rm_data);
2828 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2830 struct hn_txdesc *txd;
2834 txd = txr->hn_agg_txd;
2835 KASSERT(txd != NULL, ("no aggregate txdesc"));
2838 /* Since hn_txpkt() will reset this temporary stat, save
2839 * it now so that oerrors can be updated properly if
2840 * hn_txpkt() ever fails. */
2842 pkts = txr->hn_stat_pkts;
2845 /* Since txd's mbuf will _not_ be freed upon hn_txpkt()
2846 * failure, save it for later freeing, if hn_txpkt() ever fails. */
2850 error = hn_txpkt(ifp, txr, txd);
2851 if (__predict_false(error)) {
2852 /* txd is freed, but m is not. */
2855 txr->hn_flush_failed++;
2856 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2859 /* Reset all aggregation states. */
2860 txr->hn_agg_txd = NULL;
2861 txr->hn_agg_szleft = 0;
2862 txr->hn_agg_pktleft = 0;
2863 txr->hn_agg_prevpkt = NULL;
2869 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2874 if (txr->hn_agg_txd != NULL) {
2875 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2876 struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2877 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2881 /* Update the previous RNDIS packet's total length;
2882 * it can be increased due to the mandatory alignment
2883 * padding for this RNDIS packet.  And update the
2884 * aggregating txdesc's chimney sending buffer size accordingly.
2888 * NOTE: Zero out the padding, as required by the RNDIS spec. */
2891 pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2892 agg_txd->chim_size += pkt->rm_len - olen;
2894 /* Link this txdesc to the parent. */
2895 hn_txdesc_agg(agg_txd, txd);
2897 chim = (uint8_t *)pkt + pkt->rm_len;
2898 /* Save the current packet for later fixup. */
2899 txr->hn_agg_prevpkt = chim;
2901 txr->hn_agg_pktleft--;
2902 txr->hn_agg_szleft -= pktsize;
2903 if (txr->hn_agg_szleft <=
2904 HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2906 /* Probably can't aggregate more packets;
2907 * flush this aggregating txdesc proactively. */
2909 txr->hn_agg_pktleft = 0;
2914 hn_flush_txagg(ifp, txr);
2916 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
2918 txr->hn_tx_chimney_tried++;
2919 txd->chim_index = hn_chim_alloc(txr->hn_sc);
2920 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
2922 txr->hn_tx_chimney++;
2924 chim = txr->hn_sc->hn_chim +
2925 (txd->chim_index * txr->hn_sc->hn_chim_szmax);
2927 if (txr->hn_agg_pktmax > 1 &&
2928 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2929 txr->hn_agg_txd = txd;
2930 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
2931 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
2932 txr->hn_agg_prevpkt = chim;
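/*
 * Worked example of the alignment padding applied in hn_try_txagg():
 * assuming an aggregation alignment of 8, a previous RNDIS packet of
 * 61 bytes is padded to roundup2(61, 8) == 64, the aggregating txdesc's
 * chimney size grows by the 3 padding bytes, and those bytes must be
 * zeroed per the RNDIS spec.  Illustration only:
 */
#if 0	/* illustration only */
	uint32_t olen = 61, align = 8;
	uint32_t newlen = roundup2(olen, align);	/* 64 */
	uint32_t pad = newlen - olen;			/* 3 bytes, zeroed */
#endif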
2939 /* If this function fails, then both txd and m_head0 will be freed. */
2942 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2943 struct mbuf **m_head0)
2945 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
2946 int error, nsegs, i;
2947 struct mbuf *m_head = *m_head0;
2948 struct rndis_packet_msg *pkt;
2951 int pkt_hlen, pkt_size;
2953 pkt = txd->rndis_pkt;
2954 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
2955 if (pkt_size < txr->hn_chim_size) {
2956 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
2960 if (txr->hn_agg_txd != NULL)
2961 hn_flush_txagg(ifp, txr);
2964 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
2965 pkt->rm_len = m_head->m_pkthdr.len;
2966 pkt->rm_dataoffset = 0;
2967 pkt->rm_datalen = m_head->m_pkthdr.len;
2968 pkt->rm_oobdataoffset = 0;
2969 pkt->rm_oobdatalen = 0;
2970 pkt->rm_oobdataelements = 0;
2971 pkt->rm_pktinfooffset = sizeof(*pkt);
2972 pkt->rm_pktinfolen = 0;
2973 pkt->rm_vchandle = 0;
2974 pkt->rm_reserved = 0;
2976 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
2978 /* Set the hash value for this packet, so that the host could
2979 * dispatch the TX done event for this packet back to this TX
2980 * ring's channel. */
2982 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2983 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
2984 *pi_data = txr->hn_tx_idx;
2987 if (m_head->m_flags & M_VLANTAG) {
2988 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2989 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
2990 *pi_data = NDIS_VLAN_INFO_MAKE(
2991 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
2992 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
2993 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
2996 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
2997 #if defined(INET6) || defined(INET)
2998 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2999 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3001 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3002 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
3003 m_head->m_pkthdr.tso_segsz);
3006 #if defined(INET6) && defined(INET)
3011 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
3012 m_head->m_pkthdr.tso_segsz);
3015 #endif /* INET6 || INET */
3016 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3017 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3018 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3019 if (m_head->m_pkthdr.csum_flags &
3020 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3021 *pi_data = NDIS_TXCSUM_INFO_IPV6;
3023 *pi_data = NDIS_TXCSUM_INFO_IPV4;
3024 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3025 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
3028 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
3029 *pi_data |= NDIS_TXCSUM_INFO_TCPCS;
3030 else if (m_head->m_pkthdr.csum_flags &
3031 (CSUM_IP_UDP | CSUM_IP6_UDP))
3032 *pi_data |= NDIS_TXCSUM_INFO_UDPCS;
3035 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3036 /* Fixup RNDIS packet message total length */
3037 pkt->rm_len += pkt_hlen;
3038 /* Convert RNDIS packet message offsets */
3039 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3040 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3043 /* Fast path: Chimney sending. */
3046 struct hn_txdesc *tgt_txd = txd;
3048 if (txr->hn_agg_txd != NULL) {
3049 tgt_txd = txr->hn_agg_txd;
3055 KASSERT(pkt == chim,
3056 ("RNDIS pkt not in chimney sending buffer"));
3057 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3058 ("chimney sending buffer is not used"));
3059 tgt_txd->chim_size += pkt->rm_len;
3061 m_copydata(m_head, 0, m_head->m_pkthdr.len,
3062 ((uint8_t *)chim) + pkt_hlen);
3064 txr->hn_gpa_cnt = 0;
3065 txr->hn_sendpkt = hn_txpkt_chim;
3069 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3070 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3071 ("chimney buffer is used"));
3072 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3074 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3075 if (__predict_false(error)) {
3079 /* This mbuf is not linked w/ the txd yet, so free it now. */
3084 freed = hn_txdesc_put(txr, txd);
3086 ("fail to free txd upon txdma error"));
3088 txr->hn_txdma_failed++;
3089 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3094 /* +1 RNDIS packet message */
3095 txr->hn_gpa_cnt = nsegs + 1;
3097 /* send packet with page buffer */
3098 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3099 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3100 txr->hn_gpa[0].gpa_len = pkt_hlen;
3103 /* Fill the page buffers with mbuf info after the page
3104 * buffer for RNDIS packet message. */
3106 for (i = 0; i < nsegs; ++i) {
3107 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3109 gpa->gpa_page = atop(segs[i].ds_addr);
3110 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3111 gpa->gpa_len = segs[i].ds_len;
3114 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3116 txr->hn_sendpkt = hn_txpkt_sglist;
3120 /* Set the completion routine */
3121 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3123 /* Update temporary stats for later use. */
3124 txr->hn_stat_pkts++;
3125 txr->hn_stat_size += m_head->m_pkthdr.len;
3126 if (m_head->m_flags & M_MCAST)
3127 txr->hn_stat_mcasts++;
3134 /* If this function fails, then txd will be freed, but the mbuf
3135 * associated w/ the txd will _not_ be freed. */
3138 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3140 int error, send_failed = 0, has_bpf;
3143 has_bpf = bpf_peers_present(ifp->if_bpf);
3146 /* Make sure that this txd and any aggregated txds are not
3147 * freed before ETHER_BPF_MTAP. */
3149 hn_txdesc_hold(txd);
3151 error = txr->hn_sendpkt(txr, txd);
3154 const struct hn_txdesc *tmp_txd;
3156 ETHER_BPF_MTAP(ifp, txd->m);
3157 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3158 ETHER_BPF_MTAP(ifp, tmp_txd->m);
3161 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3162 #ifdef HN_IFSTART_SUPPORT
3163 if (!hn_use_if_start)
3166 if_inc_counter(ifp, IFCOUNTER_OBYTES,
3168 if (txr->hn_stat_mcasts != 0) {
3169 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3170 txr->hn_stat_mcasts);
3173 txr->hn_pkts += txr->hn_stat_pkts;
3177 hn_txdesc_put(txr, txd);
3179 if (__predict_false(error)) {
3183 /* This should "really rarely" happen.
3185 * XXX Too many RX packets to be acked, or too many sideband
3186 * commands to run?  Ask netvsc_channel_rollup()
3187 * to kick start later. */
3189 txr->hn_has_txeof = 1;
3191 txr->hn_send_failed++;
3194 /* Try sending again after setting hn_has_txeof,
3195 * in case we missed the last
3196 * netvsc_channel_rollup(). */
3200 if_printf(ifp, "send failed\n");
3203 /* Caller will perform further processing on the
3204 * associated mbuf, so don't free it in hn_txdesc_put();
3205 * only unload it from the DMA map in hn_txdesc_put(), if it was loaded. */
3209 freed = hn_txdesc_put(txr, txd);
3211 ("fail to free txd upon send error"));
3213 txr->hn_send_failed++;
3216 /* Reset temporary stats, after this sending is done. */
3217 txr->hn_stat_size = 0;
3218 txr->hn_stat_pkts = 0;
3219 txr->hn_stat_mcasts = 0;
3225 /* Append the specified data to the indicated mbuf chain;
3226 * extend the mbuf chain if the new data does not fit in
3227 * the existing space.
3229 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3230 * There should be an equivalent in the kernel mbuf code,
3231 * but there does not appear to be one yet.
3233 * Differs from m_append() in that additional mbufs are
3234 * allocated with cluster size MJUMPAGESIZE, and filled accordingly.
3237 * Return 1 if able to complete the job; otherwise 0. */
3240 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3243 int remainder, space;
3245 for (m = m0; m->m_next != NULL; m = m->m_next)
3248 space = M_TRAILINGSPACE(m);
3251 /* Copy into available space. */
3253 if (space > remainder)
3255 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3260 while (remainder > 0) {
3262 /* Allocate a new mbuf; could check space
3263 * and allocate a cluster instead. */
3265 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
3268 n->m_len = min(MJUMPAGESIZE, remainder);
3269 bcopy(cp, mtod(n, caddr_t), n->m_len);
3271 remainder -= n->m_len;
3275 if (m0->m_flags & M_PKTHDR)
3276 m0->m_pkthdr.len += len - remainder;
3278 return (remainder == 0);
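/*
 * A userland analogue of the remainder bookkeeping in hv_m_append()
 * above: fill whatever trailing space the current buffer has; the rest
 * would go into newly allocated chunks.  Illustration only; a flat
 * buffer stands in for the mbuf chain and simply fails when full.
 */
#if 0	/* illustration only */
#include <string.h>

static int
flat_append(char *dst, size_t used, size_t cap, const char *src, size_t len)
{
	size_t space = cap - used;	/* cf. M_TRAILINGSPACE() */
	size_t n = (len < space) ? len : space;

	memcpy(dst + used, src, n);
	len -= n;			/* "remainder" in hv_m_append() */
	/* hv_m_append() would now allocate MJUMPAGESIZE clusters for the
	 * remainder; a flat buffer has nowhere left to put it. */
	return (len == 0);
}
#endif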
3281 #if defined(INET) || defined(INET6)
3283 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3285 #if __FreeBSD_version >= 1100095
3286 if (hn_lro_mbufq_depth) {
3287 tcp_lro_queue_mbuf(lc, m);
3291 return tcp_lro_rx(lc, m, 0);
3296 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3297 const struct hn_rxinfo *info)
3299 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3301 int size, do_lro = 0, do_csum = 1, is_vf = 0;
3302 int hash_type = M_HASHTYPE_NONE;
3305 if (rxr->hn_rxvf_ifp != NULL) {
3307 /* Non-transparent mode VF; pretend this packet is from the VF. */
3310 ifp = rxr->hn_rxvf_ifp;
3312 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3313 /* Transparent mode VF. */
3317 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3320 /* See the NOTE of hn_rndis_init_fixat().  This
3321 * function can be reached immediately after the
3322 * RNDIS is initialized but before the ifnet is
3323 * set up on the hn_attach() path; drop the unexpected packet. */
3329 if (__predict_false(dlen < ETHER_HDR_LEN)) {
3330 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3334 if (dlen <= MHLEN) {
3335 m_new = m_gethdr(M_NOWAIT, MT_DATA);
3336 if (m_new == NULL) {
3337 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3340 memcpy(mtod(m_new, void *), data, dlen);
3341 m_new->m_pkthdr.len = m_new->m_len = dlen;
3342 rxr->hn_small_pkts++;
3345 /* Get an mbuf with a cluster.  For packets 2K or less,
3346 * get a standard 2K cluster.  For anything larger, get a
3347 * 4K cluster.  Any buffers larger than 4K can cause problems
3348 * if looped around to the Hyper-V TX channel, so avoid them. */
3351 if (dlen > MCLBYTES) {
3353 size = MJUMPAGESIZE;
3356 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3357 if (m_new == NULL) {
3358 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3362 hv_m_append(m_new, dlen, data);
3364 m_new->m_pkthdr.rcvif = ifp;
3366 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3369 /* receive side checksum offload */
3370 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3371 /* IP csum offload */
3372 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3373 m_new->m_pkthdr.csum_flags |=
3374 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3378 /* TCP/UDP csum offload */
3379 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3380 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3381 m_new->m_pkthdr.csum_flags |=
3382 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3383 m_new->m_pkthdr.csum_data = 0xffff;
3384 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3392 /* As of this writing (Oct 28th, 2016), the host side will turn
3393 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3394 * the do_lro setting here is actually _not_ accurate.  We
3395 * depend on the RSS hash type check to reset do_lro. */
3397 if ((info->csum_info &
3398 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3399 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3402 const struct ether_header *eh;
3407 /* Checked at the beginning of this function. */
3408 KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
3410 eh = mtod(m_new, struct ether_header *);
3411 etype = ntohs(eh->ether_type);
3412 if (etype == ETHERTYPE_VLAN) {
3413 const struct ether_vlan_header *evl;
3415 hoff = sizeof(*evl);
3416 if (m_new->m_len < hoff)
3418 evl = mtod(m_new, struct ether_vlan_header *);
3419 etype = ntohs(evl->evl_proto);
3422 if (etype == ETHERTYPE_IP) {
3425 pr = hn_check_iplen(m_new, hoff);
3426 if (pr == IPPROTO_TCP) {
3428 (rxr->hn_trust_hcsum &
3429 HN_TRUST_HCSUM_TCP)) {
3430 rxr->hn_csum_trusted++;
3431 m_new->m_pkthdr.csum_flags |=
3432 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3433 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3434 m_new->m_pkthdr.csum_data = 0xffff;
3437 } else if (pr == IPPROTO_UDP) {
3439 (rxr->hn_trust_hcsum &
3440 HN_TRUST_HCSUM_UDP)) {
3441 rxr->hn_csum_trusted++;
3442 m_new->m_pkthdr.csum_flags |=
3443 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3444 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3445 m_new->m_pkthdr.csum_data = 0xffff;
3447 } else if (pr != IPPROTO_DONE && do_csum &&
3448 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3449 rxr->hn_csum_trusted++;
3450 m_new->m_pkthdr.csum_flags |=
3451 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3456 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3457 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3458 NDIS_VLAN_INFO_ID(info->vlan_info),
3459 NDIS_VLAN_INFO_PRI(info->vlan_info),
3460 NDIS_VLAN_INFO_CFI(info->vlan_info));
3461 m_new->m_flags |= M_VLANTAG;
3465 /* If the VF is activated (transparent/non-transparent mode does not
3470 * matter here), hn(4) will only receive broadcast packets, multicast
3471 * packets, and TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3474 * packet types.  For non-transparent mode, we definitely _cannot_ enable
3475 * LRO at all, since the LRO flush will use hn(4) as the receiving
3476 * interface; i.e. hn_ifp->if_input(hn_ifp, m). */
3482 /* If the VF is activated (transparent/non-transparent mode does not
3483 * matter here), do _not_ mess with unsupported hash types or functions. */
3486 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3488 m_new->m_pkthdr.flowid = info->hash_value;
3490 hash_type = M_HASHTYPE_OPAQUE_HASH;
3491 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3492 NDIS_HASH_FUNCTION_TOEPLITZ) {
3493 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3498 /* do_lro is reset, if the hash types are not TCP
3499 * related.  See the comment in the above csum_flags setup section. */
3503 case NDIS_HASH_IPV4:
3504 hash_type = M_HASHTYPE_RSS_IPV4;
3508 case NDIS_HASH_TCP_IPV4:
3509 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3512 case NDIS_HASH_IPV6:
3513 hash_type = M_HASHTYPE_RSS_IPV6;
3517 case NDIS_HASH_IPV6_EX:
3518 hash_type = M_HASHTYPE_RSS_IPV6_EX;
3522 case NDIS_HASH_TCP_IPV6:
3523 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3526 case NDIS_HASH_TCP_IPV6_EX:
3527 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3531 } else if (!is_vf) {
3532 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3533 hash_type = M_HASHTYPE_OPAQUE;
3535 M_HASHTYPE_SET(m_new, hash_type);
3537 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3538 if (hn_ifp != ifp) {
3539 const struct ether_header *eh;
3542 /* Non-transparent mode VF is activated. */
3546 /* Allow tapping on hn(4). */
3548 ETHER_BPF_MTAP(hn_ifp, m_new);
3551 /* Update hn(4)'s stats. */
3553 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3554 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3555 /* Checked at the beginning of this function. */
3556 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3557 eh = mtod(m_new, struct ether_header *);
3558 if (ETHER_IS_MULTICAST(eh->ether_dhost))
3559 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3563 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3564 #if defined(INET) || defined(INET6)
3565 struct lro_ctrl *lro = &rxr->hn_lro;
3568 rxr->hn_lro_tried++;
3569 if (hn_lro_rx(lro, m_new) == 0) {
3576 ifp->if_input(ifp, m_new);
3582 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3584 struct hn_softc *sc = ifp->if_softc;
3585 struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3586 struct ifnet *vf_ifp;
3587 int mask, error = 0;
3588 struct ifrsskey *ifrk;
3589 struct ifrsshash *ifrh;
3593 if (ifr->ifr_mtu > HN_MTU_MAX) {
3600 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3605 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3606 /* Can't change MTU */
3612 if (ifp->if_mtu == ifr->ifr_mtu) {
3617 if (hn_xpnt_vf_isready(sc)) {
3618 vf_ifp = sc->hn_vf_ifp;
3620 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3621 sizeof(ifr_vf.ifr_name));
3622 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3626 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3627 vf_ifp->if_xname, ifr->ifr_mtu, error);
3633 /* Suspend this interface before the synthetic parts are detached. */
3639 /* Detach the synthetic parts, i.e. NVS and RNDIS. */
3641 hn_synth_detach(sc);
3644 /* Reattach the synthetic parts, i.e. NVS and RNDIS,
3645 * with the new MTU setting. */
3647 error = hn_synth_attach(sc, ifr->ifr_mtu);
3654 /* Commit the requested MTU, after the synthetic parts
3655 * have been successfully attached. */
3657 ifp->if_mtu = ifr->ifr_mtu;
3660 /* Synthetic parts' reattach may change the chimney
3661 * sending size; update it. */
3663 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3664 hn_set_chim_size(sc, sc->hn_chim_szmax);
3667 /* Make sure that various parameters based on MTU are
3668 * still valid, after the MTU change. */
3670 hn_mtu_change_fixup(sc);
3673 /* All done!  Resume the interface now. */
3677 if ((sc->hn_flags & HN_FLAG_RXVF) ||
3678 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3680 /* Since we have reattached the NVS part,
3681 * change the datapath to the VF again, in case
3682 * it was lost after the NVS was detached. */
3684 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3693 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3698 if (hn_xpnt_vf_isready(sc))
3699 hn_xpnt_vf_saveifflags(sc);
3701 if (ifp->if_flags & IFF_UP) {
3702 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3704 /* The caller might hold a mutex, e.g.
3705 * bpf; use busy-wait for the RNDIS filter update. */
3709 hn_rxfilter_config(sc);
3712 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3713 error = hn_xpnt_vf_iocsetflags(sc);
3718 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3721 sc->hn_if_flags = ifp->if_flags;
3729 if (hn_xpnt_vf_isready(sc)) {
3731 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3732 sizeof(ifr_vf.ifr_name));
3733 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3739 /* Fix up requested capabilities w/ supported capabilities,
3740 * since the supported capabilities could have been changed. */
3742 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3745 if (mask & IFCAP_TXCSUM) {
3746 ifp->if_capenable ^= IFCAP_TXCSUM;
3747 if (ifp->if_capenable & IFCAP_TXCSUM)
3748 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3750 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3752 if (mask & IFCAP_TXCSUM_IPV6) {
3753 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3754 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3755 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3757 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3760 /* TODO: flip RNDIS offload parameters for RXCSUM. */
3761 if (mask & IFCAP_RXCSUM)
3762 ifp->if_capenable ^= IFCAP_RXCSUM;
3764 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
3765 if (mask & IFCAP_RXCSUM_IPV6)
3766 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3769 if (mask & IFCAP_LRO)
3770 ifp->if_capenable ^= IFCAP_LRO;
3772 if (mask & IFCAP_TSO4) {
3773 ifp->if_capenable ^= IFCAP_TSO4;
3774 if (ifp->if_capenable & IFCAP_TSO4)
3775 ifp->if_hwassist |= CSUM_IP_TSO;
3777 ifp->if_hwassist &= ~CSUM_IP_TSO;
3779 if (mask & IFCAP_TSO6) {
3780 ifp->if_capenable ^= IFCAP_TSO6;
3781 if (ifp->if_capenable & IFCAP_TSO6)
3782 ifp->if_hwassist |= CSUM_IP6_TSO;
3784 ifp->if_hwassist &= ~CSUM_IP6_TSO;
3794 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3798 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3800 /* Multicast uses a mutex; use busy-wait for the RNDIS filter update. */
3804 hn_rxfilter_config(sc);
3808 /* XXX vlan(4) style mcast addr maintenance */
3809 if (hn_xpnt_vf_isready(sc)) {
3812 old_if_flags = sc->hn_vf_ifp->if_flags;
3813 hn_xpnt_vf_saveifflags(sc);
3815 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3816 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3818 error = hn_xpnt_vf_iocsetflags(sc);
3827 if (hn_xpnt_vf_isready(sc)) {
3829 /* SIOCGIFMEDIA expects ifmediareq, so don't
3830 * create and pass ifr_vf to the VF here; just
3831 * replace the ifr_name. */
3833 vf_ifp = sc->hn_vf_ifp;
3834 strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3835 sizeof(ifr->ifr_name));
3836 error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3837 /* Restore the ifr_name. */
3838 strlcpy(ifr->ifr_name, ifp->if_xname,
3839 sizeof(ifr->ifr_name));
3844 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3847 case SIOCGIFRSSHASH:
3848 ifrh = (struct ifrsshash *)data;
3850 if (sc->hn_rx_ring_inuse == 1) {
3852 ifrh->ifrh_func = RSS_FUNC_NONE;
3853 ifrh->ifrh_types = 0;
3857 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3858 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3860 ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3861 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3866 ifrk = (struct ifrsskey *)data;
3868 if (sc->hn_rx_ring_inuse == 1) {
3870 ifrk->ifrk_func = RSS_FUNC_NONE;
3871 ifrk->ifrk_keylen = 0;
3874 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3875 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3877 ifrk->ifrk_func = RSS_FUNC_PRIVATE;
3878 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
3879 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
3880 NDIS_HASH_KEYSIZE_TOEPLITZ);
3885 error = ether_ioctl(ifp, cmd, data);
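/*
 * The SIOCGIFRSSKEY/SIOCGIFRSSHASH cases above back ifconfig(8)'s RSS
 * reporting.  A hedged userland sketch of querying the Toeplitz key
 * (standalone program, not kernel code; error handling trimmed):
 */
#if 0	/* userland illustration only */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>

static int
get_rss_key(const char *ifname, uint8_t *key, uint16_t *keylen)
{
	struct ifrsskey ifrk;
	int s, error;

	s = socket(AF_LOCAL, SOCK_DGRAM, 0);
	if (s < 0)
		return (-1);
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, ifname, sizeof(ifrk.ifrk_name));
	error = ioctl(s, SIOCGIFRSSKEY, &ifrk);
	if (error == 0 && ifrk.ifrk_func == RSS_FUNC_TOEPLITZ) {
		*keylen = ifrk.ifrk_keylen;
		memcpy(key, ifrk.ifrk_key, ifrk.ifrk_keylen);
	}
	close(s);
	return (error);
}
#endif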
3892 hn_stop(struct hn_softc *sc, bool detaching)
3894 struct ifnet *ifp = sc->hn_ifp;
3899 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
3900 ("synthetic parts were not attached"));
3902 /* Clear RUNNING bit ASAP. */
3903 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3905 /* Disable polling. */
3908 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
3909 KASSERT(sc->hn_vf_ifp != NULL,
3910 ("%s: VF is not attached", ifp->if_xname));
3912 /* Mark transparent mode VF as disabled. */
3913 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
3917 /* Datapath setting must happen _before_ bringing the VF down. */
3920 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
3923 /* Bring the VF down. */
3925 hn_xpnt_vf_saveifflags(sc);
3926 sc->hn_vf_ifp->if_flags &= ~IFF_UP;
3927 hn_xpnt_vf_iocsetflags(sc);
3930 /* Suspend data transfers. */
3931 hn_suspend_data(sc);
3933 /* Clear OACTIVE bit. */
3934 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3935 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3936 sc->hn_tx_ring[i].hn_oactive = 0;
3939 /* If the non-transparent mode VF is active, make sure
3940 * that the RX filter still allows packet reception. */
3942 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
3943 hn_rxfilter_config(sc);
3947 hn_init_locked(struct hn_softc *sc)
3949 struct ifnet *ifp = sc->hn_ifp;
3954 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
3957 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3960 /* Configure RX filter */
3961 hn_rxfilter_config(sc);
3963 /* Clear OACTIVE bit. */
3964 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3965 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3966 sc->hn_tx_ring[i].hn_oactive = 0;
3968 /* Clear TX 'suspended' bit. */
3969 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
3971 if (hn_xpnt_vf_isready(sc)) {
3972 /* Initialize transparent VF. */
3973 hn_xpnt_vf_init(sc);
3976 /* Everything is ready; unleash! */
3977 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3979 /* Re-enable polling if requested. */
3980 if (sc->hn_pollhz > 0)
3981 hn_polling(sc, sc->hn_pollhz);
3987 struct hn_softc *sc = xsc;
3994 #if __FreeBSD_version >= 1100099
3997 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
3999 struct hn_softc *sc = arg1;
4000 unsigned int lenlim;
4003 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4004 error = sysctl_handle_int(oidp, &lenlim, 0, req);
4005 if (error || req->newptr == NULL)
4009 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4010 lenlim > TCP_LRO_LENGTH_MAX) {
4014 hn_set_lro_lenlim(sc, lenlim);
4021 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4023 struct hn_softc *sc = arg1;
4024 int ackcnt, error, i;
4027 /* lro_ackcnt_lim is the append count limit;
4028 * +1 turns it into the aggregation limit. */
4030 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4031 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4032 if (error || req->newptr == NULL)
4035 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4039 /* Convert the aggregation limit back to the append count limit. */
4044 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4045 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4053 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4055 struct hn_softc *sc = arg1;
4060 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4063 error = sysctl_handle_int(oidp, &on, 0, req);
4064 if (error || req->newptr == NULL)
4068 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4069 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4072 rxr->hn_trust_hcsum |= hcsum;
4074 rxr->hn_trust_hcsum &= ~hcsum;
4081 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4083 struct hn_softc *sc = arg1;
4084 int chim_size, error;
4086 chim_size = sc->hn_tx_ring[0].hn_chim_size;
4087 error = sysctl_handle_int(oidp, &chim_size, 0, req);
4088 if (error || req->newptr == NULL)
4091 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4095 hn_set_chim_size(sc, chim_size);
4100 #if __FreeBSD_version < 1100095
4102 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4104 struct hn_softc *sc = arg1;
4105 int ofs = arg2, i, error;
4106 struct hn_rx_ring *rxr;
4110 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4111 rxr = &sc->hn_rx_ring[i];
4112 stat += *((int *)((uint8_t *)rxr + ofs));
4115 error = sysctl_handle_64(oidp, &stat, 0, req);
4116 if (error || req->newptr == NULL)
4119 /* Zero out this stat. */
4120 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4121 rxr = &sc->hn_rx_ring[i];
4122 *((int *)((uint8_t *)rxr + ofs)) = 0;
4128 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4130 struct hn_softc *sc = arg1;
4131 int ofs = arg2, i, error;
4132 struct hn_rx_ring *rxr;
4136 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4137 rxr = &sc->hn_rx_ring[i];
4138 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4141 error = sysctl_handle_64(oidp, &stat, 0, req);
4142 if (error || req->newptr == NULL)
4145 /* Zero out this stat. */
4146 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4147 rxr = &sc->hn_rx_ring[i];
4148 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4156 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4158 struct hn_softc *sc = arg1;
4159 int ofs = arg2, i, error;
4160 struct hn_rx_ring *rxr;
4164 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4165 rxr = &sc->hn_rx_ring[i];
4166 stat += *((u_long *)((uint8_t *)rxr + ofs));
4169 error = sysctl_handle_long(oidp, &stat, 0, req);
4170 if (error || req->newptr == NULL)
4173 /* Zero out this stat. */
4174 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4175 rxr = &sc->hn_rx_ring[i];
4176 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
4182 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4184 struct hn_softc *sc = arg1;
4185 int ofs = arg2, i, error;
4186 struct hn_tx_ring *txr;
4190 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4191 txr = &sc->hn_tx_ring[i];
4192 stat += *((u_long *)((uint8_t *)txr + ofs));
4195 error = sysctl_handle_long(oidp, &stat, 0, req);
4196 if (error || req->newptr == NULL)
4199 /* Zero out this stat. */
4200 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4201 txr = &sc->hn_tx_ring[i];
4202 *((u_long *)((uint8_t *)txr + ofs)) = 0;
4208 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4210 struct hn_softc *sc = arg1;
4211 int ofs = arg2, i, error, conf;
4212 struct hn_tx_ring *txr;
4214 txr = &sc->hn_tx_ring[0];
4215 conf = *((int *)((uint8_t *)txr + ofs));
4217 error = sysctl_handle_int(oidp, &conf, 0, req);
4218 if (error || req->newptr == NULL)
4222 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4223 txr = &sc->hn_tx_ring[i];
4224 *((int *)((uint8_t *)txr + ofs)) = conf;
4232 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4234 struct hn_softc *sc = arg1;
4237 size = sc->hn_agg_size;
4238 error = sysctl_handle_int(oidp, &size, 0, req);
4239 if (error || req->newptr == NULL)
4243 sc->hn_agg_size = size;
4251 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4253 struct hn_softc *sc = arg1;
4256 pkts = sc->hn_agg_pkts;
4257 error = sysctl_handle_int(oidp, &pkts, 0, req);
4258 if (error || req->newptr == NULL)
4262 sc->hn_agg_pkts = pkts;
4270 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4272 struct hn_softc *sc = arg1;
4275 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4276 return (sysctl_handle_int(oidp, &pkts, 0, req));
4280 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4282 struct hn_softc *sc = arg1;
4285 align = sc->hn_tx_ring[0].hn_agg_align;
4286 return (sysctl_handle_int(oidp, &align, 0, req));
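/*
 * The read-write aggregation sysctls above can be driven from userland
 * with sysctlbyname(3).  A hedged sketch setting the per-device TX
 * aggregation size limit; "dev.hn.0.agg_size" assumes unit 0
 * (standalone program, error handling trimmed):
 */
#if 0	/* userland illustration only */
#include <sys/types.h>
#include <sys/sysctl.h>

static int
set_agg_size(int size)
{
	/* 0 disables aggregation, -1 selects the automatic limit. */
	return (sysctlbyname("dev.hn.0.agg_size", NULL, NULL,
	    &size, sizeof(size)));
}
#endif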
4290 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4293 vmbus_chan_poll_disable(chan);
4295 vmbus_chan_poll_enable(chan, pollhz);
4299 hn_polling(struct hn_softc *sc, u_int pollhz)
4301 int nsubch = sc->hn_rx_ring_inuse - 1;
4306 struct vmbus_channel **subch;
4309 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4310 for (i = 0; i < nsubch; ++i)
4311 hn_chan_polling(subch[i], pollhz);
4312 vmbus_subchan_rel(subch, nsubch);
4314 hn_chan_polling(sc->hn_prichan, pollhz);
4318 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4320 struct hn_softc *sc = arg1;
4323 pollhz = sc->hn_pollhz;
4324 error = sysctl_handle_int(oidp, &pollhz, 0, req);
4325 if (error || req->newptr == NULL)
4329 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4333 if (sc->hn_pollhz != pollhz) {
4334 sc->hn_pollhz = pollhz;
4335 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4336 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4337 hn_polling(sc, sc->hn_pollhz);
4345 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4347 struct hn_softc *sc = arg1;
4350 snprintf(verstr, sizeof(verstr), "%u.%u",
4351 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4352 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4353 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4357 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4359 struct hn_softc *sc = arg1;
4366 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4367 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
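/*
 * The kernel snprintf(9) "%b" conversion used above decodes a bit field
 * with a format string whose first byte gives the numeric base and whose
 * remaining entries are <bit-number><name> pairs; HN_CAP_BITS is such a
 * string.  A toy example with hypothetical flag names:
 */
#if 0	/* illustration only */
static void
example_bitfmt(void)
{
	char buf[32];

	/* "\20" = print the value in hex; bit 1 is FOO, bit 2 is BAR. */
	snprintf(buf, sizeof(buf), "%b", 0x3, "\20\1FOO\2BAR");
	/* buf now holds "3<FOO,BAR>". */
}
#endif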
4371 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4373 struct hn_softc *sc = arg1;
4374 char assist_str[128];
4378 hwassist = sc->hn_ifp->if_hwassist;
4380 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4381 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4385 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4387 struct hn_softc *sc = arg1;
4388 char filter_str[128];
4392 filter = sc->hn_rx_filter;
4394 snprintf(filter_str, sizeof(filter_str), "%b", filter,
4396 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4402 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4404 struct hn_softc *sc = arg1;
4409 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4410 if (error || req->newptr == NULL)
4413 if ((sc->hn_flags & HN_FLAG_RXVF) ||
4414 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4416 /* The RSS key is synchronized w/ the VF's; don't allow users to change it. */
4423 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4426 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4428 if (sc->hn_rx_ring_inuse > 1) {
4429 error = hn_rss_reconfig(sc);
4431 /* Not RSS capable, at least for now; just save the RSS key. */
4440 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4442 struct hn_softc *sc = arg1;
4447 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4448 if (error || req->newptr == NULL)
4452 /* Don't allow RSS indirect table change, if this interface is not
4453 * RSS capable currently. */
4455 if (sc->hn_rx_ring_inuse == 1) {
4460 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4463 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4465 hn_rss_ind_fixup(sc);
4466 error = hn_rss_reconfig(sc);
4475 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4477 struct hn_softc *sc = arg1;
4482 hash = sc->hn_rss_hash;
4484 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4485 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4489 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4491 struct hn_softc *sc = arg1;
4496 hash = sc->hn_rss_hcap;
4498 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4499 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4503 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4505 struct hn_softc *sc = arg1;
4510 hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4512 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4513 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4517 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4519 struct hn_softc *sc = arg1;
4520 char vf_name[IFNAMSIZ + 1];
4521 struct ifnet *vf_ifp;
4525 vf_ifp = sc->hn_vf_ifp;
4527 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4529 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4533 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4535 struct hn_softc *sc = arg1;
4536 char vf_name[IFNAMSIZ + 1];
4537 struct ifnet *vf_ifp;
4541 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4543 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4545 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4549 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4551 struct rm_priotracker pt;
4556 error = sysctl_wire_old_buffer(req, 0);
4560 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4564 rm_rlock(&hn_vfmap_lock, &pt);
4567 for (i = 0; i < hn_vfmap_size; ++i) {
4570 if (hn_vfmap[i] == NULL)
4573 ifp = ifnet_byindex(i);
4576 sbuf_printf(sb, "%s", ifp->if_xname);
4578 sbuf_printf(sb, " %s", ifp->if_xname);
4583 rm_runlock(&hn_vfmap_lock, &pt);
4585 error = sbuf_finish(sb);
4591 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4593 struct rm_priotracker pt;
4598 error = sysctl_wire_old_buffer(req, 0);
4602 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4606 rm_rlock(&hn_vfmap_lock, &pt);
4609 for (i = 0; i < hn_vfmap_size; ++i) {
4610 struct ifnet *ifp, *hn_ifp;
4612 hn_ifp = hn_vfmap[i];
4616 ifp = ifnet_byindex(i);
4619 sbuf_printf(sb, "%s:%s", ifp->if_xname,
4622 sbuf_printf(sb, " %s:%s", ifp->if_xname,
4629 rm_runlock(&hn_vfmap_lock, &pt);
4631 error = sbuf_finish(sb);
4637 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4639 struct hn_softc *sc = arg1;
4640 int error, onoff = 0;
4642 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4644 error = sysctl_handle_int(oidp, &onoff, 0, req);
4645 if (error || req->newptr == NULL)
4649 /* NOTE: hn_vf_lock for hn_transmit() */
4650 rm_wlock(&sc->hn_vf_lock);
4652 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4654 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4655 rm_wunlock(&sc->hn_vf_lock);
4662 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4664 struct hn_softc *sc = arg1;
4667 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4669 return (sysctl_handle_int(oidp, &enabled, 0, req));
4673 hn_check_iplen(const struct mbuf *m, int hoff)
4675 const struct ip *ip;
4676 int len, iphlen, iplen;
4677 const struct tcphdr *th;
4678 int thoff; /* TCP data offset */
4680 len = hoff + sizeof(struct ip);
4682 /* The packet must be at least the size of an IP header. */
4683 if (m->m_pkthdr.len < len)
4684 return IPPROTO_DONE;
4686 /* The fixed IP header must reside completely in the first mbuf. */
4688 return IPPROTO_DONE;
4690 ip = mtodo(m, hoff);
4692 /* Bound check the packet's stated IP header length. */
4693 iphlen = ip->ip_hl << 2;
4694 if (iphlen < sizeof(struct ip)) /* minimum header length */
4695 return IPPROTO_DONE;
4697 /* The full IP header must reside completely in the one mbuf. */
4698 if (m->m_len < hoff + iphlen)
4699 return IPPROTO_DONE;
4701 iplen = ntohs(ip->ip_len);
4704 /* Check that the amount of data in the buffers is
4705 * at least as much as the IP header would have us expect. */
4707 if (m->m_pkthdr.len < hoff + iplen)
4708 return IPPROTO_DONE;
4711 /* Ignore IP fragments. */
4713 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4714 return IPPROTO_DONE;
4717 /* The TCP/IP or UDP/IP header must be entirely contained within
4718 * the first fragment of a packet. */
4722 if (iplen < iphlen + sizeof(struct tcphdr))
4723 return IPPROTO_DONE;
4724 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4725 return IPPROTO_DONE;
4726 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4727 thoff = th->th_off << 2;
4728 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4729 return IPPROTO_DONE;
4730 if (m->m_len < hoff + iphlen + thoff)
4731 return IPPROTO_DONE;
4734 if (iplen < iphlen + sizeof(struct udphdr))
4735 return IPPROTO_DONE;
4736 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4737 return IPPROTO_DONE;
4741 return IPPROTO_DONE;
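/*
 * A standalone analogue of the bounds discipline in hn_check_iplen()
 * above, applied to a flat buffer instead of an mbuf chain: never read a
 * header field before proving the bytes holding it are present.
 * Illustration only; "flat_ip_proto" is a hypothetical helper and
 * buf + hoff is assumed suitably aligned.
 */
#if 0	/* illustration only */
#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <arpa/inet.h>

static int
flat_ip_proto(const uint8_t *buf, size_t len, size_t hoff)
{
	const struct ip *ip;
	size_t iphlen;

	if (len < hoff + sizeof(struct ip))	/* fixed header present? */
		return (IPPROTO_DONE);
	ip = (const struct ip *)(buf + hoff);
	iphlen = ip->ip_hl << 2;
	if (iphlen < sizeof(struct ip) ||	/* sane stated length? */
	    len < hoff + iphlen ||		/* options present? */
	    len < hoff + ntohs(ip->ip_len))	/* full datagram present? */
		return (IPPROTO_DONE);
	if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
		return (IPPROTO_DONE);		/* ignore fragments */
	return (ip->ip_p);
}
#endif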
4748 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4750 struct sysctl_oid_list *child;
4751 struct sysctl_ctx_list *ctx;
4752 device_t dev = sc->hn_dev;
4753 #if defined(INET) || defined(INET6)
4754 #if __FreeBSD_version >= 1100095
4761 /* Create RXBUF for reception.
4764 * - It is shared by all channels.
4765 * - A large enough buffer is allocated; certain versions of the NVS
4766 * may further limit the usable space. */
4768 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4769 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4770 BUS_DMA_WAITOK | BUS_DMA_ZERO);
4771 if (sc->hn_rxbuf == NULL) {
4772 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4776 sc->hn_rx_ring_cnt = ring_cnt;
4777 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4779 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4780 M_DEVBUF, M_WAITOK | M_ZERO);
4782 #if defined(INET) || defined(INET6)
4783 #if __FreeBSD_version >= 1100095
4784 lroent_cnt = hn_lro_entry_count;
4785 if (lroent_cnt < TCP_LRO_ENTRIES)
4786 lroent_cnt = TCP_LRO_ENTRIES;
4788 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4790 #endif /* INET || INET6 */
4792 ctx = device_get_sysctl_ctx(dev);
4793 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4795 /* Create dev.hn.UNIT.rx sysctl tree */
4796 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4797 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4799 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4800 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4802 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4803 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4804 &rxr->hn_br_dma, BUS_DMA_WAITOK);
4805 if (rxr->hn_br == NULL) {
4806 device_printf(dev, "allocate bufring failed\n");
4810 if (hn_trust_hosttcp)
4811 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4812 if (hn_trust_hostudp)
4813 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4814 if (hn_trust_hostip)
4815 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4816 rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4817 rxr->hn_ifp = sc->hn_ifp;
4818 if (i < sc->hn_tx_ring_cnt)
4819 rxr->hn_txr = &sc->hn_tx_ring[i];
4820 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4821 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4823 rxr->hn_rxbuf = sc->hn_rxbuf;
4828 #if defined(INET) || defined(INET6)
4829 #if __FreeBSD_version >= 1100095
4830 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4831 hn_lro_mbufq_depth);
4833 tcp_lro_init(&rxr->hn_lro);
4834 rxr->hn_lro.ifp = sc->hn_ifp;
4836 #if __FreeBSD_version >= 1100099
4837 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4838 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4840 #endif /* INET || INET6 */
4842 if (sc->hn_rx_sysctl_tree != NULL) {
4846 * Create per RX ring sysctl tree:
4847 * dev.hn.UNIT.rx.RINGID
4849 snprintf(name, sizeof(name), "%d", i);
4850 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
4851 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
4852 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4854 if (rxr->hn_rx_sysctl_tree != NULL) {
4855 SYSCTL_ADD_ULONG(ctx,
4856 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4857 OID_AUTO, "packets", CTLFLAG_RW,
4858 &rxr->hn_pkts, "# of packets received");
4859 SYSCTL_ADD_ULONG(ctx,
4860 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4861 OID_AUTO, "rss_pkts", CTLFLAG_RW,
4863 "# of packets w/ RSS info received");
4865 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4866 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
4867 &rxr->hn_pktbuf_len, 0,
4868 "Temporary channel packet buffer length");
4873 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
4874 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4875 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
4876 #if __FreeBSD_version < 1100095
4877 hn_rx_stat_int_sysctl,
4879 hn_rx_stat_u64_sysctl,
4881 "LU", "LRO queued");
4882 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
4883 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4884 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
4885 #if __FreeBSD_version < 1100095
4886 hn_rx_stat_int_sysctl,
4888 hn_rx_stat_u64_sysctl,
4890 "LU", "LRO flushed");
4891 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
4892 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4893 __offsetof(struct hn_rx_ring, hn_lro_tried),
4894 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
4895 #if __FreeBSD_version >= 1100099
4896 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
4897 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4898 hn_lro_lenlim_sysctl, "IU",
4899 "Max # of data bytes to be aggregated by LRO");
4900 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
4901 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4902 hn_lro_ackcnt_sysctl, "I",
4903 "Max # of ACKs to be aggregated by LRO");
4905 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
4906 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
4907 hn_trust_hcsum_sysctl, "I",
4908	    "Trust tcp segment verification on host side, "
4909 "when csum info is missing");
4910 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
4911 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
4912 hn_trust_hcsum_sysctl, "I",
4913 "Trust udp datagram verification on host side, "
4914 "when csum info is missing");
4915 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
4916 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
4917 hn_trust_hcsum_sysctl, "I",
4918 "Trust ip packet verification on host side, "
4919 "when csum info is missing");
4920 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
4921 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4922 __offsetof(struct hn_rx_ring, hn_csum_ip),
4923 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
4924 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
4925 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4926 __offsetof(struct hn_rx_ring, hn_csum_tcp),
4927 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
4928 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
4929 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4930 __offsetof(struct hn_rx_ring, hn_csum_udp),
4931 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
4932 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
4933 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4934 __offsetof(struct hn_rx_ring, hn_csum_trusted),
4935 hn_rx_stat_ulong_sysctl, "LU",
4936	    "# of packets whose csum verification is trusted to the host");
4937 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
4938 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4939 __offsetof(struct hn_rx_ring, hn_small_pkts),
4940 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
4941 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
4942 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4943 __offsetof(struct hn_rx_ring, hn_ack_failed),
4944 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
4945 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
4946 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
4947 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
4948 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
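/*
 * Illustrative userland sketch: reading one of the sysctls created
 * above, e.g. dev.hn.0.rx_ring_inuse.  The unit number "0" is an
 * assumption; this is a standalone program, not driver code.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int inuse;
	size_t len = sizeof(inuse);

	if (sysctlbyname("dev.hn.0.rx_ring_inuse", &inuse, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("RX rings in use: %d\n", inuse);
	return (0);
}
#endif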
4954 hn_destroy_rx_data(struct hn_softc *sc)
4958 if (sc->hn_rxbuf != NULL) {
4959 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
4960 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
4962 device_printf(sc->hn_dev, "RXBUF is referenced\n");
4963 sc->hn_rxbuf = NULL;
4966 if (sc->hn_rx_ring_cnt == 0)
4969 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4970 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4972 if (rxr->hn_br == NULL)
4974 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
4975 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
4977 device_printf(sc->hn_dev,
4978	    "%dth channel bufring is referenced\n", i);
4982 #if defined(INET) || defined(INET6)
4983 tcp_lro_free(&rxr->hn_lro);
4985 free(rxr->hn_pktbuf, M_DEVBUF);
4987 free(sc->hn_rx_ring, M_DEVBUF);
4988 sc->hn_rx_ring = NULL;
4990 sc->hn_rx_ring_cnt = 0;
4991 sc->hn_rx_ring_inuse = 0;
4995 hn_tx_ring_create(struct hn_softc *sc, int id)
4997 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
4998 device_t dev = sc->hn_dev;
4999 bus_dma_tag_t parent_dtag;
5003 txr->hn_tx_idx = id;
5005 #ifndef HN_USE_TXDESC_BUFRING
5006 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5008 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5010 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5011 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5012 M_DEVBUF, M_WAITOK | M_ZERO);
5013 #ifndef HN_USE_TXDESC_BUFRING
5014 SLIST_INIT(&txr->hn_txlist);
5016 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5017 M_WAITOK, &txr->hn_tx_lock);
5020 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5021 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5022 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5024 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5027 #ifdef HN_IFSTART_SUPPORT
5028 if (hn_use_if_start) {
5029 txr->hn_txeof = hn_start_txeof;
5030 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5031 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5037 txr->hn_txeof = hn_xmit_txeof;
5038 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5039 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5041 br_depth = hn_get_txswq_depth(txr);
5042 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5043 M_WAITOK, &txr->hn_tx_lock);
5046 txr->hn_direct_tx_size = hn_direct_tx_size;
5049	 * Always schedule transmission instead of trying to do direct
5050	 * transmission; this gives the best performance so far.
5052 txr->hn_sched_tx = 1;
5054 parent_dtag = bus_get_dma_tag(dev);
5056 /* DMA tag for RNDIS packet messages. */
5057 error = bus_dma_tag_create(parent_dtag, /* parent */
5058 HN_RNDIS_PKT_ALIGN, /* alignment */
5059 HN_RNDIS_PKT_BOUNDARY, /* boundary */
5060 BUS_SPACE_MAXADDR, /* lowaddr */
5061 BUS_SPACE_MAXADDR, /* highaddr */
5062 NULL, NULL, /* filter, filterarg */
5063 HN_RNDIS_PKT_LEN, /* maxsize */
5065 HN_RNDIS_PKT_LEN, /* maxsegsize */
5067 NULL, /* lockfunc */
5068 NULL, /* lockfuncarg */
5069 &txr->hn_tx_rndis_dtag);
5071 device_printf(dev, "failed to create rndis dmatag\n");
5075 /* DMA tag for data. */
5076 error = bus_dma_tag_create(parent_dtag, /* parent */
5078 HN_TX_DATA_BOUNDARY, /* boundary */
5079 BUS_SPACE_MAXADDR, /* lowaddr */
5080 BUS_SPACE_MAXADDR, /* highaddr */
5081 NULL, NULL, /* filter, filterarg */
5082 HN_TX_DATA_MAXSIZE, /* maxsize */
5083 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
5084 HN_TX_DATA_SEGSIZE, /* maxsegsize */
5086 NULL, /* lockfunc */
5087 NULL, /* lockfuncarg */
5088 &txr->hn_tx_data_dtag);
5090 device_printf(dev, "failed to create data dmatag\n");
5094 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5095 struct hn_txdesc *txd = &txr->hn_txdesc[i];
5098 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5099 STAILQ_INIT(&txd->agg_list);
5102 * Allocate and load RNDIS packet message.
5104 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5105 (void **)&txd->rndis_pkt,
5106 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5107 &txd->rndis_pkt_dmap);
5110 "failed to allocate rndis_packet_msg, %d\n", i);
5114 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5115 txd->rndis_pkt_dmap,
5116 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5117 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5121 "failed to load rndis_packet_msg, %d\n", i);
5122 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5123 txd->rndis_pkt, txd->rndis_pkt_dmap);
5127 /* DMA map for TX data. */
5128 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5132 "failed to allocate tx data dmamap\n");
5133 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5134 txd->rndis_pkt_dmap);
5135 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5136 txd->rndis_pkt, txd->rndis_pkt_dmap);
5140		/* All set; put it on the list. */
5141 txd->flags |= HN_TXD_FLAG_ONLIST;
5142 #ifndef HN_USE_TXDESC_BUFRING
5143 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5145 buf_ring_enqueue(txr->hn_txdesc_br, txd);
5148 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5150 if (sc->hn_tx_sysctl_tree != NULL) {
5151 struct sysctl_oid_list *child;
5152 struct sysctl_ctx_list *ctx;
5156 * Create per TX ring sysctl tree:
5157 * dev.hn.UNIT.tx.RINGID
5159 ctx = device_get_sysctl_ctx(dev);
5160 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5162 snprintf(name, sizeof(name), "%d", id);
5163 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5164 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5166 if (txr->hn_tx_sysctl_tree != NULL) {
5167 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5170 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5171 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5172 "# of available TX descs");
5174 #ifdef HN_IFSTART_SUPPORT
5175 if (!hn_use_if_start)
5178 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5179 CTLFLAG_RD, &txr->hn_oactive, 0,
5182 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5183 CTLFLAG_RW, &txr->hn_pkts,
5184 "# of packets transmitted");
5185 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5186 CTLFLAG_RW, &txr->hn_sends, "# of sends");
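/*
 * Illustrative sketch, not compiled: the two txdesc freelist flavors
 * set up in hn_tx_ring_create().  With HN_USE_TXDESC_BUFRING the
 * descriptors live on a lockless buf_ring; otherwise on an SLIST
 * guarded by a spin mutex.  The helper name is hypothetical.
 */
#if 0
static struct hn_txdesc *
example_txdesc_get(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

#ifdef HN_USE_TXDESC_BUFRING
	/* Multi-consumer dequeue; no external lock needed. */
	txd = buf_ring_dequeue_mc(txr->hn_txdesc_br);
#else
	mtx_lock_spin(&txr->hn_txlist_spin);
	txd = SLIST_FIRST(&txr->hn_txlist);
	if (txd != NULL)
		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#endif
	return (txd);
}
#endif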
5194 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5196 struct hn_tx_ring *txr = txd->txr;
5198 KASSERT(txd->m == NULL, ("still has mbuf installed"));
5199 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5201 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5202 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5203 txd->rndis_pkt_dmap);
5204 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5208 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5211 KASSERT(txd->refs == 0 || txd->refs == 1,
5212 ("invalid txd refs %d", txd->refs));
5214 /* Aggregated txds will be freed by their aggregating txd. */
5215 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5218 freed = hn_txdesc_put(txr, txd);
5219 KASSERT(freed, ("can't free txdesc"));
5224 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5228 if (txr->hn_txdesc == NULL)
5233	 * Because the freeing of aggregated txds will be deferred
5234	 * to the aggregating txd, two passes are used here:
5235	 * - The first pass GCes any pending txds; this GC is necessary,
5236	 *   since if the channels are revoked, the hypervisor will not
5237	 *   deliver send-done for all pending txds.
5238	 * - The second pass frees the busdma resources of the GCed txds.
5241 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5242 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5243 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5244 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5246 if (txr->hn_tx_data_dtag != NULL)
5247 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5248 if (txr->hn_tx_rndis_dtag != NULL)
5249 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5251 #ifdef HN_USE_TXDESC_BUFRING
5252 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5255 free(txr->hn_txdesc, M_DEVBUF);
5256 txr->hn_txdesc = NULL;
5258 if (txr->hn_mbuf_br != NULL)
5259 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5261 #ifndef HN_USE_TXDESC_BUFRING
5262 mtx_destroy(&txr->hn_txlist_spin);
5264 mtx_destroy(&txr->hn_tx_lock);
5268 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5270 struct sysctl_oid_list *child;
5271 struct sysctl_ctx_list *ctx;
5275 * Create TXBUF for chimney sending.
5277 * NOTE: It is shared by all channels.
5279 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5280 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5281 BUS_DMA_WAITOK | BUS_DMA_ZERO);
5282 if (sc->hn_chim == NULL) {
5283 device_printf(sc->hn_dev, "allocate txbuf failed\n");
5287 sc->hn_tx_ring_cnt = ring_cnt;
5288 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5290 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5291 M_DEVBUF, M_WAITOK | M_ZERO);
5293 ctx = device_get_sysctl_ctx(sc->hn_dev);
5294 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5296 /* Create dev.hn.UNIT.tx sysctl tree */
5297 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5298 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5300 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5303 error = hn_tx_ring_create(sc, i);
5308 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5309 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5310 __offsetof(struct hn_tx_ring, hn_no_txdescs),
5311 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5312 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5313 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5314 __offsetof(struct hn_tx_ring, hn_send_failed),
5315	    hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
5316 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5317 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5318 __offsetof(struct hn_tx_ring, hn_txdma_failed),
5319	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
5320 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5321 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5322 __offsetof(struct hn_tx_ring, hn_flush_failed),
5323 hn_tx_stat_ulong_sysctl, "LU",
5324	    "# of packet transmission aggregation flush failures");
5325 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5326 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5327 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5328	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbufs collapsed");
5329 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5330 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5331 __offsetof(struct hn_tx_ring, hn_tx_chimney),
5332	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney sends");
5333 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5334 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5335 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5336 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5337 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5338 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5339 "# of total TX descs");
5340 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5341 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5342 "Chimney send packet size upper boundary");
5343 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5344 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5345 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5346 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5347 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5348 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5349 hn_tx_conf_int_sysctl, "I",
5350 "Size of the packet for direct transmission");
5351 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5352 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5353 __offsetof(struct hn_tx_ring, hn_sched_tx),
5354 hn_tx_conf_int_sysctl, "I",
5355 "Always schedule transmission "
5356 "instead of doing direct transmission");
5357 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5358 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5359 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5360 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5361 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5362 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5363 "Applied packet transmission aggregation size");
5364 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5365 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5366 hn_txagg_pktmax_sysctl, "I",
5367 "Applied packet transmission aggregation packets");
5368 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5369 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5370 hn_txagg_align_sysctl, "I",
5371 "Applied packet transmission aggregation alignment");
5377 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5381 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5382 sc->hn_tx_ring[i].hn_chim_size = chim_size;
5386 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5388 struct ifnet *ifp = sc->hn_ifp;
5394 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5397 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5398 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5399 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5401 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5402 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5403 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5405 if (tso_maxlen < tso_minlen)
5406 tso_maxlen = tso_minlen;
5407 else if (tso_maxlen > IP_MAXPACKET)
5408 tso_maxlen = IP_MAXPACKET;
5409 if (tso_maxlen > sc->hn_ndis_tso_szmax)
5410 tso_maxlen = sc->hn_ndis_tso_szmax;
5411 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5413 if (hn_xpnt_vf_isready(sc)) {
5414 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5415 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5417 ifp->if_hw_tsomax = hw_tsomax;
5419 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
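/*
 * Worked example of the clamping above, with assumed values: for
 * mtu = 1500 and hn_ndis_tso_sgmin = 2, tso_minlen is 3000; a
 * requested tso_maxlen of 70000 is clamped to IP_MAXPACKET (65535),
 * then to hn_ndis_tso_szmax, and finally ETHER_HDR_LEN +
 * ETHER_VLAN_ENCAP_LEN (18 bytes) is subtracted before the result is
 * programmed into if_hw_tsomax.
 */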
5423 hn_fixup_tx_data(struct hn_softc *sc)
5425 uint64_t csum_assist;
5428 hn_set_chim_size(sc, sc->hn_chim_szmax);
5429 if (hn_tx_chimney_size > 0 &&
5430 hn_tx_chimney_size < sc->hn_chim_szmax)
5431 hn_set_chim_size(sc, hn_tx_chimney_size);
5434 if (sc->hn_caps & HN_CAP_IPCS)
5435 csum_assist |= CSUM_IP;
5436 if (sc->hn_caps & HN_CAP_TCP4CS)
5437 csum_assist |= CSUM_IP_TCP;
5438 if (sc->hn_caps & HN_CAP_UDP4CS)
5439 csum_assist |= CSUM_IP_UDP;
5440 if (sc->hn_caps & HN_CAP_TCP6CS)
5441 csum_assist |= CSUM_IP6_TCP;
5442 if (sc->hn_caps & HN_CAP_UDP6CS)
5443 csum_assist |= CSUM_IP6_UDP;
5444 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5445 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5447 if (sc->hn_caps & HN_CAP_HASHVAL) {
5449 * Support HASHVAL pktinfo on TX path.
5452 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5453 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5454 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5459 hn_destroy_tx_data(struct hn_softc *sc)
5463 if (sc->hn_chim != NULL) {
5464 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5465 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5467 device_printf(sc->hn_dev,
5468		    "chimney sending buffer is referenced\n");
5473 if (sc->hn_tx_ring_cnt == 0)
5476 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5477 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5479 free(sc->hn_tx_ring, M_DEVBUF);
5480 sc->hn_tx_ring = NULL;
5482 sc->hn_tx_ring_cnt = 0;
5483 sc->hn_tx_ring_inuse = 0;
5486 #ifdef HN_IFSTART_SUPPORT
5489 hn_start_taskfunc(void *xtxr, int pending __unused)
5491 struct hn_tx_ring *txr = xtxr;
5493 mtx_lock(&txr->hn_tx_lock);
5494 hn_start_locked(txr, 0);
5495 mtx_unlock(&txr->hn_tx_lock);
5499 hn_start_locked(struct hn_tx_ring *txr, int len)
5501 struct hn_softc *sc = txr->hn_sc;
5502 struct ifnet *ifp = sc->hn_ifp;
5505 KASSERT(hn_use_if_start,
5506	    ("hn_start_locked is called when if_start is disabled"));
5507 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5508 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5509 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5511 if (__predict_false(txr->hn_suspended))
5514 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5518 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5519 struct hn_txdesc *txd;
5520 struct mbuf *m_head;
5523 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5527 if (len > 0 && m_head->m_pkthdr.len > len) {
5529			 * This send could be time consuming; let callers
5530			 * dispatch this packet (and any following packets)
5531			 * to the tx taskqueue.
5533 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5538 #if defined(INET6) || defined(INET)
5539 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5540 m_head = hn_tso_fixup(m_head);
5541 if (__predict_false(m_head == NULL)) {
5542 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5548 txd = hn_txdesc_get(txr);
5550 txr->hn_no_txdescs++;
5551 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5552 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5556 error = hn_encap(ifp, txr, txd, &m_head);
5558 /* Both txd and m_head are freed */
5559 KASSERT(txr->hn_agg_txd == NULL,
5560 ("encap failed w/ pending aggregating txdesc"));
5564 if (txr->hn_agg_pktleft == 0) {
5565 if (txr->hn_agg_txd != NULL) {
5566 KASSERT(m_head == NULL,
5567 ("pending mbuf for aggregating txdesc"));
5568 error = hn_flush_txagg(ifp, txr);
5569 if (__predict_false(error)) {
5570 atomic_set_int(&ifp->if_drv_flags,
5575 KASSERT(m_head != NULL, ("mbuf was freed"));
5576 error = hn_txpkt(ifp, txr, txd);
5577 if (__predict_false(error)) {
5578 /* txd is freed, but m_head is not */
5579 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5580 atomic_set_int(&ifp->if_drv_flags,
5588 KASSERT(txr->hn_agg_txd != NULL,
5589 ("no aggregating txdesc"));
5590 KASSERT(m_head == NULL,
5591 ("pending mbuf for aggregating txdesc"));
5596	/* Flush pending aggregated transmission. */
5597 if (txr->hn_agg_txd != NULL)
5598 hn_flush_txagg(ifp, txr);
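/*
 * Usage note for the len parameter above: hn_start_locked(txr, 0)
 * drains the whole if_snd queue, while a positive len stops at (and
 * requeues) the first packet longer than len, returning nonzero so
 * the caller can defer the expensive send to the TX taskqueue; see
 * hn_start() below.
 */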
5603 hn_start(struct ifnet *ifp)
5605 struct hn_softc *sc = ifp->if_softc;
5606 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5608 if (txr->hn_sched_tx)
5611 if (mtx_trylock(&txr->hn_tx_lock)) {
5614 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5615 mtx_unlock(&txr->hn_tx_lock);
5620 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5624 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5626 struct hn_tx_ring *txr = xtxr;
5628 mtx_lock(&txr->hn_tx_lock);
5629 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5630 hn_start_locked(txr, 0);
5631 mtx_unlock(&txr->hn_tx_lock);
5635 hn_start_txeof(struct hn_tx_ring *txr)
5637 struct hn_softc *sc = txr->hn_sc;
5638 struct ifnet *ifp = sc->hn_ifp;
5640 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5642 if (txr->hn_sched_tx)
5645 if (mtx_trylock(&txr->hn_tx_lock)) {
5648 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5649 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5650 mtx_unlock(&txr->hn_tx_lock);
5652 taskqueue_enqueue(txr->hn_tx_taskq,
5658		 * Release OACTIVE earlier, in the hope that
5659		 * others can catch up.  The task will clear the
5660		 * flag again, while holding hn_tx_lock, to avoid possible races.
5663 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5664 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5668 #endif /* HN_IFSTART_SUPPORT */
5671 hn_xmit(struct hn_tx_ring *txr, int len)
5673 struct hn_softc *sc = txr->hn_sc;
5674 struct ifnet *ifp = sc->hn_ifp;
5675 struct mbuf *m_head;
5678 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5679 #ifdef HN_IFSTART_SUPPORT
5680 KASSERT(hn_use_if_start == 0,
5681	    ("hn_xmit is called when if_start is enabled"));
5683 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5685 if (__predict_false(txr->hn_suspended))
5688 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5691 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5692 struct hn_txdesc *txd;
5695 if (len > 0 && m_head->m_pkthdr.len > len) {
5697			 * This send could be time consuming; let callers
5698			 * dispatch this packet (and any following packets)
5699			 * to the tx taskqueue.
5701 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5706 txd = hn_txdesc_get(txr);
5708 txr->hn_no_txdescs++;
5709 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5710 txr->hn_oactive = 1;
5714 error = hn_encap(ifp, txr, txd, &m_head);
5716 /* Both txd and m_head are freed; discard */
5717 KASSERT(txr->hn_agg_txd == NULL,
5718 ("encap failed w/ pending aggregating txdesc"));
5719 drbr_advance(ifp, txr->hn_mbuf_br);
5723 if (txr->hn_agg_pktleft == 0) {
5724 if (txr->hn_agg_txd != NULL) {
5725 KASSERT(m_head == NULL,
5726 ("pending mbuf for aggregating txdesc"));
5727 error = hn_flush_txagg(ifp, txr);
5728 if (__predict_false(error)) {
5729 txr->hn_oactive = 1;
5733 KASSERT(m_head != NULL, ("mbuf was freed"));
5734 error = hn_txpkt(ifp, txr, txd);
5735 if (__predict_false(error)) {
5736 /* txd is freed, but m_head is not */
5737 drbr_putback(ifp, txr->hn_mbuf_br,
5739 txr->hn_oactive = 1;
5746 KASSERT(txr->hn_agg_txd != NULL,
5747 ("no aggregating txdesc"));
5748 KASSERT(m_head == NULL,
5749 ("pending mbuf for aggregating txdesc"));
5754 drbr_advance(ifp, txr->hn_mbuf_br);
5757	/* Flush pending aggregated transmission. */
5758 if (txr->hn_agg_txd != NULL)
5759 hn_flush_txagg(ifp, txr);
5764 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5766 struct hn_softc *sc = ifp->if_softc;
5767 struct hn_tx_ring *txr;
5770 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5771 struct rm_priotracker pt;
5773 rm_rlock(&sc->hn_vf_lock, &pt);
5774 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5775 struct mbuf *m_bpf = NULL;
5778 obytes = m->m_pkthdr.len;
5779 if (m->m_flags & M_MCAST)
5782 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5783 if (bpf_peers_present(ifp->if_bpf)) {
5784 m_bpf = m_copypacket(m, M_NOWAIT);
5785 if (m_bpf == NULL) {
5787					 * Failed to grab a shallow copy; tap now.
5790 ETHER_BPF_MTAP(ifp, m);
5794 ETHER_BPF_MTAP(ifp, m);
5797 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5798 rm_runlock(&sc->hn_vf_lock, &pt);
5800 if (m_bpf != NULL) {
5802 ETHER_BPF_MTAP(ifp, m_bpf);
5806 if (error == ENOBUFS) {
5807 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5809 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5811 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5812 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5814 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5820 rm_runlock(&sc->hn_vf_lock, &pt);
5823 #if defined(INET6) || defined(INET)
5825 * Perform TSO packet header fixup now, since the TSO
5826 * packet header should be cache-hot.
5828 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5829 m = hn_tso_fixup(m);
5830 if (__predict_false(m == NULL)) {
5831 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5838 * Select the TX ring based on flowid
5840 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
5844 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
5846 idx = bid % sc->hn_tx_ring_inuse;
5850 #if defined(INET6) || defined(INET)
5853 if (m->m_pkthdr.len < 128 &&
5854 (m->m_pkthdr.csum_flags &
5855 (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
5856 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
5857 m = hn_check_tcpsyn(m, &tcpsyn);
5858 if (__predict_false(m == NULL)) {
5860 IFCOUNTER_OERRORS, 1);
5865 const int tcpsyn = 0;
5870 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
5873 txr = &sc->hn_tx_ring[idx];
5875 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
5877 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5881 if (txr->hn_oactive)
5884 if (txr->hn_sched_tx)
5887 if (mtx_trylock(&txr->hn_tx_lock)) {
5890 sched = hn_xmit(txr, txr->hn_direct_tx_size);
5891 mtx_unlock(&txr->hn_tx_lock);
5896 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
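/*
 * Illustrative sketch, not compiled: the TX ring selection performed
 * above.  With a hash-classified mbuf the kernel RSS bucket is
 * preferred (when "options RSS" is present); otherwise the raw flowid
 * is used, modulo the rings in use.  The helper name is hypothetical.
 */
#if 0
static int
example_select_txr(const struct mbuf *m, u_int nring)
{
#if defined(RSS)
	uint32_t bid;

	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE &&
	    rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
	    &bid) == 0)
		return (bid % nring);
#endif
	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
		return (m->m_pkthdr.flowid % nring);
	return (0);	/* unclassified packets go to ring 0 */
}
#endif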
5901 hn_tx_ring_qflush(struct hn_tx_ring *txr)
5905 mtx_lock(&txr->hn_tx_lock);
5906 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
5908 mtx_unlock(&txr->hn_tx_lock);
5912 hn_xmit_qflush(struct ifnet *ifp)
5914 struct hn_softc *sc = ifp->if_softc;
5915 struct rm_priotracker pt;
5918 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
5919 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5922 rm_rlock(&sc->hn_vf_lock, &pt);
5923 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
5924 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
5925 rm_runlock(&sc->hn_vf_lock, &pt);
5929 hn_xmit_txeof(struct hn_tx_ring *txr)
5932 if (txr->hn_sched_tx)
5935 if (mtx_trylock(&txr->hn_tx_lock)) {
5938 txr->hn_oactive = 0;
5939 sched = hn_xmit(txr, txr->hn_direct_tx_size);
5940 mtx_unlock(&txr->hn_tx_lock);
5942 taskqueue_enqueue(txr->hn_tx_taskq,
5948		 * Release oactive earlier, in the hope that
5949		 * others can catch up.  The task will clear
5950		 * oactive again, while holding hn_tx_lock, to avoid possible races.
5953 txr->hn_oactive = 0;
5954 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5959 hn_xmit_taskfunc(void *xtxr, int pending __unused)
5961 struct hn_tx_ring *txr = xtxr;
5963 mtx_lock(&txr->hn_tx_lock);
5965 mtx_unlock(&txr->hn_tx_lock);
5969 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
5971 struct hn_tx_ring *txr = xtxr;
5973 mtx_lock(&txr->hn_tx_lock);
5974 txr->hn_oactive = 0;
5976 mtx_unlock(&txr->hn_tx_lock);
5980 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
5982 struct vmbus_chan_br cbr;
5983 struct hn_rx_ring *rxr;
5984 struct hn_tx_ring *txr = NULL;
5987 idx = vmbus_chan_subidx(chan);
5990 * Link this channel to RX/TX ring.
5992 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
5993	    ("invalid channel index %d, should be >= 0 && < %d",
5994 idx, sc->hn_rx_ring_inuse));
5995 rxr = &sc->hn_rx_ring[idx];
5996 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
5997 ("RX ring %d already attached", idx));
5998 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
5999 rxr->hn_chan = chan;
6002 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6003 idx, vmbus_chan_id(chan));
6006 if (idx < sc->hn_tx_ring_inuse) {
6007 txr = &sc->hn_tx_ring[idx];
6008 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6009 ("TX ring %d already attached", idx));
6010 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6012 txr->hn_chan = chan;
6014 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6015 idx, vmbus_chan_id(chan));
6019 /* Bind this channel to a proper CPU. */
6020 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6025 cbr.cbr = rxr->hn_br;
6026 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6027 cbr.cbr_txsz = HN_TXBR_SIZE;
6028 cbr.cbr_rxsz = HN_RXBR_SIZE;
6029 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6031 if (error == EISCONN) {
6032 if_printf(sc->hn_ifp, "bufring is connected after "
6033 "chan%u open failure\n", vmbus_chan_id(chan));
6034 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6036 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6037 vmbus_chan_id(chan), error);
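/*
 * Layout note for the bufring opened above (an assumption from the
 * cbr_txsz/cbr_rxsz split, not spelled out here): each channel's
 * hn_br is one DMA allocation of HN_TXBR_SIZE + HN_RXBR_SIZE bytes
 * (see hn_create_rx_data()), with the TX bufring occupying the first
 * HN_TXBR_SIZE bytes and the RX bufring the remainder.
 */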
6044 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6046 struct hn_rx_ring *rxr;
6049 idx = vmbus_chan_subidx(chan);
6052 * Link this channel to RX/TX ring.
6054 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6055	    ("invalid channel index %d, should be >= 0 && < %d",
6056 idx, sc->hn_rx_ring_inuse));
6057 rxr = &sc->hn_rx_ring[idx];
6058 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6059 ("RX ring %d is not attached", idx));
6060 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6062 if (idx < sc->hn_tx_ring_inuse) {
6063 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6065 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6066		    ("TX ring %d is not attached", idx));
6067 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6071 * Close this channel.
6074 * Channel closing does _not_ destroy the target channel.
6076 error = vmbus_chan_close_direct(chan);
6077 if (error == EISCONN) {
6078 if_printf(sc->hn_ifp, "chan%u bufring is connected "
6079 "after being closed\n", vmbus_chan_id(chan));
6080 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6082 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6083 vmbus_chan_id(chan), error);
6088 hn_attach_subchans(struct hn_softc *sc)
6090 struct vmbus_channel **subchans;
6091 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6094 KASSERT(subchan_cnt > 0, ("no sub-channels"));
6096 /* Attach the sub-channels. */
6097 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6098 for (i = 0; i < subchan_cnt; ++i) {
6101 error1 = hn_chan_attach(sc, subchans[i]);
6104 /* Move on; all channels will be detached later. */
6107 vmbus_subchan_rel(subchans, subchan_cnt);
6110 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6113 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6121 hn_detach_allchans(struct hn_softc *sc)
6123 struct vmbus_channel **subchans;
6124 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6127 if (subchan_cnt == 0)
6130 /* Detach the sub-channels. */
6131 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6132 for (i = 0; i < subchan_cnt; ++i)
6133 hn_chan_detach(sc, subchans[i]);
6134 vmbus_subchan_rel(subchans, subchan_cnt);
6138 * Detach the primary channel, _after_ all sub-channels
6141 hn_chan_detach(sc, sc->hn_prichan);
6143 /* Wait for sub-channels to be destroyed, if any. */
6144 vmbus_subchan_drain(sc->hn_prichan);
6147 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6148 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6149 HN_RX_FLAG_ATTACHED) == 0,
6150 ("%dth RX ring is still attached", i));
6152 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6153 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6154 HN_TX_FLAG_ATTACHED) == 0,
6155 ("%dth TX ring is still attached", i));
6161 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6163 struct vmbus_channel **subchans;
6164 int nchan, rxr_cnt, error;
6166 nchan = *nsubch + 1;
6169 * Multiple RX/TX rings are not requested.
6176 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6179 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6181 /* No RSS; this is benign. */
6186 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6190 if (nchan > rxr_cnt)
6193 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6199 * Allocate sub-channels from NVS.
6201 *nsubch = nchan - 1;
6202 error = hn_nvs_alloc_subchans(sc, nsubch);
6203 if (error || *nsubch == 0) {
6204 /* Failed to allocate sub-channels. */
6210 * Wait for all sub-channels to become ready before moving on.
6212 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6213 vmbus_subchan_rel(subchans, *nsubch);
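/*
 * Illustrative call flow, with assumed numbers: to run 8 rings the
 * caller passes in *nsubch = 7; NVS may grant fewer, so on return
 * *nsubch + 1 is the channel count actually usable, which the caller
 * then applies through hn_set_ring_inuse().
 */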
6218 hn_synth_attachable(const struct hn_softc *sc)
6222 if (sc->hn_flags & HN_FLAG_ERRORS)
6225 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6226 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6228 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6235 * Make sure that the RX filter is zero after the successful
6236 * RNDIS initialization.
6239 * Under certain conditions on certain versions of Hyper-V,
6240 * the RNDIS rxfilter is _not_ zero on the hypervisor side
6241 * after the successful RNDIS initialization, which breaks
6242 * the assumption of any following code (well, it breaks the
6243 * RNDIS API contract actually). Clear the RNDIS rxfilter
6244 * explicitly, drain packets sneaking through, and drain the
6245 * interrupt taskqueues scheduled due to the stealth packets.
6248 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6252 hn_drain_rxtx(sc, nchan);
6256 hn_synth_attach(struct hn_softc *sc, int mtu)
6258 #define ATTACHED_NVS 0x0002
6259 #define ATTACHED_RNDIS 0x0004
6261 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6262 int error, nsubch, nchan = 1, i, rndis_inited;
6263 uint32_t old_caps, attached = 0;
6265 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6266 ("synthetic parts were attached"));
6268 if (!hn_synth_attachable(sc))
6271 /* Save capabilities for later verification. */
6272 old_caps = sc->hn_caps;
6275 /* Clear RSS stuffs. */
6276 sc->hn_rss_ind_size = 0;
6277 sc->hn_rss_hash = 0;
6278 sc->hn_rss_hcap = 0;
6281 * Attach the primary channel _before_ attaching NVS and RNDIS.
6283 error = hn_chan_attach(sc, sc->hn_prichan);
6290 error = hn_nvs_attach(sc, mtu);
6293 attached |= ATTACHED_NVS;
6296 * Attach RNDIS _after_ NVS is attached.
6298 error = hn_rndis_attach(sc, mtu, &rndis_inited);
6300 attached |= ATTACHED_RNDIS;
6305 * Make sure capabilities are not changed.
6307 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6308 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6309 old_caps, sc->hn_caps);
6315 * Allocate sub-channels for multi-TX/RX rings.
6318 * The # of RX rings that can be used is equivalent to the # of
6319 * channels to be requested.
6321 nsubch = sc->hn_rx_ring_cnt - 1;
6322 error = hn_synth_alloc_subchans(sc, &nsubch);
6325 /* NOTE: _Full_ synthetic parts detach is required now. */
6326 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6329 * Set the # of TX/RX rings that could be used according to
6330 * the # of channels that NVS offered.
6333 hn_set_ring_inuse(sc, nchan);
6335 /* Only the primary channel can be used; done */
6340 * Attach the sub-channels.
6342 * NOTE: hn_set_ring_inuse() _must_ have been called.
6344 error = hn_attach_subchans(sc);
6349	 * Configure RSS key and indirect table _after_ all sub-channels are attached.
6352 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6354 * RSS key is not set yet; set it to the default RSS key.
6357 if_printf(sc->hn_ifp, "setup default RSS key\n");
6359 rss_getkey(rss->rss_key);
6361 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6363 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6366 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6368		 * RSS indirect table is not set yet; set it up in round-robin fashion.
6372 if_printf(sc->hn_ifp, "setup default RSS indirect "
6375 for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
6379 subidx = rss_get_indirection_to_bucket(i);
6383 rss->rss_ind[i] = subidx % nchan;
6385 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6388		 * The # of usable channels may have changed, so we have to
6389		 * make sure that all entries in the RSS indirect table are valid.
6392		 * NOTE: hn_set_ring_inuse() _must_ have been called.
6394 hn_rss_ind_fixup(sc);
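/*
 * Example of the default indirect table, with assumed nchan = 4 and
 * no kernel RSS: rss_ind[] becomes 0,1,2,3,0,1,2,3,... across all
 * NDIS_HASH_INDCNT entries, spreading RX load evenly over the
 * channels.
 */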
6397 sc->hn_rss_hash = sc->hn_rss_hcap;
6398 if ((sc->hn_flags & HN_FLAG_RXVF) ||
6399 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6400 /* NOTE: Don't reconfigure RSS; will do immediately. */
6401 hn_vf_rss_fixup(sc, false);
6403 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6408 * Fixup transmission aggregation setup.
6411 hn_rndis_init_fixat(sc, nchan);
6415 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6416 hn_rndis_init_fixat(sc, nchan);
6417 hn_synth_detach(sc);
6419 if (attached & ATTACHED_RNDIS) {
6420 hn_rndis_init_fixat(sc, nchan);
6421 hn_rndis_detach(sc);
6423 if (attached & ATTACHED_NVS)
6425 hn_chan_detach(sc, sc->hn_prichan);
6426 /* Restore old capabilities. */
6427 sc->hn_caps = old_caps;
6431 #undef ATTACHED_RNDIS
6437	 * The interface must have been suspended through hn_suspend(), before
6438	 * this function gets called.
6441 hn_synth_detach(struct hn_softc *sc)
6444 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6445 ("synthetic parts were not attached"));
6447 /* Detach the RNDIS first. */
6448 hn_rndis_detach(sc);
6453 /* Detach all of the channels. */
6454 hn_detach_allchans(sc);
6456 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6460 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6462 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6463 ("invalid ring count %d", ring_cnt));
6465 if (sc->hn_tx_ring_cnt > ring_cnt)
6466 sc->hn_tx_ring_inuse = ring_cnt;
6468 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6469 sc->hn_rx_ring_inuse = ring_cnt;
6472 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
6473 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
6474 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
6475 rss_getnumbuckets());
6480 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6481 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6486 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6491	 * The TX bufring will not be drained by the hypervisor
6492	 * if the primary channel is revoked.
6494 while (!vmbus_chan_rx_empty(chan) ||
6495 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6496 !vmbus_chan_tx_empty(chan)))
6498 vmbus_chan_intr_drain(chan);
6502 hn_disable_rx(struct hn_softc *sc)
6506	 * Disable RX by forcefully clearing the RX filter.
6508 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6509 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6512 * Give RNDIS enough time to flush all pending data packets.
6514 pause("waitrx", (200 * hz) / 1000);
6519 * RX/TX _must_ have been suspended/disabled, before this function
6523 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6525 struct vmbus_channel **subch = NULL;
6529 * Drain RX/TX bufrings and interrupts.
6533 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6535 if (subch != NULL) {
6538 for (i = 0; i < nsubch; ++i)
6539 hn_chan_drain(sc, subch[i]);
6541 hn_chan_drain(sc, sc->hn_prichan);
6544 vmbus_subchan_rel(subch, nsubch);
6548 hn_suspend_data(struct hn_softc *sc)
6550 struct hn_tx_ring *txr;
6558 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6559 txr = &sc->hn_tx_ring[i];
6561 mtx_lock(&txr->hn_tx_lock);
6562 txr->hn_suspended = 1;
6563 mtx_unlock(&txr->hn_tx_lock);
6564		/* No one is able to send more packets now. */
6567 * Wait for all pending sends to finish.
6570		 * We will _not_ receive all pending send-dones if the
6571 * primary channel is revoked.
6573 while (hn_tx_ring_pending(txr) &&
6574 !vmbus_chan_is_revoked(sc->hn_prichan))
6575 pause("hnwtx", 1 /* 1 tick */);
6586 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6589 * Drain any pending TX tasks.
6592 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6593 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6595 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6596 txr = &sc->hn_tx_ring[i];
6598 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6599 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6604 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6607 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6611 hn_suspend_mgmt(struct hn_softc *sc)
6618	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6619 * through hn_mgmt_taskq.
6621 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6622 vmbus_chan_run_task(sc->hn_prichan, &task);
6625 * Make sure that all pending management tasks are completed.
6627 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6628 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6629 taskqueue_drain_all(sc->hn_mgmt_taskq0);
6633 hn_suspend(struct hn_softc *sc)
6636 /* Disable polling. */
6640 * If the non-transparent mode VF is activated, the synthetic
6641 * device is receiving packets, so the data path of the
6642 * synthetic device must be suspended.
6644 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6645 (sc->hn_flags & HN_FLAG_RXVF))
6646 hn_suspend_data(sc);
6647 hn_suspend_mgmt(sc);
6651 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6655 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6656 ("invalid TX ring count %d", tx_ring_cnt));
6658 for (i = 0; i < tx_ring_cnt; ++i) {
6659 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6661 mtx_lock(&txr->hn_tx_lock);
6662 txr->hn_suspended = 0;
6663 mtx_unlock(&txr->hn_tx_lock);
6668 hn_resume_data(struct hn_softc *sc)
6677 hn_rxfilter_config(sc);
6680 * Make sure to clear suspend status on "all" TX rings,
6681 * since hn_tx_ring_inuse can be changed after
6682 * hn_suspend_data().
6684 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6686 #ifdef HN_IFSTART_SUPPORT
6687 if (!hn_use_if_start)
6691	 * Flush unused drbrs, since hn_tx_ring_inuse may have been reduced.
6694 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6695 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6701 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6702 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6705		 * Use the txeof task, so that any pending oactive can be cleared.
6708 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6713 hn_resume_mgmt(struct hn_softc *sc)
6716 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6719 * Kick off network change detection, if it was pending.
6720 * If no network change was pending, start link status
6721	 * checks, which are more lightweight than network change detection.
6724 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6725 hn_change_network(sc);
6727 hn_update_link_status(sc);
6731 hn_resume(struct hn_softc *sc)
6735 * If the non-transparent mode VF is activated, the synthetic
6736	 * device has to receive packets, so the data path of the
6737 * synthetic device must be resumed.
6739 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6740 (sc->hn_flags & HN_FLAG_RXVF))
6744 * Don't resume link status change if VF is attached/activated.
6745 * - In the non-transparent VF mode, the synthetic device marks
6746 * link down until the VF is deactivated; i.e. VF is down.
6747 * - In transparent VF mode, VF's media status is used until
6748 * the VF is detached.
6750 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6751 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6755 * Re-enable polling if this interface is running and
6756 * the polling is requested.
6758 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6759 hn_polling(sc, sc->hn_pollhz);
6763 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6765 const struct rndis_status_msg *msg;
6768 if (dlen < sizeof(*msg)) {
6769 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6774 switch (msg->rm_status) {
6775 case RNDIS_STATUS_MEDIA_CONNECT:
6776 case RNDIS_STATUS_MEDIA_DISCONNECT:
6777 hn_update_link_status(sc);
6780 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6781 case RNDIS_STATUS_LINK_SPEED_CHANGE:
6782 /* Not really useful; ignore. */
6785 case RNDIS_STATUS_NETWORK_CHANGE:
6786 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6787 if (dlen < ofs + msg->rm_stbuflen ||
6788 msg->rm_stbuflen < sizeof(uint32_t)) {
6789 if_printf(sc->hn_ifp, "network changed\n");
6793 memcpy(&change, ((const uint8_t *)msg) + ofs,
6795 if_printf(sc->hn_ifp, "network changed, change %u\n",
6798 hn_change_network(sc);
6802 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6809 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6811 const struct rndis_pktinfo *pi = info_data;
6814 while (info_dlen != 0) {
6818 if (__predict_false(info_dlen < sizeof(*pi)))
6820 if (__predict_false(info_dlen < pi->rm_size))
6822 info_dlen -= pi->rm_size;
6824 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
6826 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
6828 dlen = pi->rm_size - pi->rm_pktinfooffset;
6831 switch (pi->rm_type) {
6832 case NDIS_PKTINFO_TYPE_VLAN:
6833 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
6835 info->vlan_info = *((const uint32_t *)data);
6836 mask |= HN_RXINFO_VLAN;
6839 case NDIS_PKTINFO_TYPE_CSUM:
6840 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
6842 info->csum_info = *((const uint32_t *)data);
6843 mask |= HN_RXINFO_CSUM;
6846 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
6847 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
6849 info->hash_value = *((const uint32_t *)data);
6850 mask |= HN_RXINFO_HASHVAL;
6853 case HN_NDIS_PKTINFO_TYPE_HASHINF:
6854 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
6856 info->hash_info = *((const uint32_t *)data);
6857 mask |= HN_RXINFO_HASHINF;
6864 if (mask == HN_RXINFO_ALL) {
6865 /* All found; done */
6869 pi = (const struct rndis_pktinfo *)
6870 ((const uint8_t *)pi + pi->rm_size);
6875 * - If there is no hash value, invalidate the hash info.
6877 if ((mask & HN_RXINFO_HASHVAL) == 0)
6878 info->hash_info = HN_NDIS_HASH_INFO_INVALID;
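/*
 * Layout sketch of one RNDIS per-packet-info record walked above
 * (uint32_t fields; rm_size is 4-byte aligned; the exact field order
 * is an assumption from the accesses in the loop):
 *
 *   offset 0         4         8                  rm_pktinfooffset
 *          +---------+---------+------------------+------------+
 *          | rm_size | rm_type | rm_pktinfooffset | data ...   |
 *          +---------+---------+------------------+------------+
 *
 * The usable data length validated against the NDIS_*_SIZE constants
 * above is rm_size - rm_pktinfooffset.
 */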
6882 static __inline bool
6883 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
6886 if (off < check_off) {
6887 if (__predict_true(off + len <= check_off))
6889 } else if (off > check_off) {
6890 if (__predict_true(check_off + check_len <= off))
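/*
 * Example: a 16-byte range at offset 0 does not overlap an 8-byte
 * range at offset 16 (0 + 16 <= 16); a 20-byte range at offset 0
 * does; two ranges starting at the same offset always overlap.
 */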
6897 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
6899 const struct rndis_packet_msg *pkt;
6900 struct hn_rxinfo info;
6901 int data_off, pktinfo_off, data_len, pktinfo_len;
6906 if (__predict_false(dlen < sizeof(*pkt))) {
6907 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
6912 if (__predict_false(dlen < pkt->rm_len)) {
6913 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
6914 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
6917 if (__predict_false(pkt->rm_len <
6918 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
6919 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
6920 "msglen %u, data %u, oob %u, pktinfo %u\n",
6921 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
6922 pkt->rm_pktinfolen);
6925 if (__predict_false(pkt->rm_datalen == 0)) {
6926 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
6933 #define IS_OFFSET_INVALID(ofs) \
6934 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
6935 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
6937 /* XXX Hyper-V does not meet data offset alignment requirement */
6938 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
6939 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6940 "data offset %u\n", pkt->rm_dataoffset);
6943 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
6944 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
6945 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6946 "oob offset %u\n", pkt->rm_oobdataoffset);
6949 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
6950 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
6951 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6952 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
6956 #undef IS_OFFSET_INVALID
6958 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
6959 data_len = pkt->rm_datalen;
6960 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
6961 pktinfo_len = pkt->rm_pktinfolen;
6964 * Check OOB coverage.
6966 if (__predict_false(pkt->rm_oobdatalen != 0)) {
6967 int oob_off, oob_len;
6969 if_printf(rxr->hn_ifp, "got oobdata\n");
6970 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
6971 oob_len = pkt->rm_oobdatalen;
6973 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
6974 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6975 "oob overflow, msglen %u, oob abs %d len %d\n",
6976 pkt->rm_len, oob_off, oob_len);
6981 * Check against data.
6983 if (hn_rndis_check_overlap(oob_off, oob_len,
6984 data_off, data_len)) {
6985 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6986 "oob overlaps data, oob abs %d len %d, "
6987 "data abs %d len %d\n",
6988 oob_off, oob_len, data_off, data_len);
6993 * Check against pktinfo.
6995 if (pktinfo_len != 0 &&
6996 hn_rndis_check_overlap(oob_off, oob_len,
6997 pktinfo_off, pktinfo_len)) {
6998 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6999 "oob overlaps pktinfo, oob abs %d len %d, "
7000 "pktinfo abs %d len %d\n",
7001 oob_off, oob_len, pktinfo_off, pktinfo_len);
7007 * Check per-packet-info coverage and find useful per-packet-info.
7009 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7010 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7011 info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7012 if (__predict_true(pktinfo_len != 0)) {
7016 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7017 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7018 "pktinfo overflow, msglen %u, "
7019 "pktinfo abs %d len %d\n",
7020 pkt->rm_len, pktinfo_off, pktinfo_len);
7025 * Check packet info coverage.
7027 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7028 data_off, data_len);
7029 if (__predict_false(overlap)) {
7030 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7031			    "pktinfo overlaps data, pktinfo abs %d len %d, "
7032 "data abs %d len %d\n",
7033 pktinfo_off, pktinfo_len, data_off, data_len);
7038 * Find useful per-packet-info.
7040 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7041 pktinfo_len, &info);
7042 if (__predict_false(error)) {
7043 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7049 if (__predict_false(data_off + data_len > pkt->rm_len)) {
7050 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7051 "data overflow, msglen %u, data abs %d len %d\n",
7052 pkt->rm_len, data_off, data_len);
7055 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
7058 static __inline void
7059 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7061 const struct rndis_msghdr *hdr;
7063 if (__predict_false(dlen < sizeof(*hdr))) {
7064 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7069 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7070 /* Hot data path. */
7071 hn_rndis_rx_data(rxr, data, dlen);
7076 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7077 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7079 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7083 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7085 const struct hn_nvs_hdr *hdr;
7087 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7088 if_printf(sc->hn_ifp, "invalid nvs notify\n");
7091 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7093 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7094 /* Useless; ignore */
7097 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7101 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7102 const struct vmbus_chanpkt_hdr *pkt)
7104 struct hn_nvs_sendctx *sndc;
7106 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7107 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7108 VMBUS_CHANPKT_DATALEN(pkt));
7111	 * 'sndc' CANNOT be accessed anymore, since it can be freed by the callback.
7117 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7118 const struct vmbus_chanpkt_hdr *pkthdr)
7120 const struct vmbus_chanpkt_rxbuf *pkt;
7121 const struct hn_nvs_hdr *nvs_hdr;
7124 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7125 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7128 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7130 /* Make sure that this is a RNDIS message. */
7131 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7132 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7137 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7138 if (__predict_false(hlen < sizeof(*pkt))) {
7139 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7142 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7144 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7145 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7150 count = pkt->cp_rxbuf_cnt;
7151 if (__predict_false(hlen <
7152 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7153 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7157 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7158 for (i = 0; i < count; ++i) {
7161 ofs = pkt->cp_rxbuf[i].rb_ofs;
7162 len = pkt->cp_rxbuf[i].rb_len;
7163 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7164			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflows rxbuf, "
7165 "ofs %d, len %d\n", i, ofs, len);
7168 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7172 * Ack the consumed RXBUF associated w/ this channel packet,
7173 * so that this RXBUF can be recycled by the hypervisor.
7175 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7179 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7182 struct hn_nvs_rndis_ack ack;
7185 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7186 ack.nvs_status = HN_NVS_STATUS_OK;
7190 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7191 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7192 if (__predict_false(error == EAGAIN)) {
7195		 * This should _not_ happen in the real world, since
7196		 * consumption of the TX bufring from the TX path is controlled.
7199 if (rxr->hn_ack_failed == 0)
7200 if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7201 rxr->hn_ack_failed++;
7208 if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7213 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7215 struct hn_rx_ring *rxr = xrxr;
7216 struct hn_softc *sc = rxr->hn_ifp->if_softc;
7219 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7222 pktlen = rxr->hn_pktbuf_len;
7223 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7224 if (__predict_false(error == ENOBUFS)) {
7229 * Expand channel packet buffer.
7232			 * Use M_WAITOK here, since allocation failure is not an option.
7235			nlen = rxr->hn_pktbuf_len * 2;
7236			while (nlen < pktlen)
7237				nlen *= 2;
7238			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7240 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7241 rxr->hn_pktbuf_len, nlen);
7243 free(rxr->hn_pktbuf, M_DEVBUF);
7244 rxr->hn_pktbuf = nbuf;
7245 rxr->hn_pktbuf_len = nlen;
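/*
 * Example with assumed sizes: a 4KB hn_pktbuf facing a 9KB channel
 * packet grows 4KB -> 8KB -> 16KB before the packet is re-received
 * on the next loop iteration.
 */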
7248 } else if (__predict_false(error == EAGAIN)) {
7249 /* No more channel packets; done! */
7252 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7254 switch (pkt->cph_type) {
7255 case VMBUS_CHANPKT_TYPE_COMP:
7256 hn_nvs_handle_comp(sc, chan, pkt);
7259 case VMBUS_CHANPKT_TYPE_RXBUF:
7260 hn_nvs_handle_rxbuf(rxr, chan, pkt);
7263 case VMBUS_CHANPKT_TYPE_INBAND:
7264 hn_nvs_handle_notify(sc, pkt);
7268 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7273 hn_chan_rollup(rxr, rxr->hn_txr);
7277 hn_sysinit(void *arg __unused)
7281 #ifdef HN_IFSTART_SUPPORT
7283 * Don't use ifnet.if_start if transparent VF mode is requested;
7284 * mainly due to the IFF_DRV_OACTIVE flag.
7286 if (hn_xpnt_vf && hn_use_if_start) {
7287 hn_use_if_start = 0;
7288		printf("hn: transparent VF mode, if_transmit will be used "
7289		    "instead of if_start\n");
7292 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7293 printf("hn: invalid transparent VF attach routing "
7294 "wait timeout %d, reset to %d\n",
7295 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7296 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7300 * Initialize VF map.
7302 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7303 hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7304 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7308 * Fix the # of TX taskqueues.
7310 if (hn_tx_taskq_cnt <= 0)
7311 hn_tx_taskq_cnt = 1;
7312 else if (hn_tx_taskq_cnt > mp_ncpus)
7313 hn_tx_taskq_cnt = mp_ncpus;
7316 * Fix the TX taskqueue mode.
7318 switch (hn_tx_taskq_mode) {
7319 case HN_TX_TASKQ_M_INDEP:
7320 case HN_TX_TASKQ_M_GLOBAL:
7321 case HN_TX_TASKQ_M_EVTTQ:
7324 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7328 if (vm_guest != VM_GUEST_HV)
7331 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7334 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7335 M_DEVBUF, M_WAITOK);
7336 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7337 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7338 taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7339 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7343 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
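/*
 * Note on the TX taskqueue modes fixed up above: HN_TX_TASKQ_M_INDEP
 * gives each device its own taskqueues, HN_TX_TASKQ_M_GLOBAL shares
 * the hn_tx_taskque[] array created here across all devices, and
 * HN_TX_TASKQ_M_EVTTQ reuses the vmbus event taskqueues (cf.
 * hn_tx_ring_create() above).
 */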
7346 hn_sysuninit(void *arg __unused)
7349 if (hn_tx_taskque != NULL) {
7352 for (i = 0; i < hn_tx_taskq_cnt; ++i)
7353 taskqueue_free(hn_tx_taskque[i]);
7354 free(hn_tx_taskque, M_DEVBUF);
7357 if (hn_vfmap != NULL)
7358 free(hn_vfmap, M_DEVBUF);
7359 rm_destroy(&hn_vfmap_lock);
7361 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);