/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/rmlock.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"
#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

#define HN_VFMAP_SIZE_DEF		8

#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1
#define HN_LOCK_INIT(sc)					\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)	sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)	sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)						\
do {								\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)		\
		DELAY(1000);					\
} while (0)
#define HN_UNLOCK(sc)		sx_xunlock(&(sc)->hn_lock)
#define HN_CSUM_IP_MASK		(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK	(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
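
/*
 * Illustrative example (added commentary, not from the original source):
 * for a full 1514-byte Ethernet frame with 32-byte alignment,
 * HN_PKTSIZE() reserves roundup2(1514 + HN_RNDIS_PKT_LEN, 32) bytes,
 * i.e. the frame plus the worst-case RNDIS packet message, rounded up
 * to the alignment.  HN_PKTSIZE_MIN() computes the same bound for a
 * minimum-sized, VLAN-tagged frame; hn_set_txagg() uses it below to
 * decide whether transmission aggregation is worthwhile at all.
 */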
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004
struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static void			hn_ifnet_event(void *, struct ifnet *, int);
static void			hn_ifaddr_event(void *, struct ifnet *);
static void			hn_ifnet_attevent(void *, struct ifnet *);
static void			hn_ifnet_detevent(void *, struct ifnet *);
static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);

static bool			hn_ismyvf(const struct hn_softc *,
				    const struct ifnet *);
static void			hn_rxvf_change(struct hn_softc *,
				    struct ifnet *, bool);
static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void			hn_rxvf_set_task(void *, int);
static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
				    struct ifreq *);
static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
static bool			hn_xpnt_vf_isready(struct hn_softc *);
static void			hn_xpnt_vf_setready(struct hn_softc *);
static void			hn_xpnt_vf_init_taskfunc(void *, int);
static void			hn_xpnt_vf_init(struct hn_softc *);
static void			hn_xpnt_vf_setenable(struct hn_softc *);
static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
static void			hn_vf_rss_fixup(struct hn_softc *, bool);
static void			hn_vf_rss_restore(struct hn_softc *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);
static void			hn_rndis_init_fixat(struct hn_softc *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);
#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
static void			hn_stop(struct hn_softc *, bool);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_disable_rx(struct hn_softc *);
static void			hn_drain_rxtx(struct hn_softc *, int);
static void			hn_polling(struct hn_softc *, u_int);
static void			hn_chan_polling(struct vmbus_channel *, u_int);
static void			hn_mtu_change_fixup(struct hn_softc *);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);
static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
static int			hn_rxfilter_config(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);
static uint32_t			hn_rss_type_fromndis(uint32_t);
static uint32_t			hn_rss_type_tondis(uint32_t);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on the host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust TCP segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on the host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust UDP datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on the host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust IP packet verification on host side, "
    "when csum info is missing (global setting)");
/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif
static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif
/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
/* VF list */
SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vflist_sysctl, "A", "VF list");

/* VF mapping */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vfmap_sysctl, "A", "VF mapping");

/* Transparent VF */
static int hn_xpnt_vf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");
static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;

static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};
static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif
static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
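
/*
 * Added commentary: the two senders above are the driver's two TX
 * paths.  hn_txpkt_sglist() hands the host a scatter/gather list of
 * guest physical addresses, while hn_txpkt_chim() copies a small
 * packet into a pre-allocated "chimney" send buffer slot and passes
 * only the slot index and size.  The KASSERTs encode the invariant
 * that a txdesc uses exactly one of the two methods, never both.
 */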
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
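
/*
 * Usage sketch (illustrative only, not from the original source): a
 * chimney slot returned by hn_chim_alloc() must be released with
 * hn_chim_free() once the send completes, e.g.:
 *
 *	uint32_t idx = hn_chim_alloc(sc);
 *	if (idx != HN_NVS_CHIM_IDX_INVALID) {
 *		(copy the packet into the chimney buffer at 'idx')
 *		hn_chim_free(sc, idx);
 *	}
 *
 * Allocation is lock-free: ffsl() nominates a clear bit and
 * atomic_testandset_long() claims it, skipping ahead if another CPU
 * won the race for that bit.
 */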
#if defined(INET6) || defined(INET)

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
}
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct ether_vlan_header *evl;
	const struct tcphdr *th;
	int ehlen;

	*tcpsyn = 0;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TCP) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);
		if (th->th_flags & TH_SYN)
			*tcpsyn = 1;
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP)
			return (m_head);

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));
		if (th->th_flags & TH_SYN)
			*tcpsyn = 1;
	}
#endif
	return (m_head);
}

#endif	/* INET6 || INET */
static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * promiscuous mode.
	 */
	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}
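
/*
 * Example (added commentary): with IFF_BROADCAST and IFF_ALLMULTI set
 * and IFF_PROMISC clear, the filter programmed above is
 * NDIS_PACKET_TYPE_DIRECTED | NDIS_PACKET_TYPE_BROADCAST |
 * NDIS_PACKET_TYPE_ALL_MULTICAST.
 */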
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}
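
/*
 * Example (added commentary): with the default hw.hn.tx_swq_depth of 0,
 * which is below any valid descriptor count, the software queue depth
 * falls back to the TX ring's descriptor count (HN_TX_DESC_CNT, 512,
 * by default).
 */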
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}
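
/*
 * Worked example (added commentary): if the indirect table was built
 * for 8 channels but only 4 are now in use, every entry holding 4..7
 * is clamped to 3 (nchan - 1), so no RX traffic is steered to a ring
 * without a backing channel.
 */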
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return (EOPNOTSUPP);
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}
static void
hn_rxvf_set_task(void *xarg, int pending __unused)
{
	struct hn_rxvf_setarg *arg = xarg;

	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
}

static void
hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
{
	struct hn_rx_ring *rxr;
	struct hn_rxvf_setarg arg;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			arg.rxr = rxr;
			arg.vf_ifp = vf_ifp;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_rxvf_ifp = vf_ifp;
		}
	}
}
static bool
hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
{
	const struct ifnet *hn_ifp;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		return (false);

	if (ifp->if_alloctype != IFT_ETHER)
		return (false);

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		return (false);

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		return (false);

	return (true);
}
static void
hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	if (!hn_ismyvf(sc, ifp))
		goto out;
	hn_ifp = sc->hn_ifp;

	if (rxvf) {
		if (sc->hn_flags & HN_FLAG_RXVF)
			goto out;

		sc->hn_flags |= HN_FLAG_RXVF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_RXVF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_RXVF;
		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);

	hn_rxvf_set(sc, rxvf ? ifp : NULL);

	if (rxvf) {
		hn_vf_rss_fixup(sc, true);
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_vf_rss_restore(sc);
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
	    rxvf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "datapath is switched %s %s\n",
		    rxvf ? "to" : "from", ifp->if_xname);
	}
out:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
{

	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;
	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
}

static void
hn_ifaddr_event(void *arg, struct ifnet *ifp)
{

	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
}
static int
hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
{
	struct ifnet *ifp, *vf_ifp;
	uint64_t tmp;
	int error;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Fix up requested capabilities w/ supported capabilities,
	 * since the supported capabilities could have been changed.
	 */
	ifr->ifr_reqcap &= ifp->if_capabilities;
	/* Pass SIOCSIFCAP to VF. */
	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);

	/*
	 * NOTE:
	 * The error will be propagated to the callers, however, it
	 * is _not_ useful here.
	 */

	/*
	 * Merge VF's enabled capabilities.
	 */
	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
	if (ifp->if_capenable & IFCAP_TSO4)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
	if (ifp->if_capenable & IFCAP_TSO6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	return (error);
}
static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	struct ifnet *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
}

static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
}
static void
hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	struct ifnet *hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (vf_ifp->if_index < hn_vfmap_size)
		hn_ifp = hn_vfmap[vf_ifp->if_index];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/*
			 * Allow tapping on the VF.
			 */
			ETHER_BPF_MTAP(vf_ifp, mn);

			/*
			 * Update VF stats.
			 */
			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
				    mn->m_pkthdr.len);
			}
			/*
			 * XXX IFCOUNTER_IMCAST
			 * This stat updating is kinda invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check.  As of this writing,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF an attempt
			 * in vain.
			 */

			/*
			 * Fix up rcvif and increase hn(4)'s ipackets.
			 */
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		/*
		 * Go through hn(4)'s if_input.
		 */
		hn_ifp->if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * packet directly.
		 */
		while (m != NULL) {
			struct mbuf *nm = m->m_nextpkt;

			m->m_nextpkt = NULL;
			m_freem(m);
			m = nm;
		}
	}
}
static void
hn_mtu_change_fixup(struct hn_softc *sc)
{
	struct ifnet *ifp;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;

	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
#endif
}
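
/*
 * Example (added commentary): with a 9000-byte MTU,
 * HN_LRO_LENLIM_MIN() evaluates to 18000 bytes, so after an MTU
 * change the LRO length limit is raised to at least twice the MTU.
 */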
static uint32_t
hn_rss_type_fromndis(uint32_t rss_hash)
{
	uint32_t types = 0;

	if (rss_hash & NDIS_HASH_IPV4)
		types |= RSS_TYPE_IPV4;
	if (rss_hash & NDIS_HASH_TCP_IPV4)
		types |= RSS_TYPE_TCP_IPV4;
	if (rss_hash & NDIS_HASH_IPV6)
		types |= RSS_TYPE_IPV6;
	if (rss_hash & NDIS_HASH_IPV6_EX)
		types |= RSS_TYPE_IPV6_EX;
	if (rss_hash & NDIS_HASH_TCP_IPV6)
		types |= RSS_TYPE_TCP_IPV6;
	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
		types |= RSS_TYPE_TCP_IPV6_EX;
	return (types);
}
static uint32_t
hn_rss_type_tondis(uint32_t types)
{
	uint32_t rss_hash = 0;

	KASSERT((types &
	    (RSS_TYPE_UDP_IPV4 | RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
	    ("UDP4, UDP6 and UDP6EX are not supported"));

	if (types & RSS_TYPE_IPV4)
		rss_hash |= NDIS_HASH_IPV4;
	if (types & RSS_TYPE_TCP_IPV4)
		rss_hash |= NDIS_HASH_TCP_IPV4;
	if (types & RSS_TYPE_IPV6)
		rss_hash |= NDIS_HASH_IPV6;
	if (types & RSS_TYPE_IPV6_EX)
		rss_hash |= NDIS_HASH_IPV6_EX;
	if (types & RSS_TYPE_TCP_IPV6)
		rss_hash |= NDIS_HASH_TCP_IPV6;
	if (types & RSS_TYPE_TCP_IPV6_EX)
		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
	return (rss_hash);
}
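
/*
 * Added commentary: the two helpers above translate between the NDIS
 * hash-type bits used by the synthetic device and the kernel's
 * RSS_TYPE_* namespace; UDP hash types are rejected by the KASSERT
 * since the synthetic parts do not support them.
 */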
static void
hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
}
static void
hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifrsshash ifrh;
	struct ifrsskey ifrk;
	int error;
	uint32_t my_types, diff_types, mbuf_types = 0;

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1) {
		/* No RSS on synthetic parts; done. */
		return;
	}
	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
		/* Synthetic parts do not support Toeplitz; done. */
		return;
	}

	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Extract VF's RSS key.  Only a 40-byte Toeplitz key is
	 * supported.
	 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
		    vf_ifp->if_xname, error);
		return;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrk.ifrk_func);
		return;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    vf_ifp->if_xname, ifrk.ifrk_keylen);
		return;
	}

	/*
	 * Extract VF's RSS hash.  Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
		    vf_ifp->if_xname, error);
		return;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrh.ifrh_func);
		return;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed.  "
		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
		    ifrh.ifrh_types, my_types);
		return;
	}

	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on RX path.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
	}

	/*
	 * Indirect table does not matter.
	 */

	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
	    hn_rss_type_tondis(my_types);
	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
	if (reconf) {
		error = hn_rss_reconfig(sc);
		if (error) {
			/* XXX roll-back? */
			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
			/* XXX keep going. */
		}
	}

	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
}
static void
hn_vf_rss_restore(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1)
		goto done;

	/*
	 * Restore hash types.  Key does _not_ matter.
	 */
	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
		int error;

		sc->hn_rss_hash = sc->hn_rss_hcap;
		error = hn_rss_reconfig(sc);
		if (error) {
			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
			    error);
			/* XXX keep going. */
		}
	}
done:
	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
}
static void
hn_xpnt_vf_setready(struct hn_softc *sc)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Mark the VF ready.
	 */
	sc->hn_vf_rdytick = 0;

	/*
	 * Save information for restoration.
	 */
	sc->hn_saved_caps = ifp->if_capabilities;
	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;

	/*
	 * Intersect supported/enabled capabilities.
	 *
	 * NOTE:
	 * if_hwassist is not changed here.
	 */
	ifp->if_capabilities &= vf_ifp->if_capabilities;
	ifp->if_capenable &= ifp->if_capabilities;

	/*
	 * Fix up TSO settings.
	 */
	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;

	/*
	 * Change VF's enabled capabilities.
	 */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_reqcap = ifp->if_capenable;
	hn_xpnt_vf_iocsetcaps(sc, &ifr);

	if (ifp->if_mtu != ETHERMTU) {
		int error;

		/*
		 * Change VF's MTU.
		 */
		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
		ifr.ifr_mtu = ifp->if_mtu;
		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
		if (error) {
			if_printf(ifp, "%s SIOCSIFMTU %lu failed\n",
			    vf_ifp->if_xname, ifp->if_mtu);
			if (ifp->if_mtu > ETHERMTU) {
				if_printf(ifp, "change MTU to %d\n", ETHERMTU);

				/*
				 * XXX
				 * No need to adjust the synthetic parts' MTU;
				 * failure of the adjustment will cause us
				 * infinite headache.
				 */
				ifp->if_mtu = ETHERMTU;
				hn_mtu_change_fixup(sc);
			}
		}
	}
}
static bool
hn_xpnt_vf_isready(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);

	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
		return (false);

	if (sc->hn_vf_rdytick == 0)
		return (true);

	if (sc->hn_vf_rdytick > ticks)
		return (false);

	/* Mark VF as ready. */
	hn_xpnt_vf_setready(sc);

	return (true);
}
static void
hn_xpnt_vf_setenable(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
}

static void
hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
	if (clear_vf)
		sc->hn_vf_ifp = NULL;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
}
static void
hn_xpnt_vf_init(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));

	if (bootverbose) {
		if_printf(sc->hn_ifp, "try bringing up %s\n",
		    sc->hn_vf_ifp->if_xname);
	}

	/*
	 * Bring the VF up.
	 */
	hn_xpnt_vf_saveifflags(sc);
	sc->hn_vf_ifp->if_flags |= IFF_UP;
	error = hn_xpnt_vf_iocsetflags(sc);
	if (error) {
		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
		    sc->hn_vf_ifp->if_xname, error);
		return;
	}

	/*
	 * NOTE:
	 * Datapath setting must happen _after_ bringing the VF up.
	 */
	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);

	/*
	 * NOTE:
	 * Fixup RSS related bits _after_ the VF is brought up, since
	 * many VFs generate their RSS key during initialization.
	 */
	hn_vf_rss_fixup(sc, true);

	/* Mark transparent mode VF as enabled. */
	hn_xpnt_vf_setenable(sc);
}
static void
hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		goto done;
	if (sc->hn_vf_ifp == NULL)
		goto done;
	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
		goto done;

	if (sc->hn_vf_rdytick != 0) {
		/* Mark VF as ready. */
		hn_xpnt_vf_setready(sc);
	}

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
		/*
		 * Delayed VF initialization.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "delayed initialize %s\n",
			    sc->hn_vf_ifp->if_xname);
		}
		hn_xpnt_vf_init(sc);
	}
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    sc->hn_vf_ifp->if_xname);
		goto done;
	}

	if (hn_xpnt_vf && ifp->if_start != NULL) {
		/*
		 * ifnet.if_start is _not_ supported by transparent
		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
		 */
		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
		    "in transparent VF mode.\n", ifp->if_xname);
		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	if (ifp->if_index >= hn_vfmap_size) {
		struct ifnet **newmap;
		int newsize;

		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);
		memcpy(newmap, hn_vfmap,
		    sizeof(struct ifnet *) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);

		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}
	KASSERT(hn_vfmap[ifp->if_index] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
	hn_vfmap[ifp->if_index] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
	sc->hn_vf_ifp = ifp;
	rm_wunlock(&sc->hn_vf_lock);

	if (hn_xpnt_vf) {
		int wait_ticks;

		/*
		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
		 * Save vf_ifp's current if_input for later restoration.
		 */
		sc->hn_vf_input = ifp->if_input;
		ifp->if_input = hn_xpnt_vf_input;

		/*
		 * Stop link status management; use the VF's.
		 */
		hn_suspend_mgmt(sc);

		/*
		 * Give the VF some time to complete its attach routine.
		 */
		wait_ticks = hn_xpnt_vf_attwait * hz;
		sc->hn_vf_rdytick = ticks + wait_ticks;

		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
		    wait_ticks);
	}
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (sc->hn_vf_ifp == NULL)
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (hn_xpnt_vf) {
		/*
		 * Make sure that the delayed initialization is not running.
		 *
		 * NOTE:
		 * - This lock _must_ be released, since the hn_vf_init task
		 *   will try holding this lock.
		 * - It is safe to release this lock here, since the
		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
		 *
		 * XXX racy, if hn(4) ever detached.
		 */
		HN_UNLOCK(sc);
		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
		HN_LOCK(sc);

		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
		    sc->hn_ifp->if_xname));
		ifp->if_input = sc->hn_vf_input;
		sc->hn_vf_input = NULL;

		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);

		if (sc->hn_vf_rdytick == 0) {
			/*
			 * The VF was ready; restore some settings.
			 */
			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;

			/*
			 * NOTE:
			 * There is _no_ need to fixup if_capenable and
			 * if_hwassist, since the if_capabilities before
			 * restoration was an intersection of the VF's
			 * if_capabilities and the synthetic device's
			 * if_capabilities.
			 */
			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
			sc->hn_ifp->if_hw_tsomaxsegcount =
			    sc->hn_saved_tsosegcnt;
			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
		}

		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			/*
			 * Restore RSS settings.
			 */
			hn_vf_rss_restore(sc);

			/*
			 * Resume link status management, which was suspended
			 * by hn_ifnet_attevent().
			 */
			hn_resume_mgmt(sc);
		}
	}

	/* Mark transparent mode VF as disabled. */
	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);

	rm_wlock(&hn_vfmap_lock);

	KASSERT(ifp->if_index < hn_vfmap_size,
	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
	if (hn_vfmap[ifp->if_index] != NULL) {
		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
		    ("%s: ifindex %d was mapped to %s",
		     ifp->if_xname, ifp->if_index,
		     hn_vfmap[ifp->if_index]->if_xname));
		hn_vfmap[ifp->if_index] = NULL;
	}

	rm_wunlock(&hn_vfmap_lock);
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_vf_ifp == ifp)
		if_link_state_change(sc->hn_ifp, link_state);
}

static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}
static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);
	rm_init(&sc->hn_vf_lock, "hnvf");
	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	if (hn_xpnt_vf) {
		/*
		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
		 */
		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
		    device_get_nameunit(dev));
		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
		    hn_xpnt_vf_init_taskfunc, sc);
	}
	/*
	 * Allocate ifnet and set its name early, so that if_printf
	 * can be used by functions which will be called after the
	 * synthetic parts are attached.
	 */
	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia early so that it can be unconditionally
	 * destroyed, if an error happens later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
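	/*
	 * Example (added commentary): with default tunables on a 16-CPU
	 * guest, ring_cnt starts at mp_ncpus (16) and is clamped to
	 * HN_RING_CNT_DEF_MAX (8); tx_ring_cnt then follows ring_cnt
	 * unless the hw.hn.tx_ring_cnt tunable narrows it further.
	 */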
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only a limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;
	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fix up TX settings after the synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);
	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
	    "max # of TSO segments");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
	    "max size of TSO segment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disable polling");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_vf_sysctl, "A", "Virtual Function's name");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_xpnt_vf_enabled_sysctl, "I",
	    "Transparent VF enabled");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_xpnt_vf_accbpf_sysctl, "I",
	    "Accurate BPF for transparent VF");
2242 * Setup the ifmedia, which has been initialized earlier.
2244 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2245 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2246 /* XXX ifmedia_set really should do this for us */
2247 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2250 * Setup the ifnet for this interface.
2254 ifp->if_baudrate = IF_Gbps(10);
2256 /* if_baudrate is 32 bits on 32-bit systems. */
2257 ifp->if_baudrate = IF_Gbps(1);
2259 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2260 ifp->if_ioctl = hn_ioctl;
2261 ifp->if_init = hn_init;
2262 #ifdef HN_IFSTART_SUPPORT
2263 if (hn_use_if_start) {
2264 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2266 ifp->if_start = hn_start;
2267 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2268 ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2269 IFQ_SET_READY(&ifp->if_snd);
2273 ifp->if_transmit = hn_transmit;
2274 ifp->if_qflush = hn_xmit_qflush;
2277 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2279 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
2280 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2282 if (sc->hn_caps & HN_CAP_VLAN) {
2283 /* XXX not sure about VLAN_MTU. */
2284 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2287 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2288 if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2289 ifp->if_capabilities |= IFCAP_TXCSUM;
2290 if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2291 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2292 if (sc->hn_caps & HN_CAP_TSO4) {
2293 ifp->if_capabilities |= IFCAP_TSO4;
2294 ifp->if_hwassist |= CSUM_IP_TSO;
2296 if (sc->hn_caps & HN_CAP_TSO6) {
2297 ifp->if_capabilities |= IFCAP_TSO6;
2298 ifp->if_hwassist |= CSUM_IP6_TSO;
2301 /* Enable all available capabilities by default. */
2302 ifp->if_capenable = ifp->if_capabilities;
2305 * Disable IPv6 TSO and TXCSUM by default, they still can
2306 * be enabled through SIOCSIFCAP.
2308 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2309 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2311 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2313 * Lock hn_set_tso_maxsize() to simplify its
2317 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2319 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2320 ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2323 ether_ifattach(ifp, eaddr);
2325 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2326 if_printf(ifp, "TSO segcnt %u segsz %u\n",
2327 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2330 /* Inform the upper layer about the long frame support. */
2331 ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2334 * Kick off link status check.
2336 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2337 hn_update_link_status(sc);
2340 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2341 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2342 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2343 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2345 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2346 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2351 * Subscribe to the ether_ifattach event, instead of the ifnet_arrival
2352 * event, since the interface's LLADDR is needed; the LLADDR is not
2353 * available when the ifnet_arrival event is triggered.
2355 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2356 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2357 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2358 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2362 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2363 hn_synth_detach(sc);
2369 hn_detach(device_t dev)
2371 struct hn_softc *sc = device_get_softc(dev);
2372 struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2374 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2376 * In case the vmbus missed the orphan handler
2379 vmbus_xact_ctx_orphan(sc->hn_xact);
2382 if (sc->hn_ifaddr_evthand != NULL)
2383 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2384 if (sc->hn_ifnet_evthand != NULL)
2385 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2386 if (sc->hn_ifnet_atthand != NULL) {
2387 EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2388 sc->hn_ifnet_atthand);
2390 if (sc->hn_ifnet_dethand != NULL) {
2391 EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2392 sc->hn_ifnet_dethand);
2394 if (sc->hn_ifnet_lnkhand != NULL)
2395 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2397 vf_ifp = sc->hn_vf_ifp;
2398 __compiler_membar();
2400 hn_ifnet_detevent(sc, vf_ifp);
2402 if (device_is_attached(dev)) {
2404 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2405 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2409 * hn_stop() only suspends data, so management
2410 * tasks have to be suspended manually here.
2412 hn_suspend_mgmt(sc);
2413 hn_synth_detach(sc);
2416 ether_ifdetach(ifp);
2419 ifmedia_removeall(&sc->hn_media);
2420 hn_destroy_rx_data(sc);
2421 hn_destroy_tx_data(sc);
2423 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2426 for (i = 0; i < hn_tx_taskq_cnt; ++i)
2427 taskqueue_free(sc->hn_tx_taskqs[i]);
2428 free(sc->hn_tx_taskqs, M_DEVBUF);
2430 taskqueue_free(sc->hn_mgmt_taskq0);
2431 if (sc->hn_vf_taskq != NULL)
2432 taskqueue_free(sc->hn_vf_taskq);
2434 if (sc->hn_xact != NULL) {
2436 * Uninstall the orphan handler _before_ the xact is
2439 vmbus_chan_unset_orphan(sc->hn_prichan);
2440 vmbus_xact_ctx_destroy(sc->hn_xact);
2445 HN_LOCK_DESTROY(sc);
2446 rm_destroy(&sc->hn_vf_lock);
2451 hn_shutdown(device_t dev)
2458 hn_link_status(struct hn_softc *sc)
2460 uint32_t link_status;
2463 error = hn_rndis_get_linkstatus(sc, &link_status);
2465 /* XXX what to do? */
2469 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2470 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2472 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2473 if_link_state_change(sc->hn_ifp,
2474 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2475 LINK_STATE_UP : LINK_STATE_DOWN);
2479 hn_link_taskfunc(void *xsc, int pending __unused)
2481 struct hn_softc *sc = xsc;
2483 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2489 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2491 struct hn_softc *sc = xsc;
2493 /* Prevent any link status checks from running. */
2494 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2497 * Fake up a [link down --> link up] state change; 5 seconds
2498 * delay is used, which closely simulates miibus reaction
2499 * upon link down event.
2501 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2502 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2503 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2504 &sc->hn_netchg_status, 5 * hz);
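/*
 * Sketch of the delay arithmetic above: taskqueue_enqueue_timeout()
 * takes a tick count, and hz is ticks per second, so "5 * hz" is five
 * seconds of delay regardless of the configured hz (5000 ticks at the
 * common hz == 1000).  The deferred hn_netchg_status_taskfunc() then
 * completes the faked link-down -> link-up transition.
 */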
2508 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2510 struct hn_softc *sc = xsc;
2512 /* Re-allow link status checks. */
2513 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2518 hn_update_link_status(struct hn_softc *sc)
2521 if (sc->hn_mgmt_taskq != NULL)
2522 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2526 hn_change_network(struct hn_softc *sc)
2529 if (sc->hn_mgmt_taskq != NULL)
2530 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2534 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2535 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2537 struct mbuf *m = *m_head;
2540 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2542 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2543 m, segs, nsegs, BUS_DMA_NOWAIT);
2544 if (error == EFBIG) {
2547 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2551 *m_head = m = m_new;
2552 txr->hn_tx_collapsed++;
2554 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2555 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2558 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2559 BUS_DMASYNC_PREWRITE);
2560 txd->flags |= HN_TXD_FLAG_DMAMAP;
2566 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2569 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2570 ("put an onlist txd %#x", txd->flags));
2571 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2572 ("put an onagg txd %#x", txd->flags));
2574 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2575 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2578 if (!STAILQ_EMPTY(&txd->agg_list)) {
2579 struct hn_txdesc *tmp_txd;
2581 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2584 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2585 ("resursive aggregation on aggregated txdesc"));
2586 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2587 ("not aggregated txdesc"));
2588 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2589 ("aggregated txdesc uses dmamap"));
2590 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2591 ("aggregated txdesc consumes "
2592 "chimney sending buffer"));
2593 KASSERT(tmp_txd->chim_size == 0,
2594 ("aggregated txdesc has non-zero "
2595 "chimney sending size"));
2597 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2598 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2599 freed = hn_txdesc_put(txr, tmp_txd);
2600 KASSERT(freed, ("failed to free aggregated txdesc"));
2604 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2605 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2606 ("chim txd uses dmamap"));
2607 hn_chim_free(txr->hn_sc, txd->chim_index);
2608 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2610 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2611 bus_dmamap_sync(txr->hn_tx_data_dtag,
2612 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2613 bus_dmamap_unload(txr->hn_tx_data_dtag,
2615 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2618 if (txd->m != NULL) {
2623 txd->flags |= HN_TXD_FLAG_ONLIST;
2624 #ifndef HN_USE_TXDESC_BUFRING
2625 mtx_lock_spin(&txr->hn_txlist_spin);
2626 KASSERT(txr->hn_txdesc_avail >= 0 &&
2627 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2628 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2629 txr->hn_txdesc_avail++;
2630 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2631 mtx_unlock_spin(&txr->hn_txlist_spin);
2632 #else /* HN_USE_TXDESC_BUFRING */
2634 atomic_add_int(&txr->hn_txdesc_avail, 1);
2636 buf_ring_enqueue(txr->hn_txdesc_br, txd);
2637 #endif /* !HN_USE_TXDESC_BUFRING */
2642 static __inline struct hn_txdesc *
2643 hn_txdesc_get(struct hn_tx_ring *txr)
2645 struct hn_txdesc *txd;
2647 #ifndef HN_USE_TXDESC_BUFRING
2648 mtx_lock_spin(&txr->hn_txlist_spin);
2649 txd = SLIST_FIRST(&txr->hn_txlist);
2651 KASSERT(txr->hn_txdesc_avail > 0,
2652 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2653 txr->hn_txdesc_avail--;
2654 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2656 mtx_unlock_spin(&txr->hn_txlist_spin);
2658 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2662 #ifdef HN_USE_TXDESC_BUFRING
2664 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2666 #endif /* HN_USE_TXDESC_BUFRING */
2667 KASSERT(txd->m == NULL && txd->refs == 0 &&
2668 STAILQ_EMPTY(&txd->agg_list) &&
2669 txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2670 txd->chim_size == 0 &&
2671 (txd->flags & HN_TXD_FLAG_ONLIST) &&
2672 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2673 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2674 txd->flags &= ~HN_TXD_FLAG_ONLIST;
2680 static __inline void
2681 hn_txdesc_hold(struct hn_txdesc *txd)
2684 /* 0->1 transition will never work */
2685 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2686 atomic_add_int(&txd->refs, 1);
2689 static __inline void
2690 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2693 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2694 ("recursive aggregation on aggregating txdesc"));
2696 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2697 ("already aggregated"));
2698 KASSERT(STAILQ_EMPTY(&txd->agg_list),
2699 ("recursive aggregation on to-be-aggregated txdesc"));
2701 txd->flags |= HN_TXD_FLAG_ONAGG;
2702 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
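/*
 * Illustrative sketch (an inference from the KASSERTs above): the
 * aggregation list forms a one-level tree only.  A txdesc is either a
 * parent (agg_list non-empty, ONAGG clear) or a child (ONAGG set,
 * agg_list empty); children are released by hn_txdesc_put() when the
 * parent's last reference goes away:
 *
 *	agg_txd - agg_list -> txd1 -> txd2 -> txd3   (children, ONAGG)
 */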
2706 hn_tx_ring_pending(struct hn_tx_ring *txr)
2708 bool pending = false;
2710 #ifndef HN_USE_TXDESC_BUFRING
2711 mtx_lock_spin(&txr->hn_txlist_spin);
2712 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2714 mtx_unlock_spin(&txr->hn_txlist_spin);
2716 if (!buf_ring_full(txr->hn_txdesc_br))
2722 static __inline void
2723 hn_txeof(struct hn_tx_ring *txr)
2725 txr->hn_has_txeof = 0;
2730 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2731 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2733 struct hn_txdesc *txd = sndc->hn_cbarg;
2734 struct hn_tx_ring *txr;
2737 KASSERT(txr->hn_chan == chan,
2738 ("channel mismatch, on chan%u, should be chan%u",
2739 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2741 txr->hn_has_txeof = 1;
2742 hn_txdesc_put(txr, txd);
2744 ++txr->hn_txdone_cnt;
2745 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2746 txr->hn_txdone_cnt = 0;
2747 if (txr->hn_oactive)
2753 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2755 #if defined(INET) || defined(INET6)
2756 struct lro_ctrl *lro = &rxr->hn_lro;
2757 struct lro_entry *queued;
2759 while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
2760 SLIST_REMOVE_HEAD(&lro->lro_active, next);
2761 tcp_lro_flush(lro, queued);
2767 * 'txr' could be NULL, if multiple channels and
2768 * ifnet.if_start method are enabled.
2770 if (txr == NULL || !txr->hn_has_txeof)
2773 txr->hn_txdone_cnt = 0;
2777 static __inline uint32_t
2778 hn_rndis_pktmsg_offset(uint32_t ofs)
2781 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2782 ("invalid RNDIS packet msg offset %u", ofs));
2783 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
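/*
 * Worked example (assuming the conventional RNDIS layout, where
 * rm_dataoffset is preceded only by the 32-bit rm_type and rm_len
 * fields, i.e. __offsetof(struct rndis_packet_msg, rm_dataoffset)
 * == 8): hn_rndis_pktmsg_offset(44) == 36, since on-wire RNDIS
 * packet message offsets are counted from rm_dataoffset rather than
 * from the start of the message.
 */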
2786 static __inline void *
2787 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2788 size_t pi_dlen, uint32_t pi_type)
2790 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2791 struct rndis_pktinfo *pi;
2793 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2794 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2797 * Per-packet-info does not move; it only grows.
2800 * rm_pktinfooffset in this phase counts from the beginning
2801 * of rndis_packet_msg.
2803 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2804 ("%u pktinfo overflows RNDIS packet msg", pi_type));
2805 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2806 pkt->rm_pktinfolen);
2807 pkt->rm_pktinfolen += pi_size;
2809 pi->rm_size = pi_size;
2810 pi->rm_type = pi_type;
2811 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2813 return (pi->rm_data);
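/*
 * Usage sketch (mirrors the hash-value case in hn_encap() below):
 * appending a 4-byte per-packet info grows rm_pktinfolen by
 * HN_RNDIS_PKTINFO_SIZE(4) and yields a pointer to the info body:
 *
 *	uint32_t *pi_data;
 *
 *	pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
 *	    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
 *	*pi_data = txr->hn_tx_idx;
 */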
2817 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2819 struct hn_txdesc *txd;
2823 txd = txr->hn_agg_txd;
2824 KASSERT(txd != NULL, ("no aggregate txdesc"));
2827 * Since hn_txpkt() will reset this temporary stat, save
2828 * it now so that oerrors can be updated properly if
2829 * hn_txpkt() ever fails.
2831 pkts = txr->hn_stat_pkts;
2834 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2835 * failure, save it for later freeing, if hn_txpkt() ever
2839 error = hn_txpkt(ifp, txr, txd);
2840 if (__predict_false(error)) {
2841 /* txd is freed, but m is not. */
2844 txr->hn_flush_failed++;
2845 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2848 /* Reset all aggregation states. */
2849 txr->hn_agg_txd = NULL;
2850 txr->hn_agg_szleft = 0;
2851 txr->hn_agg_pktleft = 0;
2852 txr->hn_agg_prevpkt = NULL;
2858 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2863 if (txr->hn_agg_txd != NULL) {
2864 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2865 struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2866 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2870 * Update the previous RNDIS packet's total length;
2871 * it can be increased due to the mandatory alignment
2872 * padding for this RNDIS packet. And update the
2873 * aggregating txdesc's chimney sending buffer size
2877 * Zero-out the padding, as required by the RNDIS spec.
2880 pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2881 agg_txd->chim_size += pkt->rm_len - olen;
2883 /* Link this txdesc to the parent. */
2884 hn_txdesc_agg(agg_txd, txd);
2886 chim = (uint8_t *)pkt + pkt->rm_len;
2887 /* Save the current packet for later fixup. */
2888 txr->hn_agg_prevpkt = chim;
2890 txr->hn_agg_pktleft--;
2891 txr->hn_agg_szleft -= pktsize;
2892 if (txr->hn_agg_szleft <=
2893 HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2895 * Probably can't aggregate more packets,
2896 * flush this aggregating txdesc proactively.
2898 txr->hn_agg_pktleft = 0;
2903 hn_flush_txagg(ifp, txr);
2905 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
2907 txr->hn_tx_chimney_tried++;
2908 txd->chim_index = hn_chim_alloc(txr->hn_sc);
2909 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
2911 txr->hn_tx_chimney++;
2913 chim = txr->hn_sc->hn_chim +
2914 (txd->chim_index * txr->hn_sc->hn_chim_szmax);
2916 if (txr->hn_agg_pktmax > 1 &&
2917 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2918 txr->hn_agg_txd = txd;
2919 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
2920 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
2921 txr->hn_agg_prevpkt = chim;
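/*
 * Worked example of the bookkeeping above (all numbers are
 * illustrative): with hn_agg_szmax == 4096, hn_agg_pktmax == 8 and a
 * first packet of pktsize == 1024, this txdesc becomes the
 * aggregating txdesc with a budget of 7 more packets and
 * 4096 - 1024 == 3072 more bytes; later packets are linked to it via
 * hn_txdesc_agg() until a budget runs out and hn_flush_txagg()
 * sends the whole batch.
 */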
2928 * If this function fails, then both txd and m_head0 will be freed.
2931 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2932 struct mbuf **m_head0)
2934 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
2935 int error, nsegs, i;
2936 struct mbuf *m_head = *m_head0;
2937 struct rndis_packet_msg *pkt;
2940 int pkt_hlen, pkt_size;
2942 pkt = txd->rndis_pkt;
2943 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
2944 if (pkt_size < txr->hn_chim_size) {
2945 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
2949 if (txr->hn_agg_txd != NULL)
2950 hn_flush_txagg(ifp, txr);
2953 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
2954 pkt->rm_len = m_head->m_pkthdr.len;
2955 pkt->rm_dataoffset = 0;
2956 pkt->rm_datalen = m_head->m_pkthdr.len;
2957 pkt->rm_oobdataoffset = 0;
2958 pkt->rm_oobdatalen = 0;
2959 pkt->rm_oobdataelements = 0;
2960 pkt->rm_pktinfooffset = sizeof(*pkt);
2961 pkt->rm_pktinfolen = 0;
2962 pkt->rm_vchandle = 0;
2963 pkt->rm_reserved = 0;
2965 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
2967 * Set the hash value for this packet, so that the host could
2968 * dispatch the TX done event for this packet back to this TX
2971 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2972 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
2973 *pi_data = txr->hn_tx_idx;
2976 if (m_head->m_flags & M_VLANTAG) {
2977 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2978 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
2979 *pi_data = NDIS_VLAN_INFO_MAKE(
2980 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
2981 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
2982 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
2985 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
2986 #if defined(INET6) || defined(INET)
2987 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2988 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
2990 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
2991 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
2992 m_head->m_pkthdr.tso_segsz);
2995 #if defined(INET6) && defined(INET)
3000 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
3001 m_head->m_pkthdr.tso_segsz);
3004 #endif /* INET6 || INET */
3005 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3006 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3007 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3008 if (m_head->m_pkthdr.csum_flags &
3009 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3010 *pi_data = NDIS_TXCSUM_INFO_IPV6;
3012 *pi_data = NDIS_TXCSUM_INFO_IPV4;
3013 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3014 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
3017 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
3018 *pi_data |= NDIS_TXCSUM_INFO_TCPCS;
3019 else if (m_head->m_pkthdr.csum_flags &
3020 (CSUM_IP_UDP | CSUM_IP6_UDP))
3021 *pi_data |= NDIS_TXCSUM_INFO_UDPCS;
3024 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3025 /* Fixup RNDIS packet message total length */
3026 pkt->rm_len += pkt_hlen;
3027 /* Convert RNDIS packet message offsets */
3028 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3029 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3032 * Fast path: Chimney sending.
3035 struct hn_txdesc *tgt_txd = txd;
3037 if (txr->hn_agg_txd != NULL) {
3038 tgt_txd = txr->hn_agg_txd;
3044 KASSERT(pkt == chim,
3045 ("RNDIS pkt not in chimney sending buffer"));
3046 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3047 ("chimney sending buffer is not used"));
3048 tgt_txd->chim_size += pkt->rm_len;
3050 m_copydata(m_head, 0, m_head->m_pkthdr.len,
3051 ((uint8_t *)chim) + pkt_hlen);
3053 txr->hn_gpa_cnt = 0;
3054 txr->hn_sendpkt = hn_txpkt_chim;
3058 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3059 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3060 ("chimney buffer is used"));
3061 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3063 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3064 if (__predict_false(error)) {
3068 * This mbuf is not linked w/ the txd yet, so free it now.
3073 freed = hn_txdesc_put(txr, txd);
3075 ("fail to free txd upon txdma error"));
3077 txr->hn_txdma_failed++;
3078 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3083 /* +1 RNDIS packet message */
3084 txr->hn_gpa_cnt = nsegs + 1;
3086 /* send packet with page buffer */
3087 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3088 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3089 txr->hn_gpa[0].gpa_len = pkt_hlen;
3092 * Fill the page buffers with mbuf info after the page
3093 * buffer for RNDIS packet message.
3095 for (i = 0; i < nsegs; ++i) {
3096 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3098 gpa->gpa_page = atop(segs[i].ds_addr);
3099 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3100 gpa->gpa_len = segs[i].ds_len;
3103 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3105 txr->hn_sendpkt = hn_txpkt_sglist;
3109 /* Set the completion routine */
3110 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3112 /* Update temporary stats for later use. */
3113 txr->hn_stat_pkts++;
3114 txr->hn_stat_size += m_head->m_pkthdr.len;
3115 if (m_head->m_flags & M_MCAST)
3116 txr->hn_stat_mcasts++;
3123 * If this function fails, then txd will be freed, but the mbuf
3124 * associated w/ the txd will _not_ be freed.
3127 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3129 int error, send_failed = 0, has_bpf;
3132 has_bpf = bpf_peers_present(ifp->if_bpf);
3135 * Make sure that this txd and any aggregated txds are not
3136 * freed before ETHER_BPF_MTAP.
3138 hn_txdesc_hold(txd);
3140 error = txr->hn_sendpkt(txr, txd);
3143 const struct hn_txdesc *tmp_txd;
3145 ETHER_BPF_MTAP(ifp, txd->m);
3146 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3147 ETHER_BPF_MTAP(ifp, tmp_txd->m);
3150 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3151 #ifdef HN_IFSTART_SUPPORT
3152 if (!hn_use_if_start)
3155 if_inc_counter(ifp, IFCOUNTER_OBYTES,
3157 if (txr->hn_stat_mcasts != 0) {
3158 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3159 txr->hn_stat_mcasts);
3162 txr->hn_pkts += txr->hn_stat_pkts;
3166 hn_txdesc_put(txr, txd);
3168 if (__predict_false(error)) {
3172 * This should "really rarely" happen.
3174 * XXX Too many RX to be acked or too many sideband
3175 * commands to run? Ask netvsc_channel_rollup()
3176 * to kick start later.
3178 txr->hn_has_txeof = 1;
3180 txr->hn_send_failed++;
3183 * Try sending again after setting hn_has_txeof,
3184 * in case we missed the last
3185 * netvsc_channel_rollup().
3189 if_printf(ifp, "send failed\n");
3192 * Caller will perform further processing on the
3193 * associated mbuf, so don't free it in hn_txdesc_put();
3194 * only unload it from the DMA map in hn_txdesc_put(),
3198 freed = hn_txdesc_put(txr, txd);
3200 ("fail to free txd upon send error"));
3202 txr->hn_send_failed++;
3205 /* Reset temporary stats, after this sending is done. */
3206 txr->hn_stat_size = 0;
3207 txr->hn_stat_pkts = 0;
3208 txr->hn_stat_mcasts = 0;
3214 * Append the specified data to the indicated mbuf chain.
3215 * Extend the mbuf chain if the new data does not fit in
3218 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3219 * There should be an equivalent in the kernel mbuf code,
3220 * but there does not appear to be one yet.
3222 * Differs from m_append() in that additional mbufs are
3223 * allocated with cluster size MJUMPAGESIZE, and filled
3226 * Return 1 if able to complete the job; otherwise 0.
3229 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3232 int remainder, space;
3234 for (m = m0; m->m_next != NULL; m = m->m_next)
3237 space = M_TRAILINGSPACE(m);
3240 * Copy into available space.
3242 if (space > remainder)
3244 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3249 while (remainder > 0) {
3251 * Allocate a new mbuf; could check space
3252 * and allocate a cluster instead.
3254 n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
3257 n->m_len = min(MJUMPAGESIZE, remainder);
3258 bcopy(cp, mtod(n, caddr_t), n->m_len);
3260 remainder -= n->m_len;
3264 if (m0->m_flags & M_PKTHDR)
3265 m0->m_pkthdr.len += len - remainder;
3267 return (remainder == 0);
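/*
 * Usage sketch (hypothetical caller; 'data'/'dlen' stand for a
 * received payload): extend a header mbuf with a payload that may not
 * fit in one buffer.  hv_m_append() grows the chain with MJUMPAGESIZE
 * clusters as needed and returns 1 on success:
 *
 *	if (!hv_m_append(m_new, dlen, data)) {
 *		m_freem(m_new);
 *		return;
 *	}
 */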
3270 #if defined(INET) || defined(INET6)
3272 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3274 #if __FreeBSD_version >= 1100095
3275 if (hn_lro_mbufq_depth) {
3276 tcp_lro_queue_mbuf(lc, m);
3280 return tcp_lro_rx(lc, m, 0);
3285 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3286 const struct hn_rxinfo *info)
3288 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3290 int size, do_lro = 0, do_csum = 1, is_vf = 0;
3291 int hash_type = M_HASHTYPE_NONE;
3294 if (rxr->hn_rxvf_ifp != NULL) {
3296 * Non-transparent mode VF; pretend this packet is from
3299 ifp = rxr->hn_rxvf_ifp;
3301 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3302 /* Transparent mode VF. */
3306 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3309 * See the NOTE of hn_rndis_init_fixat(). This
3310 * function can be reached immediately after the
3311 * RNDIS is initialized but before the ifnet is
3312 * set up on the hn_attach() path; drop the unexpected
3318 if (__predict_false(dlen < ETHER_HDR_LEN)) {
3319 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3323 if (dlen <= MHLEN) {
3324 m_new = m_gethdr(M_NOWAIT, MT_DATA);
3325 if (m_new == NULL) {
3326 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3329 memcpy(mtod(m_new, void *), data, dlen);
3330 m_new->m_pkthdr.len = m_new->m_len = dlen;
3331 rxr->hn_small_pkts++;
3334 * Get an mbuf with a cluster. For packets 2K or less,
3335 * get a standard 2K cluster. For anything larger, get a
3336 * 4K cluster. Any buffers larger than 4K can cause problems
3337 * if looped around to the Hyper-V TX channel, so avoid them.
3340 if (dlen > MCLBYTES) {
3342 size = MJUMPAGESIZE;
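/*
 * Sizing sketch (assuming the common 2KB MCLBYTES and 4KB
 * MJUMPAGESIZE, with 'size' defaulting to the standard cluster): a
 * 1500-byte frame stays on the 2K-cluster path, while a 4000-byte
 * coalesced frame takes the MJUMPAGESIZE branch above; both remain
 * safe to loop back to the Hyper-V TX channel.
 */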
3345 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3346 if (m_new == NULL) {
3347 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3351 hv_m_append(m_new, dlen, data);
3353 m_new->m_pkthdr.rcvif = ifp;
3355 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3358 /* receive side checksum offload */
3359 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3360 /* IP csum offload */
3361 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3362 m_new->m_pkthdr.csum_flags |=
3363 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3367 /* TCP/UDP csum offload */
3368 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3369 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3370 m_new->m_pkthdr.csum_flags |=
3371 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3372 m_new->m_pkthdr.csum_data = 0xffff;
3373 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3381 * As of this writing (Oct 28th, 2016), the host side will turn
3382 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3383 * the do_lro setting here is actually _not_ accurate. We
3384 * depend on the RSS hash type check to reset do_lro.
3386 if ((info->csum_info &
3387 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3388 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3391 const struct ether_header *eh;
3396 /* Checked at the beginning of this function. */
3397 KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
3399 eh = mtod(m_new, struct ether_header *);
3400 etype = ntohs(eh->ether_type);
3401 if (etype == ETHERTYPE_VLAN) {
3402 const struct ether_vlan_header *evl;
3404 hoff = sizeof(*evl);
3405 if (m_new->m_len < hoff)
3407 evl = mtod(m_new, struct ether_vlan_header *);
3408 etype = ntohs(evl->evl_proto);
3411 if (etype == ETHERTYPE_IP) {
3414 pr = hn_check_iplen(m_new, hoff);
3415 if (pr == IPPROTO_TCP) {
3417 (rxr->hn_trust_hcsum &
3418 HN_TRUST_HCSUM_TCP)) {
3419 rxr->hn_csum_trusted++;
3420 m_new->m_pkthdr.csum_flags |=
3421 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3422 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3423 m_new->m_pkthdr.csum_data = 0xffff;
3426 } else if (pr == IPPROTO_UDP) {
3428 (rxr->hn_trust_hcsum &
3429 HN_TRUST_HCSUM_UDP)) {
3430 rxr->hn_csum_trusted++;
3431 m_new->m_pkthdr.csum_flags |=
3432 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3433 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3434 m_new->m_pkthdr.csum_data = 0xffff;
3436 } else if (pr != IPPROTO_DONE && do_csum &&
3437 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3438 rxr->hn_csum_trusted++;
3439 m_new->m_pkthdr.csum_flags |=
3440 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3445 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3446 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3447 NDIS_VLAN_INFO_ID(info->vlan_info),
3448 NDIS_VLAN_INFO_PRI(info->vlan_info),
3449 NDIS_VLAN_INFO_CFI(info->vlan_info));
3450 m_new->m_flags |= M_VLANTAG;
3454 * If VF is activated (transparent/non-transparent mode does not
3459 * hn(4) will only receive broadcast packets, multicast packets,
3460 * TCP SYN and SYN|ACK (in Azure), LRO is useless for these
3463 * For non-transparent, we definitely _cannot_ enable LRO at
3464 * all, since the LRO flush will use hn(4) as the receiving
3465 * interface; i.e. hn_ifp->if_input(hn_ifp, m).
3471 * If VF is activated (transparent/non-transparent mode does not
3472 * matter here), do _not_ mess with unsupported hash types or
3475 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3477 m_new->m_pkthdr.flowid = info->hash_value;
3479 hash_type = M_HASHTYPE_OPAQUE;
3480 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3481 NDIS_HASH_FUNCTION_TOEPLITZ) {
3482 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3487 * do_lro is reset if the hash types are not TCP
3488 * related. See the comment in the above csum_flags
3492 case NDIS_HASH_IPV4:
3493 hash_type = M_HASHTYPE_RSS_IPV4;
3497 case NDIS_HASH_TCP_IPV4:
3498 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3501 case NDIS_HASH_IPV6:
3502 hash_type = M_HASHTYPE_RSS_IPV6;
3506 case NDIS_HASH_IPV6_EX:
3507 hash_type = M_HASHTYPE_RSS_IPV6_EX;
3511 case NDIS_HASH_TCP_IPV6:
3512 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3515 case NDIS_HASH_TCP_IPV6_EX:
3516 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3520 } else if (!is_vf) {
3521 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3523 M_HASHTYPE_SET(m_new, hash_type);
3525 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3526 if (hn_ifp != ifp) {
3527 const struct ether_header *eh;
3530 * Non-transparent mode VF is activated.
3534 * Allow tapping on hn(4).
3536 ETHER_BPF_MTAP(hn_ifp, m_new);
3539 * Update hn(4)'s stats.
3541 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3542 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3543 /* Checked at the beginning of this function. */
3544 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3545 eh = mtod(m_new, struct ether_header *);
3546 if (ETHER_IS_MULTICAST(eh->ether_dhost))
3547 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3551 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3552 #if defined(INET) || defined(INET6)
3553 struct lro_ctrl *lro = &rxr->hn_lro;
3556 rxr->hn_lro_tried++;
3557 if (hn_lro_rx(lro, m_new) == 0) {
3564 ifp->if_input(ifp, m_new);
3570 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3572 struct hn_softc *sc = ifp->if_softc;
3573 struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3574 struct ifnet *vf_ifp;
3575 int mask, error = 0;
3576 struct ifrsskey *ifrk;
3577 struct ifrsshash *ifrh;
3581 if (ifr->ifr_mtu > HN_MTU_MAX) {
3588 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3593 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3594 /* Can't change MTU */
3600 if (ifp->if_mtu == ifr->ifr_mtu) {
3605 if (hn_xpnt_vf_isready(sc)) {
3606 vf_ifp = sc->hn_vf_ifp;
3608 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3609 sizeof(ifr_vf.ifr_name));
3610 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3614 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3615 vf_ifp->if_xname, ifr->ifr_mtu, error);
3621 * Suspend this interface before the synthetic parts
3627 * Detach the synthetic parts, i.e. NVS and RNDIS.
3629 hn_synth_detach(sc);
3632 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3633 * with the new MTU setting.
3635 error = hn_synth_attach(sc, ifr->ifr_mtu);
3642 * Commit the requested MTU, after the synthetic parts
3643 * have been successfully attached.
3645 ifp->if_mtu = ifr->ifr_mtu;
3648 * Synthetic parts' reattach may change the chimney
3649 * sending size; update it.
3651 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3652 hn_set_chim_size(sc, sc->hn_chim_szmax);
3655 * Make sure that various parameters based on MTU are
3656 * still valid, after the MTU change.
3658 hn_mtu_change_fixup(sc);
3661 * All done! Resume the interface now.
3665 if ((sc->hn_flags & HN_FLAG_RXVF) ||
3666 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3668 * Since we have reattached the NVS part,
3669 * change the datapath to VF again, in case
3670 * it was lost after the NVS was detached.
3672 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3681 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3686 if (hn_xpnt_vf_isready(sc))
3687 hn_xpnt_vf_saveifflags(sc);
3689 if (ifp->if_flags & IFF_UP) {
3690 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3692 * Caller might hold a mutex, e.g.
3693 * bpf; use busy-wait for the RNDIS
3697 hn_rxfilter_config(sc);
3700 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3701 error = hn_xpnt_vf_iocsetflags(sc);
3706 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3709 sc->hn_if_flags = ifp->if_flags;
3717 if (hn_xpnt_vf_isready(sc)) {
3719 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3720 sizeof(ifr_vf.ifr_name));
3721 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3727 * Fix up requested capabilities w/ supported capabilities,
3728 * since the supported capabilities could have been changed.
3730 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3733 if (mask & IFCAP_TXCSUM) {
3734 ifp->if_capenable ^= IFCAP_TXCSUM;
3735 if (ifp->if_capenable & IFCAP_TXCSUM)
3736 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3738 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3740 if (mask & IFCAP_TXCSUM_IPV6) {
3741 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3742 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3743 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3745 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3748 /* TODO: flip RNDIS offload parameters for RXCSUM. */
3749 if (mask & IFCAP_RXCSUM)
3750 ifp->if_capenable ^= IFCAP_RXCSUM;
3752 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
3753 if (mask & IFCAP_RXCSUM_IPV6)
3754 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3757 if (mask & IFCAP_LRO)
3758 ifp->if_capenable ^= IFCAP_LRO;
3760 if (mask & IFCAP_TSO4) {
3761 ifp->if_capenable ^= IFCAP_TSO4;
3762 if (ifp->if_capenable & IFCAP_TSO4)
3763 ifp->if_hwassist |= CSUM_IP_TSO;
3765 ifp->if_hwassist &= ~CSUM_IP_TSO;
3767 if (mask & IFCAP_TSO6) {
3768 ifp->if_capenable ^= IFCAP_TSO6;
3769 if (ifp->if_capenable & IFCAP_TSO6)
3770 ifp->if_hwassist |= CSUM_IP6_TSO;
3772 ifp->if_hwassist &= ~CSUM_IP6_TSO;
3782 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3786 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3788 * Multicast uses mutex; use busy-wait for
3792 hn_rxfilter_config(sc);
3796 /* XXX vlan(4) style mcast addr maintenance */
3797 if (hn_xpnt_vf_isready(sc)) {
3800 old_if_flags = sc->hn_vf_ifp->if_flags;
3801 hn_xpnt_vf_saveifflags(sc);
3803 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3804 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3806 error = hn_xpnt_vf_iocsetflags(sc);
3815 if (hn_xpnt_vf_isready(sc)) {
3817 * SIOCGIFMEDIA expects ifmediareq, so don't
3818 * create and pass ifr_vf to the VF here; just
3819 * replace the ifr_name.
3821 vf_ifp = sc->hn_vf_ifp;
3822 strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3823 sizeof(ifr->ifr_name));
3824 error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3825 /* Restore the ifr_name. */
3826 strlcpy(ifr->ifr_name, ifp->if_xname,
3827 sizeof(ifr->ifr_name));
3832 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3835 case SIOCGIFRSSHASH:
3836 ifrh = (struct ifrsshash *)data;
3838 if (sc->hn_rx_ring_inuse == 1) {
3840 ifrh->ifrh_func = RSS_FUNC_NONE;
3841 ifrh->ifrh_types = 0;
3845 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3846 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3848 ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3849 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3854 ifrk = (struct ifrsskey *)data;
3856 if (sc->hn_rx_ring_inuse == 1) {
3858 ifrk->ifrk_func = RSS_FUNC_NONE;
3859 ifrk->ifrk_keylen = 0;
3862 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3863 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3865 ifrk->ifrk_func = RSS_FUNC_PRIVATE;
3866 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
3867 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
3868 NDIS_HASH_KEYSIZE_TOEPLITZ);
3873 error = ether_ioctl(ifp, cmd, data);
3880 hn_stop(struct hn_softc *sc, bool detaching)
3882 struct ifnet *ifp = sc->hn_ifp;
3887 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
3888 ("synthetic parts were not attached"));
3890 /* Clear RUNNING bit ASAP. */
3891 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3893 /* Disable polling. */
3896 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
3897 KASSERT(sc->hn_vf_ifp != NULL,
3898 ("%s: VF is not attached", ifp->if_xname));
3900 /* Mark transparent mode VF as disabled. */
3901 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
3905 * Datapath setting must happen _before_ bringing
3908 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
3911 * Bring the VF down.
3913 hn_xpnt_vf_saveifflags(sc);
3914 sc->hn_vf_ifp->if_flags &= ~IFF_UP;
3915 hn_xpnt_vf_iocsetflags(sc);
3918 /* Suspend data transfers. */
3919 hn_suspend_data(sc);
3921 /* Clear OACTIVE bit. */
3922 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3923 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3924 sc->hn_tx_ring[i].hn_oactive = 0;
3927 * If the non-transparent mode VF is active, make sure
3928 * that the RX filter still allows packet reception.
3930 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
3931 hn_rxfilter_config(sc);
3935 hn_init_locked(struct hn_softc *sc)
3937 struct ifnet *ifp = sc->hn_ifp;
3942 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
3945 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3948 /* Configure RX filter */
3949 hn_rxfilter_config(sc);
3951 /* Clear OACTIVE bit. */
3952 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3953 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3954 sc->hn_tx_ring[i].hn_oactive = 0;
3956 /* Clear TX 'suspended' bit. */
3957 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
3959 if (hn_xpnt_vf_isready(sc)) {
3960 /* Initialize transparent VF. */
3961 hn_xpnt_vf_init(sc);
3964 /* Everything is ready; unleash! */
3965 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3967 /* Re-enable polling if requested. */
3968 if (sc->hn_pollhz > 0)
3969 hn_polling(sc, sc->hn_pollhz);
3975 struct hn_softc *sc = xsc;
3982 #if __FreeBSD_version >= 1100099
3985 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
3987 struct hn_softc *sc = arg1;
3988 unsigned int lenlim;
3991 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
3992 error = sysctl_handle_int(oidp, &lenlim, 0, req);
3993 if (error || req->newptr == NULL)
3997 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
3998 lenlim > TCP_LRO_LENGTH_MAX) {
4002 hn_set_lro_lenlim(sc, lenlim);
4009 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4011 struct hn_softc *sc = arg1;
4012 int ackcnt, error, i;
4015 * lro_ackcnt_lim is the append count limit;
4016 * +1 turns it into the aggregation limit.
4018 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4019 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4020 if (error || req->newptr == NULL)
4023 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4027 * Convert aggregation limit back to append
4032 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4033 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
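/*
 * Worked example of the +1/-1 conversion (assuming the handler
 * converts the aggregation limit back to an append limit, as the
 * comments above describe): writing 3 to this sysctl stores
 * lro_ackcnt_lim == 2 on every RX ring, i.e. at most 2 pure ACKs are
 * appended to an LRO entry, so up to 3 ACKs total are aggregated
 * before a flush.
 */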
4041 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4043 struct hn_softc *sc = arg1;
4048 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4051 error = sysctl_handle_int(oidp, &on, 0, req);
4052 if (error || req->newptr == NULL)
4056 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4057 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4060 rxr->hn_trust_hcsum |= hcsum;
4062 rxr->hn_trust_hcsum &= ~hcsum;
4069 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4071 struct hn_softc *sc = arg1;
4072 int chim_size, error;
4074 chim_size = sc->hn_tx_ring[0].hn_chim_size;
4075 error = sysctl_handle_int(oidp, &chim_size, 0, req);
4076 if (error || req->newptr == NULL)
4079 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4083 hn_set_chim_size(sc, chim_size);
4088 #if __FreeBSD_version < 1100095
4090 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4092 struct hn_softc *sc = arg1;
4093 int ofs = arg2, i, error;
4094 struct hn_rx_ring *rxr;
4098 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
4099 rxr = &sc->hn_rx_ring[i];
4100 stat += *((int *)((uint8_t *)rxr + ofs));
4103 error = sysctl_handle_64(oidp, &stat, 0, req);
4104 if (error || req->newptr == NULL)
4107 /* Zero out this stat. */
4108 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
4109 rxr = &sc->hn_rx_ring[i];
4110 *((int *)((uint8_t *)rxr + ofs)) = 0;
4116 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4118 struct hn_softc *sc = arg1;
4119 int ofs = arg2, i, error;
4120 struct hn_rx_ring *rxr;
4124 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4125 rxr = &sc->hn_rx_ring[i];
4126 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4129 error = sysctl_handle_64(oidp, &stat, 0, req);
4130 if (error || req->newptr == NULL)
4133 /* Zero out this stat. */
4134 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4135 rxr = &sc->hn_rx_ring[i];
4136 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4144 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4146 struct hn_softc *sc = arg1;
4147 int ofs = arg2, i, error;
4148 struct hn_rx_ring *rxr;
4152 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4153 rxr = &sc->hn_rx_ring[i];
4154 stat += *((u_long *)((uint8_t *)rxr + ofs));
4157 error = sysctl_handle_long(oidp, &stat, 0, req);
4158 if (error || req->newptr == NULL)
4161 /* Zero out this stat. */
4162 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4163 rxr = &sc->hn_rx_ring[i];
4164 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
4170 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4172 struct hn_softc *sc = arg1;
4173 int ofs = arg2, i, error;
4174 struct hn_tx_ring *txr;
4178 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4179 txr = &sc->hn_tx_ring[i];
4180 stat += *((u_long *)((uint8_t *)txr + ofs));
4183 error = sysctl_handle_long(oidp, &stat, 0, req);
4184 if (error || req->newptr == NULL)
4187 /* Zero out this stat. */
4188 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4189 txr = &sc->hn_tx_ring[i];
4190 *((u_long *)((uint8_t *)txr + ofs)) = 0;
4196 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4198 struct hn_softc *sc = arg1;
4199 int ofs = arg2, i, error, conf;
4200 struct hn_tx_ring *txr;
4202 txr = &sc->hn_tx_ring[0];
4203 conf = *((int *)((uint8_t *)txr + ofs));
4205 error = sysctl_handle_int(oidp, &conf, 0, req);
4206 if (error || req->newptr == NULL)
4210 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4211 txr = &sc->hn_tx_ring[i];
4212 *((int *)((uint8_t *)txr + ofs)) = conf;
4220 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4222 struct hn_softc *sc = arg1;
4225 size = sc->hn_agg_size;
4226 error = sysctl_handle_int(oidp, &size, 0, req);
4227 if (error || req->newptr == NULL)
4231 sc->hn_agg_size = size;
4239 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4241 struct hn_softc *sc = arg1;
4244 pkts = sc->hn_agg_pkts;
4245 error = sysctl_handle_int(oidp, &pkts, 0, req);
4246 if (error || req->newptr == NULL)
4250 sc->hn_agg_pkts = pkts;
4258 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4260 struct hn_softc *sc = arg1;
4263 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4264 return (sysctl_handle_int(oidp, &pkts, 0, req));
4268 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4270 struct hn_softc *sc = arg1;
4273 align = sc->hn_tx_ring[0].hn_agg_align;
4274 return (sysctl_handle_int(oidp, &align, 0, req));
4278 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4281 vmbus_chan_poll_disable(chan);
4283 vmbus_chan_poll_enable(chan, pollhz);
4287 hn_polling(struct hn_softc *sc, u_int pollhz)
4289 int nsubch = sc->hn_rx_ring_inuse - 1;
4294 struct vmbus_channel **subch;
4297 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4298 for (i = 0; i < nsubch; ++i)
4299 hn_chan_polling(subch[i], pollhz);
4300 vmbus_subchan_rel(subch, nsubch);
4302 hn_chan_polling(sc->hn_prichan, pollhz);
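/*
 * Usage sketch (the unit number is illustrative): switch hn0 between
 * ~10kHz channel polling and interrupt-driven operation:
 *
 *	# sysctl dev.hn.0.polling=10000
 *	# sysctl dev.hn.0.polling=0
 *
 * hn_polling_sysctl() below rejects non-zero values outside the
 * advertised [100, 1000000] range and only reprograms the channels
 * when the frequency actually changes.
 */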
4306 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4308 struct hn_softc *sc = arg1;
4311 pollhz = sc->hn_pollhz;
4312 error = sysctl_handle_int(oidp, &pollhz, 0, req);
4313 if (error || req->newptr == NULL)
4317 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4321 if (sc->hn_pollhz != pollhz) {
4322 sc->hn_pollhz = pollhz;
4323 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4324 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4325 hn_polling(sc, sc->hn_pollhz);
4333 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4335 struct hn_softc *sc = arg1;
4338 snprintf(verstr, sizeof(verstr), "%u.%u",
4339 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4340 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4341 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
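/*
 * Usage sketch (unit number and version value are illustrative):
 *
 *	# sysctl dev.hn.0.ndis_version
 *	dev.hn.0.ndis_version: 6.30
 */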
4345 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4347 struct hn_softc *sc = arg1;
4354 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4355 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4359 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4361 struct hn_softc *sc = arg1;
4362 char assist_str[128];
4366 hwassist = sc->hn_ifp->if_hwassist;
4368 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4369 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4373 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4375 struct hn_softc *sc = arg1;
4376 char filter_str[128];
4380 filter = sc->hn_rx_filter;
4382 snprintf(filter_str, sizeof(filter_str), "%b", filter,
4384 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4388 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4390 struct hn_softc *sc = arg1;
4395 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4396 if (error || req->newptr == NULL)
4399 if ((sc->hn_flags & HN_FLAG_RXVF) ||
4400 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4402 * RSS key is synchronized w/ the VF's; don't allow users
4409 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4412 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4414 if (sc->hn_rx_ring_inuse > 1) {
4415 error = hn_rss_reconfig(sc);
4417 /* Not RSS capable, at least for now; just save the RSS key. */
4426 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4428 struct hn_softc *sc = arg1;
4433 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4434 if (error || req->newptr == NULL)
4438 * Don't allow RSS indirect table changes if this interface
4439 * is not currently RSS capable.
4441 if (sc->hn_rx_ring_inuse == 1) {
4446 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4449 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4451 hn_rss_ind_fixup(sc);
4452 error = hn_rss_reconfig(sc);
4459 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4461 struct hn_softc *sc = arg1;
4466 hash = sc->hn_rss_hash;
4468 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4469 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4473 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4475 struct hn_softc *sc = arg1;
4480 hash = sc->hn_rss_hcap;
4482 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4483 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4487 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4489 struct hn_softc *sc = arg1;
4494 hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4496 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4497 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4501 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4503 struct hn_softc *sc = arg1;
4504 char vf_name[IFNAMSIZ + 1];
4505 struct ifnet *vf_ifp;
4509 vf_ifp = sc->hn_vf_ifp;
4511 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4513 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4517 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4519 struct hn_softc *sc = arg1;
4520 char vf_name[IFNAMSIZ + 1];
4521 struct ifnet *vf_ifp;
4525 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4527 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4529 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4533 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4535 struct rm_priotracker pt;
4540 error = sysctl_wire_old_buffer(req, 0);
4544 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4548 rm_rlock(&hn_vfmap_lock, &pt);
4551 for (i = 0; i < hn_vfmap_size; ++i) {
4554 if (hn_vfmap[i] == NULL)
4557 ifp = ifnet_byindex(i);
4560 sbuf_printf(sb, "%s", ifp->if_xname);
4562 sbuf_printf(sb, " %s", ifp->if_xname);
4567 rm_runlock(&hn_vfmap_lock, &pt);
4569 error = sbuf_finish(sb);
4575 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4577 struct rm_priotracker pt;
4582 error = sysctl_wire_old_buffer(req, 0);
4586 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4590 rm_rlock(&hn_vfmap_lock, &pt);
4593 for (i = 0; i < hn_vfmap_size; ++i) {
4594 struct ifnet *ifp, *hn_ifp;
4596 hn_ifp = hn_vfmap[i];
4600 ifp = ifnet_byindex(i);
4603 sbuf_printf(sb, "%s:%s", ifp->if_xname,
4606 sbuf_printf(sb, " %s:%s", ifp->if_xname,
4613 rm_runlock(&hn_vfmap_lock, &pt);
4615 error = sbuf_finish(sb);
4621 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4623 struct hn_softc *sc = arg1;
4624 int error, onoff = 0;
4626 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4628 error = sysctl_handle_int(oidp, &onoff, 0, req);
4629 if (error || req->newptr == NULL)
4633 /* NOTE: hn_vf_lock for hn_transmit() */
4634 rm_wlock(&sc->hn_vf_lock);
4636 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4638 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4639 rm_wunlock(&sc->hn_vf_lock);
4646 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4648 struct hn_softc *sc = arg1;
4651 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4653 return (sysctl_handle_int(oidp, &enabled, 0, req));
4657 hn_check_iplen(const struct mbuf *m, int hoff)
4659 const struct ip *ip;
4660 int len, iphlen, iplen;
4661 const struct tcphdr *th;
4662 int thoff; /* TCP data offset */
4664 len = hoff + sizeof(struct ip);
4666 /* The packet must be at least the size of an IP header. */
4667 if (m->m_pkthdr.len < len)
4668 return IPPROTO_DONE;
4670 /* The fixed IP header must reside completely in the first mbuf. */
4672 return IPPROTO_DONE;
4674 ip = mtodo(m, hoff);
4676 /* Bound check the packet's stated IP header length. */
4677 iphlen = ip->ip_hl << 2;
4678 if (iphlen < sizeof(struct ip)) /* minimum header length */
4679 return IPPROTO_DONE;
4681 /* The full IP header must reside completely in the one mbuf. */
4682 if (m->m_len < hoff + iphlen)
4683 return IPPROTO_DONE;
4685 iplen = ntohs(ip->ip_len);
4688 * Check that the amount of data in the buffers is at
4689 * least as much as the IP header would have us expect.
4691 if (m->m_pkthdr.len < hoff + iplen)
4692 return IPPROTO_DONE;
4695 * Ignore IP fragments.
4697 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4698 return IPPROTO_DONE;
4701 * The TCP/IP or UDP/IP header must be entirely contained within
4702 * the first fragment of a packet.
4706 if (iplen < iphlen + sizeof(struct tcphdr))
4707 return IPPROTO_DONE;
4708 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4709 return IPPROTO_DONE;
4710 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4711 thoff = th->th_off << 2;
4712 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4713 return IPPROTO_DONE;
4714 if (m->m_len < hoff + iphlen + thoff)
4715 return IPPROTO_DONE;
4718 if (iplen < iphlen + sizeof(struct udphdr))
4719 return IPPROTO_DONE;
4720 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4721 return IPPROTO_DONE;
4725 return IPPROTO_DONE;
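/*
 * Worked example for the length checks above: a minimal IP header has
 * ip_hl == 5, i.e. iphlen == 5 << 2 == 20 bytes, and a minimal TCP
 * header likewise has th_off == 5, i.e. thoff == 20.  A 40-byte
 * TCP/IP datagram is therefore the smallest that survives both the
 * "iplen < iphlen + sizeof(struct tcphdr)" and the
 * "thoff + iphlen > iplen" checks; anything shorter is reported as
 * IPPROTO_DONE, so no host-side checksum trust is applied to it.
 */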
4732 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4734 struct sysctl_oid_list *child;
4735 struct sysctl_ctx_list *ctx;
4736 device_t dev = sc->hn_dev;
4737 #if defined(INET) || defined(INET6)
4738 #if __FreeBSD_version >= 1100095
4745 * Create RXBUF for reception.
4748 * - It is shared by all channels.
4749 * - A large enough buffer is allocated; certain versions of the NVS
4750 * may further limit the usable space.
4752 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4753 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4754 BUS_DMA_WAITOK | BUS_DMA_ZERO);
4755 if (sc->hn_rxbuf == NULL) {
4756 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4760 sc->hn_rx_ring_cnt = ring_cnt;
4761 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4763 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4764 M_DEVBUF, M_WAITOK | M_ZERO);
4766 #if defined(INET) || defined(INET6)
4767 #if __FreeBSD_version >= 1100095
4768 lroent_cnt = hn_lro_entry_count;
4769 if (lroent_cnt < TCP_LRO_ENTRIES)
4770 lroent_cnt = TCP_LRO_ENTRIES;
4772 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4774 #endif /* INET || INET6 */
4776 ctx = device_get_sysctl_ctx(dev);
4777 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4779 /* Create dev.hn.UNIT.rx sysctl tree */
4780 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4781 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4783 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4784 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4786 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4787 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4788 &rxr->hn_br_dma, BUS_DMA_WAITOK);
4789 if (rxr->hn_br == NULL) {
4790 device_printf(dev, "allocate bufring failed\n");
4794 if (hn_trust_hosttcp)
4795 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4796 if (hn_trust_hostudp)
4797 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4798 if (hn_trust_hostip)
4799 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4800 rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4801 rxr->hn_ifp = sc->hn_ifp;
4802 if (i < sc->hn_tx_ring_cnt)
4803 rxr->hn_txr = &sc->hn_tx_ring[i];
4804 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4805 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4807 rxr->hn_rxbuf = sc->hn_rxbuf;
4812 #if defined(INET) || defined(INET6)
4813 #if __FreeBSD_version >= 1100095
4814 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4815 hn_lro_mbufq_depth);
4817 tcp_lro_init(&rxr->hn_lro);
4818 rxr->hn_lro.ifp = sc->hn_ifp;
4820 #if __FreeBSD_version >= 1100099
4821 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4822 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4824 #endif /* INET || INET6 */
4826 if (sc->hn_rx_sysctl_tree != NULL) {
4830 * Create per RX ring sysctl tree:
4831 * dev.hn.UNIT.rx.RINGID
4833 snprintf(name, sizeof(name), "%d", i);
4834 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
4835 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
4836 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4838 if (rxr->hn_rx_sysctl_tree != NULL) {
4839 SYSCTL_ADD_ULONG(ctx,
4840 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4841 OID_AUTO, "packets", CTLFLAG_RW,
4842 &rxr->hn_pkts, "# of packets received");
4843 SYSCTL_ADD_ULONG(ctx,
4844 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4845 OID_AUTO, "rss_pkts", CTLFLAG_RW,
4847 "# of packets w/ RSS info received");
4848 SYSCTL_ADD_INT(ctx,
4849 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4850 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
4851 &rxr->hn_pktbuf_len, 0,
4852 "Temporary channel packet buffer length");
4857 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
4858 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4859 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
4860 #if __FreeBSD_version < 1100095
4861 hn_rx_stat_int_sysctl,
4862 #else
4863 hn_rx_stat_u64_sysctl,
4864 #endif
4865 "LU", "LRO queued");
4866 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
4867 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4868 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
4869 #if __FreeBSD_version < 1100095
4870 hn_rx_stat_int_sysctl,
4871 #else
4872 hn_rx_stat_u64_sysctl,
4873 #endif
4874 "LU", "LRO flushed");
4875 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
4876 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4877 __offsetof(struct hn_rx_ring, hn_lro_tried),
4878 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
4879 #if __FreeBSD_version >= 1100099
4880 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
4881 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4882 hn_lro_lenlim_sysctl, "IU",
4883 "Max # of data bytes to be aggregated by LRO");
4884 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
4885 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4886 hn_lro_ackcnt_sysctl, "I",
4887 "Max # of ACKs to be aggregated by LRO");
4889 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
4890 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
4891 hn_trust_hcsum_sysctl, "I",
4892 "Trust tcp segement verification on host side, "
4893 "when csum info is missing");
4894 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
4895 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
4896 hn_trust_hcsum_sysctl, "I",
4897 "Trust udp datagram verification on host side, "
4898 "when csum info is missing");
4899 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
4900 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
4901 hn_trust_hcsum_sysctl, "I",
4902 "Trust ip packet verification on host side, "
4903 "when csum info is missing");
4904 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
4905 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4906 __offsetof(struct hn_rx_ring, hn_csum_ip),
4907 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
4908 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
4909 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4910 __offsetof(struct hn_rx_ring, hn_csum_tcp),
4911 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
4912 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
4913 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4914 __offsetof(struct hn_rx_ring, hn_csum_udp),
4915 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
4916 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
4917 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4918 __offsetof(struct hn_rx_ring, hn_csum_trusted),
4919 hn_rx_stat_ulong_sysctl, "LU",
4920 "# of packets that we trust host's csum verification");
4921 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
4922 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4923 __offsetof(struct hn_rx_ring, hn_small_pkts),
4924 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
4925 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
4926 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4927 __offsetof(struct hn_rx_ring, hn_ack_failed),
4928 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
4929 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
4930 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
4931 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
4932 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
4938 hn_destroy_rx_data(struct hn_softc *sc)
4942 if (sc->hn_rxbuf != NULL) {
4943 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
4944 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
4945 else
4946 device_printf(sc->hn_dev, "RXBUF is referenced\n");
4947 sc->hn_rxbuf = NULL;
4950 if (sc->hn_rx_ring_cnt == 0)
4951 return;
4953 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4954 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4956 if (rxr->hn_br == NULL)
4957 continue;
4958 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
4959 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
4960 } else {
4961 device_printf(sc->hn_dev,
4962 "%dth channel bufring is referenced\n", i);
4963 }
4966 #if defined(INET) || defined(INET6)
4967 tcp_lro_free(&rxr->hn_lro);
4969 free(rxr->hn_pktbuf, M_DEVBUF);
4971 free(sc->hn_rx_ring, M_DEVBUF);
4972 sc->hn_rx_ring = NULL;
4974 sc->hn_rx_ring_cnt = 0;
4975 sc->hn_rx_ring_inuse = 0;
4979 hn_tx_ring_create(struct hn_softc *sc, int id)
4981 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
4982 device_t dev = sc->hn_dev;
4983 bus_dma_tag_t parent_dtag;
4987 txr->hn_tx_idx = id;
4989 #ifndef HN_USE_TXDESC_BUFRING
4990 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
4991 #endif
4992 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
4994 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
4995 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
4996 M_DEVBUF, M_WAITOK | M_ZERO);
4997 #ifndef HN_USE_TXDESC_BUFRING
4998 SLIST_INIT(&txr->hn_txlist);
4999 #else
5000 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5001 M_WAITOK, &txr->hn_tx_lock);
5002 #endif
5004 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5005 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5006 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5007 } else {
5008 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5009 }
5011 #ifdef HN_IFSTART_SUPPORT
5012 if (hn_use_if_start) {
5013 txr->hn_txeof = hn_start_txeof;
5014 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5015 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5016 } else
5017 #endif
5018 {
5019 int br_depth;
5021 txr->hn_txeof = hn_xmit_txeof;
5022 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5023 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5025 br_depth = hn_get_txswq_depth(txr);
5026 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5027 M_WAITOK, &txr->hn_tx_lock);
5028 }
5030 txr->hn_direct_tx_size = hn_direct_tx_size;
5033 * Always schedule transmission instead of trying to do direct
5034 * transmission. This one gives the best performance so far.
5036 txr->hn_sched_tx = 1;
5038 parent_dtag = bus_get_dma_tag(dev);
5040 /* DMA tag for RNDIS packet messages. */
5041 error = bus_dma_tag_create(parent_dtag, /* parent */
5042 HN_RNDIS_PKT_ALIGN, /* alignment */
5043 HN_RNDIS_PKT_BOUNDARY, /* boundary */
5044 BUS_SPACE_MAXADDR, /* lowaddr */
5045 BUS_SPACE_MAXADDR, /* highaddr */
5046 NULL, NULL, /* filter, filterarg */
5047 HN_RNDIS_PKT_LEN, /* maxsize */
5049 HN_RNDIS_PKT_LEN, /* maxsegsize */
5051 NULL, /* lockfunc */
5052 NULL, /* lockfuncarg */
5053 &txr->hn_tx_rndis_dtag);
5054 if (error) {
5055 device_printf(dev, "failed to create rndis dmatag\n");
5056 return error;
5057 }
5059 /* DMA tag for data. */
5060 error = bus_dma_tag_create(parent_dtag, /* parent */
5062 HN_TX_DATA_BOUNDARY, /* boundary */
5063 BUS_SPACE_MAXADDR, /* lowaddr */
5064 BUS_SPACE_MAXADDR, /* highaddr */
5065 NULL, NULL, /* filter, filterarg */
5066 HN_TX_DATA_MAXSIZE, /* maxsize */
5067 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
5068 HN_TX_DATA_SEGSIZE, /* maxsegsize */
5070 NULL, /* lockfunc */
5071 NULL, /* lockfuncarg */
5072 &txr->hn_tx_data_dtag);
5073 if (error) {
5074 device_printf(dev, "failed to create data dmatag\n");
5075 return error;
5076 }
5078 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5079 struct hn_txdesc *txd = &txr->hn_txdesc[i];
5082 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5083 STAILQ_INIT(&txd->agg_list);
5086 * Allocate and load RNDIS packet message.
5088 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5089 (void **)&txd->rndis_pkt,
5090 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5091 &txd->rndis_pkt_dmap);
5094 "failed to allocate rndis_packet_msg, %d\n", i);
5098 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5099 txd->rndis_pkt_dmap,
5100 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5101 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5105 "failed to load rndis_packet_msg, %d\n", i);
5106 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5107 txd->rndis_pkt, txd->rndis_pkt_dmap);
5111 /* DMA map for TX data. */
5112 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5116 "failed to allocate tx data dmamap\n");
5117 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5118 txd->rndis_pkt_dmap);
5119 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5120 txd->rndis_pkt, txd->rndis_pkt_dmap);
5124 /* All set, put it to list */
5125 txd->flags |= HN_TXD_FLAG_ONLIST;
5126 #ifndef HN_USE_TXDESC_BUFRING
5127 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5129 buf_ring_enqueue(txr->hn_txdesc_br, txd);
5132 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5134 if (sc->hn_tx_sysctl_tree != NULL) {
5135 struct sysctl_oid_list *child;
5136 struct sysctl_ctx_list *ctx;
5140 * Create per TX ring sysctl tree:
5141 * dev.hn.UNIT.tx.RINGID
5143 ctx = device_get_sysctl_ctx(dev);
5144 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5146 snprintf(name, sizeof(name), "%d", id);
5147 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5148 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5150 if (txr->hn_tx_sysctl_tree != NULL) {
5151 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5154 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5155 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5156 "# of available TX descs");
5158 #ifdef HN_IFSTART_SUPPORT
5159 if (!hn_use_if_start)
5162 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5163 CTLFLAG_RD, &txr->hn_oactive, 0,
5166 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5167 CTLFLAG_RW, &txr->hn_pkts,
5168 "# of packets transmitted");
5169 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5170 CTLFLAG_RW, &txr->hn_sends, "# of sends");
5178 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5180 struct hn_tx_ring *txr = txd->txr;
5182 KASSERT(txd->m == NULL, ("still has mbuf installed"));
5183 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5185 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5186 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5187 txd->rndis_pkt_dmap);
5188 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5192 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5195 KASSERT(txd->refs == 0 || txd->refs == 1,
5196 ("invalid txd refs %d", txd->refs));
5198 /* Aggregated txds will be freed by their aggregating txd. */
5199 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5200 int freed;
5202 freed = hn_txdesc_put(txr, txd);
5203 KASSERT(freed, ("can't free txdesc"));
5208 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5212 if (txr->hn_txdesc == NULL)
5217 * Because the freeing of aggregated txds will be deferred
5218 * to the aggregating txd, two passes are used here:
5219 * - The first pass GCes any pending txds. This GC is necessary,
5220 * since if the channels are revoked, the hypervisor will not
5221 * deliver send-done for all pending txds.
5222 * - The second pass frees the busdma stuff, i.e. after all txds
5223 * were freed.
5225 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5226 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5227 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5228 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5230 if (txr->hn_tx_data_dtag != NULL)
5231 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5232 if (txr->hn_tx_rndis_dtag != NULL)
5233 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5235 #ifdef HN_USE_TXDESC_BUFRING
5236 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5237 #endif
5239 free(txr->hn_txdesc, M_DEVBUF);
5240 txr->hn_txdesc = NULL;
5242 if (txr->hn_mbuf_br != NULL)
5243 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5245 #ifndef HN_USE_TXDESC_BUFRING
5246 mtx_destroy(&txr->hn_txlist_spin);
5247 #endif
5248 mtx_destroy(&txr->hn_tx_lock);
5252 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5254 struct sysctl_oid_list *child;
5255 struct sysctl_ctx_list *ctx;
5256 int i;
5259 * Create TXBUF for chimney sending.
5261 * NOTE: It is shared by all channels.
5263 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5264 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5265 BUS_DMA_WAITOK | BUS_DMA_ZERO);
5266 if (sc->hn_chim == NULL) {
5267 device_printf(sc->hn_dev, "allocate txbuf failed\n");
5271 sc->hn_tx_ring_cnt = ring_cnt;
5272 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5274 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5275 M_DEVBUF, M_WAITOK | M_ZERO);
5277 ctx = device_get_sysctl_ctx(sc->hn_dev);
5278 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5280 /* Create dev.hn.UNIT.tx sysctl tree */
5281 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5282 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5284 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5285 int error;
5287 error = hn_tx_ring_create(sc, i);
5288 if (error)
5289 return error;
5290 }
5292 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5293 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5294 __offsetof(struct hn_tx_ring, hn_no_txdescs),
5295 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5296 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5297 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5298 __offsetof(struct hn_tx_ring, hn_send_failed),
5299 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5300 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5301 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5302 __offsetof(struct hn_tx_ring, hn_txdma_failed),
5303 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5304 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5305 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5306 __offsetof(struct hn_tx_ring, hn_flush_failed),
5307 hn_tx_stat_ulong_sysctl, "LU",
5308 "# of packet transmission aggregation flush failure");
5309 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5310 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5311 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5312 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5313 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5314 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5315 __offsetof(struct hn_tx_ring, hn_tx_chimney),
5316 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5317 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5318 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5319 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5320 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5321 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5322 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5323 "# of total TX descs");
5324 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5325 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5326 "Chimney send packet size upper boundary");
5327 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5328 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5329 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5330 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5331 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5332 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5333 hn_tx_conf_int_sysctl, "I",
5334 "Size of the packet for direct transmission");
5335 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5336 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5337 __offsetof(struct hn_tx_ring, hn_sched_tx),
5338 hn_tx_conf_int_sysctl, "I",
5339 "Always schedule transmission "
5340 "instead of doing direct transmission");
5341 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5342 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5343 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5344 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5345 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5346 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5347 "Applied packet transmission aggregation size");
5348 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5349 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5350 hn_txagg_pktmax_sysctl, "I",
5351 "Applied packet transmission aggregation packets");
5352 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5353 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5354 hn_txagg_align_sysctl, "I",
5355 "Applied packet transmission aggregation alignment");
5361 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5365 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5366 sc->hn_tx_ring[i].hn_chim_size = chim_size;
5370 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5372 struct ifnet *ifp = sc->hn_ifp;
5373 u_int hw_tsomax;
5374 int tso_minlen;
5378 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5379 return;
5381 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5382 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5383 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5385 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5386 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5387 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5389 if (tso_maxlen < tso_minlen)
5390 tso_maxlen = tso_minlen;
5391 else if (tso_maxlen > IP_MAXPACKET)
5392 tso_maxlen = IP_MAXPACKET;
5393 if (tso_maxlen > sc->hn_ndis_tso_szmax)
5394 tso_maxlen = sc->hn_ndis_tso_szmax;
5395 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
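/*
 * Worked example (hypothetical numbers): with hn_ndis_tso_sgmin 2
 * and MTU 1500, tso_minlen is 3000; a requested tso_maxlen of 70000
 * is first clamped to IP_MAXPACKET (65535), then to the host NDIS
 * limit, before the Ethernet/VLAN header bytes are subtracted.
 */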
5397 if (hn_xpnt_vf_isready(sc)) {
5398 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5399 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5401 ifp->if_hw_tsomax = hw_tsomax;
5403 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5407 hn_fixup_tx_data(struct hn_softc *sc)
5409 uint64_t csum_assist;
5410 int i;
5412 hn_set_chim_size(sc, sc->hn_chim_szmax);
5413 if (hn_tx_chimney_size > 0 &&
5414 hn_tx_chimney_size < sc->hn_chim_szmax)
5415 hn_set_chim_size(sc, hn_tx_chimney_size);
5417 csum_assist = 0;
5418 if (sc->hn_caps & HN_CAP_IPCS)
5419 csum_assist |= CSUM_IP;
5420 if (sc->hn_caps & HN_CAP_TCP4CS)
5421 csum_assist |= CSUM_IP_TCP;
5422 if (sc->hn_caps & HN_CAP_UDP4CS)
5423 csum_assist |= CSUM_IP_UDP;
5424 if (sc->hn_caps & HN_CAP_TCP6CS)
5425 csum_assist |= CSUM_IP6_TCP;
5426 if (sc->hn_caps & HN_CAP_UDP6CS)
5427 csum_assist |= CSUM_IP6_UDP;
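/*
 * E.g. a host advertising HN_CAP_IPCS | HN_CAP_TCP4CS yields a
 * csum_assist of CSUM_IP | CSUM_IP_TCP for every TX ring.
 */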
5428 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5429 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5431 if (sc->hn_caps & HN_CAP_HASHVAL) {
5433 * Support HASHVAL pktinfo on TX path.
5436 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5437 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5438 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5443 hn_destroy_tx_data(struct hn_softc *sc)
5447 if (sc->hn_chim != NULL) {
5448 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5449 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5450 } else {
5451 device_printf(sc->hn_dev,
5452 "chimney sending buffer is referenced\n");
5453 }
5454 sc->hn_chim = NULL;
5457 if (sc->hn_tx_ring_cnt == 0)
5458 return;
5460 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5461 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5463 free(sc->hn_tx_ring, M_DEVBUF);
5464 sc->hn_tx_ring = NULL;
5466 sc->hn_tx_ring_cnt = 0;
5467 sc->hn_tx_ring_inuse = 0;
5470 #ifdef HN_IFSTART_SUPPORT
5473 hn_start_taskfunc(void *xtxr, int pending __unused)
5475 struct hn_tx_ring *txr = xtxr;
5477 mtx_lock(&txr->hn_tx_lock);
5478 hn_start_locked(txr, 0);
5479 mtx_unlock(&txr->hn_tx_lock);
5483 hn_start_locked(struct hn_tx_ring *txr, int len)
5485 struct hn_softc *sc = txr->hn_sc;
5486 struct ifnet *ifp = sc->hn_ifp;
5489 KASSERT(hn_use_if_start,
5490 ("hn_start_locked is called, when if_start is disabled"));
5491 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5492 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5493 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5495 if (__predict_false(txr->hn_suspended))
5498 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5502 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5503 struct hn_txdesc *txd;
5504 struct mbuf *m_head;
5507 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5511 if (len > 0 && m_head->m_pkthdr.len > len) {
5513 * This sending could be time consuming; let callers
5514 * dispatch this packet (and any following packets)
5515 * to the tx taskqueue.
5517 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5522 #if defined(INET6) || defined(INET)
5523 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5524 m_head = hn_tso_fixup(m_head);
5525 if (__predict_false(m_head == NULL)) {
5526 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5532 txd = hn_txdesc_get(txr);
5533 if (txd == NULL) {
5534 txr->hn_no_txdescs++;
5535 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5536 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5540 error = hn_encap(ifp, txr, txd, &m_head);
5542 /* Both txd and m_head are freed */
5543 KASSERT(txr->hn_agg_txd == NULL,
5544 ("encap failed w/ pending aggregating txdesc"));
5548 if (txr->hn_agg_pktleft == 0) {
5549 if (txr->hn_agg_txd != NULL) {
5550 KASSERT(m_head == NULL,
5551 ("pending mbuf for aggregating txdesc"));
5552 error = hn_flush_txagg(ifp, txr);
5553 if (__predict_false(error)) {
5554 atomic_set_int(&ifp->if_drv_flags,
5555 IFF_DRV_OACTIVE);
5556 break;
5557 }
5558 } else {
5559 KASSERT(m_head != NULL, ("mbuf was freed"));
5560 error = hn_txpkt(ifp, txr, txd);
5561 if (__predict_false(error)) {
5562 /* txd is freed, but m_head is not */
5563 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5564 atomic_set_int(&ifp->if_drv_flags,
5565 IFF_DRV_OACTIVE);
5566 break;
5572 KASSERT(txr->hn_agg_txd != NULL,
5573 ("no aggregating txdesc"));
5574 KASSERT(m_head == NULL,
5575 ("pending mbuf for aggregating txdesc"));
5580 /* Flush pending aggregated transmission. */
5581 if (txr->hn_agg_txd != NULL)
5582 hn_flush_txagg(ifp, txr);
5587 hn_start(struct ifnet *ifp)
5589 struct hn_softc *sc = ifp->if_softc;
5590 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5592 if (txr->hn_sched_tx)
5595 if (mtx_trylock(&txr->hn_tx_lock)) {
5598 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5599 mtx_unlock(&txr->hn_tx_lock);
5604 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5608 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5610 struct hn_tx_ring *txr = xtxr;
5612 mtx_lock(&txr->hn_tx_lock);
5613 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5614 hn_start_locked(txr, 0);
5615 mtx_unlock(&txr->hn_tx_lock);
5619 hn_start_txeof(struct hn_tx_ring *txr)
5621 struct hn_softc *sc = txr->hn_sc;
5622 struct ifnet *ifp = sc->hn_ifp;
5624 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5626 if (txr->hn_sched_tx)
5629 if (mtx_trylock(&txr->hn_tx_lock)) {
5632 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5633 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5634 mtx_unlock(&txr->hn_tx_lock);
5636 taskqueue_enqueue(txr->hn_tx_taskq,
5642 * Release the OACTIVE flag earlier, in the hope that
5643 * others can catch up. The task will clear the
5644 * flag again with the hn_tx_lock held to avoid possible
5645 * races.
5646 */
5647 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5648 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5652 #endif /* HN_IFSTART_SUPPORT */
5655 hn_xmit(struct hn_tx_ring *txr, int len)
5657 struct hn_softc *sc = txr->hn_sc;
5658 struct ifnet *ifp = sc->hn_ifp;
5659 struct mbuf *m_head;
5662 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5663 #ifdef HN_IFSTART_SUPPORT
5664 KASSERT(hn_use_if_start == 0,
5665 ("hn_xmit is called, when if_start is enabled"));
5667 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5669 if (__predict_false(txr->hn_suspended))
5672 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5675 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5676 struct hn_txdesc *txd;
5679 if (len > 0 && m_head->m_pkthdr.len > len) {
5681 * This sending could be time consuming; let callers
5682 * dispatch this packet (and any following packets)
5683 * to the tx taskqueue.
5685 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5690 txd = hn_txdesc_get(txr);
5691 if (txd == NULL) {
5692 txr->hn_no_txdescs++;
5693 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5694 txr->hn_oactive = 1;
5698 error = hn_encap(ifp, txr, txd, &m_head);
5700 /* Both txd and m_head are freed; discard */
5701 KASSERT(txr->hn_agg_txd == NULL,
5702 ("encap failed w/ pending aggregating txdesc"));
5703 drbr_advance(ifp, txr->hn_mbuf_br);
5707 if (txr->hn_agg_pktleft == 0) {
5708 if (txr->hn_agg_txd != NULL) {
5709 KASSERT(m_head == NULL,
5710 ("pending mbuf for aggregating txdesc"));
5711 error = hn_flush_txagg(ifp, txr);
5712 if (__predict_false(error)) {
5713 txr->hn_oactive = 1;
5714 break;
5715 }
5716 } else {
5717 KASSERT(m_head != NULL, ("mbuf was freed"));
5718 error = hn_txpkt(ifp, txr, txd);
5719 if (__predict_false(error)) {
5720 /* txd is freed, but m_head is not */
5721 drbr_putback(ifp, txr->hn_mbuf_br,
5722 m_head);
5723 txr->hn_oactive = 1;
5730 KASSERT(txr->hn_agg_txd != NULL,
5731 ("no aggregating txdesc"));
5732 KASSERT(m_head == NULL,
5733 ("pending mbuf for aggregating txdesc"));
5738 drbr_advance(ifp, txr->hn_mbuf_br);
5741 /* Flush pending aggregated transmission. */
5742 if (txr->hn_agg_txd != NULL)
5743 hn_flush_txagg(ifp, txr);
5748 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5750 struct hn_softc *sc = ifp->if_softc;
5751 struct hn_tx_ring *txr;
5754 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5755 struct rm_priotracker pt;
5757 rm_rlock(&sc->hn_vf_lock, &pt);
5758 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5759 struct mbuf *m_bpf = NULL;
5762 obytes = m->m_pkthdr.len;
5763 if (m->m_flags & M_MCAST)
5766 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5767 if (bpf_peers_present(ifp->if_bpf)) {
5768 m_bpf = m_copypacket(m, M_NOWAIT);
5769 if (m_bpf == NULL) {
5771 * Failed to grab a shallow
5774 ETHER_BPF_MTAP(ifp, m);
5778 ETHER_BPF_MTAP(ifp, m);
5781 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5782 rm_runlock(&sc->hn_vf_lock, &pt);
5784 if (m_bpf != NULL) {
5786 ETHER_BPF_MTAP(ifp, m_bpf);
5790 if (error == ENOBUFS) {
5791 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5793 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5795 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5796 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5798 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5804 rm_runlock(&sc->hn_vf_lock, &pt);
5807 #if defined(INET6) || defined(INET)
5809 * Perform TSO packet header fixup now, since the TSO
5810 * packet header should be cache-hot.
5812 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5813 m = hn_tso_fixup(m);
5814 if (__predict_false(m == NULL)) {
5815 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5822 * Select the TX ring based on flowid
5824 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
5825 #if defined(INET6) || defined(INET)
5828 if (m->m_pkthdr.len < 128 &&
5829 (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
5830 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
5831 m = hn_check_tcpsyn(m, &tcpsyn);
5832 if (__predict_false(m == NULL)) {
5833 if_inc_counter(ifp,
5834 IFCOUNTER_OERRORS, 1);
5839 const int tcpsyn = 0;
5844 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
5846 txr = &sc->hn_tx_ring[idx];
5848 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
5850 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5854 if (txr->hn_oactive)
5857 if (txr->hn_sched_tx)
5860 if (mtx_trylock(&txr->hn_tx_lock)) {
5863 sched = hn_xmit(txr, txr->hn_direct_tx_size);
5864 mtx_unlock(&txr->hn_tx_lock);
5869 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5874 hn_tx_ring_qflush(struct hn_tx_ring *txr)
5876 struct mbuf *m;
5878 mtx_lock(&txr->hn_tx_lock);
5879 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
5880 m_freem(m);
5881 mtx_unlock(&txr->hn_tx_lock);
5885 hn_xmit_qflush(struct ifnet *ifp)
5887 struct hn_softc *sc = ifp->if_softc;
5888 struct rm_priotracker pt;
5889 int i;
5891 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
5892 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5895 rm_rlock(&sc->hn_vf_lock, &pt);
5896 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
5897 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
5898 rm_runlock(&sc->hn_vf_lock, &pt);
5902 hn_xmit_txeof(struct hn_tx_ring *txr)
5905 if (txr->hn_sched_tx)
5908 if (mtx_trylock(&txr->hn_tx_lock)) {
5911 txr->hn_oactive = 0;
5912 sched = hn_xmit(txr, txr->hn_direct_tx_size);
5913 mtx_unlock(&txr->hn_tx_lock);
5915 taskqueue_enqueue(txr->hn_tx_taskq,
5921 * Release the oactive flag earlier, in the hope that
5922 * others can catch up. The task will clear the
5923 * oactive flag again with the hn_tx_lock held to avoid
5924 * possible races.
5925 */
5926 txr->hn_oactive = 0;
5927 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5932 hn_xmit_taskfunc(void *xtxr, int pending __unused)
5934 struct hn_tx_ring *txr = xtxr;
5936 mtx_lock(&txr->hn_tx_lock);
5937 hn_xmit(txr, 0);
5938 mtx_unlock(&txr->hn_tx_lock);
5942 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
5944 struct hn_tx_ring *txr = xtxr;
5946 mtx_lock(&txr->hn_tx_lock);
5947 txr->hn_oactive = 0;
5948 hn_xmit(txr, 0);
5949 mtx_unlock(&txr->hn_tx_lock);
5953 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
5955 struct vmbus_chan_br cbr;
5956 struct hn_rx_ring *rxr;
5957 struct hn_tx_ring *txr = NULL;
5960 idx = vmbus_chan_subidx(chan);
5963 * Link this channel to RX/TX ring.
5965 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
5966 ("invalid channel index %d, should > 0 && < %d",
5967 idx, sc->hn_rx_ring_inuse));
5968 rxr = &sc->hn_rx_ring[idx];
5969 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
5970 ("RX ring %d already attached", idx));
5971 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
5972 rxr->hn_chan = chan;
5975 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
5976 idx, vmbus_chan_id(chan));
5979 if (idx < sc->hn_tx_ring_inuse) {
5980 txr = &sc->hn_tx_ring[idx];
5981 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
5982 ("TX ring %d already attached", idx));
5983 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
5985 txr->hn_chan = chan;
5987 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
5988 idx, vmbus_chan_id(chan));
5992 /* Bind this channel to a proper CPU. */
5993 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
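/*
 * Open this channel.  The single bufring allocated at RX ring
 * creation is handed to the channel: its TX part first, then
 * its RX part.
 */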
5998 cbr.cbr = rxr->hn_br;
5999 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6000 cbr.cbr_txsz = HN_TXBR_SIZE;
6001 cbr.cbr_rxsz = HN_RXBR_SIZE;
6002 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6004 if (error == EISCONN) {
6005 if_printf(sc->hn_ifp, "bufring is connected after "
6006 "chan%u open failure\n", vmbus_chan_id(chan));
6007 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6009 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6010 vmbus_chan_id(chan), error);
6017 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6019 struct hn_rx_ring *rxr;
6022 idx = vmbus_chan_subidx(chan);
6025 * Link this channel to RX/TX ring.
6027 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6028 ("invalid channel index %d, should > 0 && < %d",
6029 idx, sc->hn_rx_ring_inuse));
6030 rxr = &sc->hn_rx_ring[idx];
6031 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6032 ("RX ring %d is not attached", idx));
6033 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6035 if (idx < sc->hn_tx_ring_inuse) {
6036 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6038 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6039 ("TX ring %d is not attached attached", idx));
6040 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6044 * Close this channel.
6047 * Channel closing does _not_ destroy the target channel.
6049 error = vmbus_chan_close_direct(chan);
6050 if (error == EISCONN) {
6051 if_printf(sc->hn_ifp, "chan%u bufring is connected "
6052 "after being closed\n", vmbus_chan_id(chan));
6053 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6055 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6056 vmbus_chan_id(chan), error);
6061 hn_attach_subchans(struct hn_softc *sc)
6063 struct vmbus_channel **subchans;
6064 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6067 KASSERT(subchan_cnt > 0, ("no sub-channels"));
6069 /* Attach the sub-channels. */
6070 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6071 for (i = 0; i < subchan_cnt; ++i) {
6074 error1 = hn_chan_attach(sc, subchans[i]);
6077 /* Move on; all channels will be detached later. */
6080 vmbus_subchan_rel(subchans, subchan_cnt);
6083 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6086 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6094 hn_detach_allchans(struct hn_softc *sc)
6096 struct vmbus_channel **subchans;
6097 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6100 if (subchan_cnt == 0)
6103 /* Detach the sub-channels. */
6104 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6105 for (i = 0; i < subchan_cnt; ++i)
6106 hn_chan_detach(sc, subchans[i]);
6107 vmbus_subchan_rel(subchans, subchan_cnt);
6111 * Detach the primary channel, _after_ all sub-channels
6114 hn_chan_detach(sc, sc->hn_prichan);
6116 /* Wait for sub-channels to be destroyed, if any. */
6117 vmbus_subchan_drain(sc->hn_prichan);
6120 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6121 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6122 HN_RX_FLAG_ATTACHED) == 0,
6123 ("%dth RX ring is still attached", i));
6125 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6126 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6127 HN_TX_FLAG_ATTACHED) == 0,
6128 ("%dth TX ring is still attached", i));
6134 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6136 struct vmbus_channel **subchans;
6137 int nchan, rxr_cnt, error;
6139 nchan = *nsubch + 1;
6142 * Multiple RX/TX rings are not requested.
6149 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6152 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6154 /* No RSS; this is benign. */
6159 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6163 if (nchan > rxr_cnt)
6166 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6172 * Allocate sub-channels from NVS.
6174 *nsubch = nchan - 1;
6175 error = hn_nvs_alloc_subchans(sc, nsubch);
6176 if (error || *nsubch == 0) {
6177 /* Failed to allocate sub-channels. */
6183 * Wait for all sub-channels to become ready before moving on.
6185 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6186 vmbus_subchan_rel(subchans, *nsubch);
6191 hn_synth_attachable(const struct hn_softc *sc)
6195 if (sc->hn_flags & HN_FLAG_ERRORS)
6196 return (false);
6198 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6199 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6201 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6202 return (false);
6208 * Make sure that the RX filter is zero after the successful
6209 * RNDIS initialization.
6212 * Under certain conditions on certain versions of Hyper-V,
6213 * the RNDIS rxfilter is _not_ zero on the hypervisor side
6214 * after the successful RNDIS initialization, which breaks
6215 * the assumption of any following code (well, it breaks the
6216 * RNDIS API contract actually). Clear the RNDIS rxfilter
6217 * explicitly, drain packets sneaking through, and drain the
6218 * interrupt taskqueues scheduled due to the stealth packets.
6221 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6224 hn_rndis_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
6225 hn_drain_rxtx(sc, nchan);
6229 hn_synth_attach(struct hn_softc *sc, int mtu)
6231 #define ATTACHED_NVS 0x0002
6232 #define ATTACHED_RNDIS 0x0004
6234 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6235 int error, nsubch, nchan = 1, i, rndis_inited;
6236 uint32_t old_caps, attached = 0;
6238 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6239 ("synthetic parts were attached"));
6241 if (!hn_synth_attachable(sc))
6244 /* Save capabilities for later verification. */
6245 old_caps = sc->hn_caps;
6248 /* Clear RSS stuffs. */
6249 sc->hn_rss_ind_size = 0;
6250 sc->hn_rss_hash = 0;
6251 sc->hn_rss_hcap = 0;
6254 * Attach the primary channel _before_ attaching NVS and RNDIS.
6256 error = hn_chan_attach(sc, sc->hn_prichan);
6263 error = hn_nvs_attach(sc, mtu);
6266 attached |= ATTACHED_NVS;
6269 * Attach RNDIS _after_ NVS is attached.
6271 error = hn_rndis_attach(sc, mtu, &rndis_inited);
6273 attached |= ATTACHED_RNDIS;
6278 * Make sure capabilities are not changed.
6280 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6281 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6282 old_caps, sc->hn_caps);
6288 * Allocate sub-channels for multi-TX/RX rings.
6291 * The # of RX rings that can be used is equivalent to the # of
6292 * channels to be requested.
6294 nsubch = sc->hn_rx_ring_cnt - 1;
6295 error = hn_synth_alloc_subchans(sc, &nsubch);
6298 /* NOTE: _Full_ synthetic parts detach is required now. */
6299 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6302 * Set the # of TX/RX rings that could be used according to
6303 * the # of channels that NVS offered.
6306 hn_set_ring_inuse(sc, nchan);
6308 /* Only the primary channel can be used; done */
6313 * Attach the sub-channels.
6315 * NOTE: hn_set_ring_inuse() _must_ have been called.
6317 error = hn_attach_subchans(sc);
6322 * Configure RSS key and indirect table _after_ all sub-channels
6325 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6327 * RSS key is not set yet; set it to the default RSS key.
6330 if_printf(sc->hn_ifp, "setup default RSS key\n");
6331 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6332 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6335 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6337 * RSS indirect table is not set yet; set it up in round-
6338 * robin style.
6341 if_printf(sc->hn_ifp, "setup default RSS indirect "
6342 "table\n");
6344 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
6345 rss->rss_ind[i] = i % nchan;
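/* E.g. with 4 channels the table reads 0,1,2,3,0,1,2,3,... */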
6346 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6349 * # of usable channels may be changed, so we have to
6350 * make sure that all entries in the RSS indirect table
6351 * are valid.
6353 * NOTE: hn_set_ring_inuse() _must_ have been called.
6355 hn_rss_ind_fixup(sc);
6358 sc->hn_rss_hash = sc->hn_rss_hcap;
6359 if ((sc->hn_flags & HN_FLAG_RXVF) ||
6360 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6361 /* NOTE: Don't reconfigure RSS; will do immediately. */
6362 hn_vf_rss_fixup(sc, false);
6364 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6369 * Fixup transmission aggregation setup.
6371 hn_set_txagg(sc);
6372 hn_rndis_init_fixat(sc, nchan);
6373 return (0);
6375 failed:
6376 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6377 hn_rndis_init_fixat(sc, nchan);
6378 hn_synth_detach(sc);
6380 if (attached & ATTACHED_RNDIS) {
6381 hn_rndis_init_fixat(sc, nchan);
6382 hn_rndis_detach(sc);
6384 if (attached & ATTACHED_NVS)
6385 hn_nvs_detach(sc);
6386 hn_chan_detach(sc, sc->hn_prichan);
6387 /* Restore old capabilities. */
6388 sc->hn_caps = old_caps;
6389 }
6390 return (error);
6392 #undef ATTACHED_RNDIS
6393 #undef ATTACHED_NVS
6398 * The interface must have been suspended through hn_suspend(),
6399 * before this function gets called.
6402 hn_synth_detach(struct hn_softc *sc)
6405 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6406 ("synthetic parts were not attached"));
6408 /* Detach the RNDIS first. */
6409 hn_rndis_detach(sc);
6411 /* Detach NVS. */
6412 hn_nvs_detach(sc);
6414 /* Detach all of the channels. */
6415 hn_detach_allchans(sc);
6417 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6421 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6423 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6424 ("invalid ring count %d", ring_cnt));
6426 if (sc->hn_tx_ring_cnt > ring_cnt)
6427 sc->hn_tx_ring_inuse = ring_cnt;
6429 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6430 sc->hn_rx_ring_inuse = ring_cnt;
6433 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6434 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6439 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6444 * The TX bufring will not be drained by the hypervisor,
6445 * if the primary channel is revoked.
6447 while (!vmbus_chan_rx_empty(chan) ||
6448 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6449 !vmbus_chan_tx_empty(chan)))
6450 pause("waitch", 1);
6451 vmbus_chan_intr_drain(chan);
6455 hn_disable_rx(struct hn_softc *sc)
6459 * Disable RX by clearing RX filter forcefully.
6461 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6462 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6465 * Give RNDIS enough time to flush all pending data packets.
6467 pause("waitrx", (200 * hz) / 1000);
6472 * RX/TX _must_ have been suspended/disabled, before this function
6476 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6478 struct vmbus_channel **subch = NULL;
6479 int nsubch;
6482 * Drain RX/TX bufrings and interrupts.
6484 nsubch = nchan - 1;
6485 if (nsubch > 0)
6486 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6488 if (subch != NULL) {
6491 for (i = 0; i < nsubch; ++i)
6492 hn_chan_drain(sc, subch[i]);
6494 hn_chan_drain(sc, sc->hn_prichan);
6496 if (subch != NULL)
6497 vmbus_subchan_rel(subch, nsubch);
6501 hn_suspend_data(struct hn_softc *sc)
6503 struct hn_tx_ring *txr;
6504 int i;
6511 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6512 txr = &sc->hn_tx_ring[i];
6514 mtx_lock(&txr->hn_tx_lock);
6515 txr->hn_suspended = 1;
6516 mtx_unlock(&txr->hn_tx_lock);
6517 /* No one is able to send more packets now. */
6520 * Wait for all pending sends to finish.
6523 * We will _not_ receive all pending send-done, if the
6524 * primary channel is revoked.
6526 while (hn_tx_ring_pending(txr) &&
6527 !vmbus_chan_is_revoked(sc->hn_prichan))
6528 pause("hnwtx", 1 /* 1 tick */);
6539 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6542 * Drain any pending TX tasks.
6545 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6546 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6548 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6549 txr = &sc->hn_tx_ring[i];
6551 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6552 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6557 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6560 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6564 hn_suspend_mgmt(struct hn_softc *sc)
6571 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6572 * through hn_mgmt_taskq.
6574 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6575 vmbus_chan_run_task(sc->hn_prichan, &task);
6578 * Make sure that all pending management tasks are completed.
6580 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6581 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6582 taskqueue_drain_all(sc->hn_mgmt_taskq0);
6586 hn_suspend(struct hn_softc *sc)
6589 /* Disable polling. */
6590 hn_polling(sc, 0);
6593 * If the non-transparent mode VF is activated, the synthetic
6594 * device is receiving packets, so the data path of the
6595 * synthetic device must be suspended.
6597 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6598 (sc->hn_flags & HN_FLAG_RXVF))
6599 hn_suspend_data(sc);
6600 hn_suspend_mgmt(sc);
6604 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6608 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6609 ("invalid TX ring count %d", tx_ring_cnt));
6611 for (i = 0; i < tx_ring_cnt; ++i) {
6612 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6614 mtx_lock(&txr->hn_tx_lock);
6615 txr->hn_suspended = 0;
6616 mtx_unlock(&txr->hn_tx_lock);
6621 hn_resume_data(struct hn_softc *sc)
6630 hn_rxfilter_config(sc);
6633 * Make sure to clear suspend status on "all" TX rings,
6634 * since hn_tx_ring_inuse can be changed after
6635 * hn_suspend_data().
6637 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6639 #ifdef HN_IFSTART_SUPPORT
6640 if (!hn_use_if_start)
6644 * Flush unused drbrs, since hn_tx_ring_inuse may be
6647 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6648 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6654 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6655 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6658 * Use txeof task, so that any pending oactive can be
6661 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6666 hn_resume_mgmt(struct hn_softc *sc)
6669 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6672 * Kick off network change detection, if it was pending.
6673 * If no network change was pending, start link status
6674 * checks, which is more lightweight than network change
6677 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6678 hn_change_network(sc);
6679 else
6680 hn_update_link_status(sc);
6684 hn_resume(struct hn_softc *sc)
6688 * If the non-transparent mode VF is activated, the synthetic
6689 * device has to receive packets, so the data path of the
6690 * synthetic device must be resumed.
6692 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6693 (sc->hn_flags & HN_FLAG_RXVF))
6694 hn_resume_data(sc);
6697 * Don't resume link status change if VF is attached/activated.
6698 * - In the non-transparent VF mode, the synthetic device marks
6699 * link down until the VF is deactivated; i.e. VF is down.
6700 * - In transparent VF mode, VF's media status is used until
6701 * the VF is detached.
6703 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6704 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6705 hn_resume_mgmt(sc);
6708 * Re-enable polling if this interface is running and
6709 * the polling is requested.
6711 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6712 hn_polling(sc, sc->hn_pollhz);
6716 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6718 const struct rndis_status_msg *msg;
6721 if (dlen < sizeof(*msg)) {
6722 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6727 switch (msg->rm_status) {
6728 case RNDIS_STATUS_MEDIA_CONNECT:
6729 case RNDIS_STATUS_MEDIA_DISCONNECT:
6730 hn_update_link_status(sc);
6731 break;
6733 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6734 case RNDIS_STATUS_LINK_SPEED_CHANGE:
6735 /* Not really useful; ignore. */
6736 break;
6738 case RNDIS_STATUS_NETWORK_CHANGE:
6739 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6740 if (dlen < ofs + msg->rm_stbuflen ||
6741 msg->rm_stbuflen < sizeof(uint32_t)) {
6742 if_printf(sc->hn_ifp, "network changed\n");
6746 memcpy(&change, ((const uint8_t *)msg) + ofs,
6747 sizeof(change));
6748 if_printf(sc->hn_ifp, "network changed, change %u\n",
6749 change);
6750 }
6751 hn_change_network(sc);
6752 break;
6755 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6762 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6764 const struct rndis_pktinfo *pi = info_data;
6765 uint32_t mask = 0;
6767 while (info_dlen != 0) {
6768 const void *data;
6769 int dlen;
6771 if (__predict_false(info_dlen < sizeof(*pi)))
6772 return (EINVAL);
6773 if (__predict_false(info_dlen < pi->rm_size))
6774 return (EINVAL);
6775 info_dlen -= pi->rm_size;
6777 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
6778 return (EINVAL);
6779 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
6780 return (EINVAL);
6781 dlen = pi->rm_size - pi->rm_pktinfooffset;
6782 data = pi->rm_data;
6784 switch (pi->rm_type) {
6785 case NDIS_PKTINFO_TYPE_VLAN:
6786 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
6787 return (EINVAL);
6788 info->vlan_info = *((const uint32_t *)data);
6789 mask |= HN_RXINFO_VLAN;
6790 break;
6792 case NDIS_PKTINFO_TYPE_CSUM:
6793 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
6794 return (EINVAL);
6795 info->csum_info = *((const uint32_t *)data);
6796 mask |= HN_RXINFO_CSUM;
6797 break;
6799 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
6800 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
6801 return (EINVAL);
6802 info->hash_value = *((const uint32_t *)data);
6803 mask |= HN_RXINFO_HASHVAL;
6804 break;
6806 case HN_NDIS_PKTINFO_TYPE_HASHINF:
6807 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
6808 return (EINVAL);
6809 info->hash_info = *((const uint32_t *)data);
6810 mask |= HN_RXINFO_HASHINF;
6811 break;
6813 default:
6814 break;
6815 }
6817 if (mask == HN_RXINFO_ALL) {
6818 /* All found; done */
6819 break;
6820 }
6822 pi = (const struct rndis_pktinfo *)
6823 ((const uint8_t *)pi + pi->rm_size);
6828 * - If there is no hash value, invalidate the hash info.
6830 if ((mask & HN_RXINFO_HASHVAL) == 0)
6831 info->hash_info = HN_NDIS_HASH_INFO_INVALID;
6833 return (0);
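/*
 * Returns true if [off, off + len) overlaps
 * [check_off, check_off + check_len).
 */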
6835 static __inline bool
6836 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
6839 if (off < check_off) {
6840 if (__predict_true(off + len <= check_off))
6841 return (false);
6842 } else if (off > check_off) {
6843 if (__predict_true(check_off + check_len <= off))
6844 return (false);
6845 }
6846 return (true);
6850 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
6852 const struct rndis_packet_msg *pkt;
6853 struct hn_rxinfo info;
6854 int data_off, pktinfo_off, data_len, pktinfo_len;
6859 if (__predict_false(dlen < sizeof(*pkt))) {
6860 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
6865 if (__predict_false(dlen < pkt->rm_len)) {
6866 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
6867 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
6870 if (__predict_false(pkt->rm_len <
6871 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
6872 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
6873 "msglen %u, data %u, oob %u, pktinfo %u\n",
6874 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
6875 pkt->rm_pktinfolen);
6876 return;
6877 }
6878 if (__predict_false(pkt->rm_datalen == 0)) {
6879 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
6886 #define IS_OFFSET_INVALID(ofs) \
6887 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
6888 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
6890 /* XXX Hyper-V does not meet data offset alignment requirement */
6891 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
6892 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6893 "data offset %u\n", pkt->rm_dataoffset);
6896 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
6897 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
6898 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6899 "oob offset %u\n", pkt->rm_oobdataoffset);
6902 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
6903 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
6904 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6905 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
6909 #undef IS_OFFSET_INVALID
6911 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
6912 data_len = pkt->rm_datalen;
6913 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
6914 pktinfo_len = pkt->rm_pktinfolen;
6917 * Check OOB coverage.
6919 if (__predict_false(pkt->rm_oobdatalen != 0)) {
6920 int oob_off, oob_len;
6922 if_printf(rxr->hn_ifp, "got oobdata\n");
6923 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
6924 oob_len = pkt->rm_oobdatalen;
6926 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
6927 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6928 "oob overflow, msglen %u, oob abs %d len %d\n",
6929 pkt->rm_len, oob_off, oob_len);
6930 return;
6931 }
6934 * Check against data.
6936 if (hn_rndis_check_overlap(oob_off, oob_len,
6937 data_off, data_len)) {
6938 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6939 "oob overlaps data, oob abs %d len %d, "
6940 "data abs %d len %d\n",
6941 oob_off, oob_len, data_off, data_len);
6942 return;
6943 }
6946 * Check against pktinfo.
6948 if (pktinfo_len != 0 &&
6949 hn_rndis_check_overlap(oob_off, oob_len,
6950 pktinfo_off, pktinfo_len)) {
6951 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6952 "oob overlaps pktinfo, oob abs %d len %d, "
6953 "pktinfo abs %d len %d\n",
6954 oob_off, oob_len, pktinfo_off, pktinfo_len);
6955 return;
6956 }
6957 }
6960 * Check per-packet-info coverage and find useful per-packet-info.
6962 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
6963 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
6964 info.hash_info = HN_NDIS_HASH_INFO_INVALID;
6965 if (__predict_true(pktinfo_len != 0)) {
6966 bool overlap;
6967 int error;
6969 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
6970 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6971 "pktinfo overflow, msglen %u, "
6972 "pktinfo abs %d len %d\n",
6973 pkt->rm_len, pktinfo_off, pktinfo_len);
6974 return;
6975 }
6978 * Check packet info coverage.
6980 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
6981 data_off, data_len);
6982 if (__predict_false(overlap)) {
6983 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6984 "pktinfo overlap data, pktinfo abs %d len %d, "
6985 "data abs %d len %d\n",
6986 pktinfo_off, pktinfo_len, data_off, data_len);
6987 return;
6988 }
6991 * Find useful per-packet-info.
6993 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
6994 pktinfo_len, &info);
6995 if (__predict_false(error)) {
6996 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7002 if (__predict_false(data_off + data_len > pkt->rm_len)) {
7003 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7004 "data overflow, msglen %u, data abs %d len %d\n",
7005 pkt->rm_len, data_off, data_len);
7006 return;
7007 }
7008 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
7011 static __inline void
7012 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7014 const struct rndis_msghdr *hdr;
7016 if (__predict_false(dlen < sizeof(*hdr))) {
7017 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7022 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7023 /* Hot data path. */
7024 hn_rndis_rx_data(rxr, data, dlen);
7025 /* Done! */
7026 return;
7027 }
7029 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7030 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7031 else
7032 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7036 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7038 const struct hn_nvs_hdr *hdr;
7040 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7041 if_printf(sc->hn_ifp, "invalid nvs notify\n");
7044 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7046 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7047 /* Useless; ignore */
7048 return;
7049 }
7050 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7054 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7055 const struct vmbus_chanpkt_hdr *pkt)
7057 struct hn_nvs_sendctx *sndc;
7059 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7060 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7061 VMBUS_CHANPKT_DATALEN(pkt));
7064 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7070 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7071 const struct vmbus_chanpkt_hdr *pkthdr)
7073 const struct vmbus_chanpkt_rxbuf *pkt;
7074 const struct hn_nvs_hdr *nvs_hdr;
7075 int count, i, hlen;
7077 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7078 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7081 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7083 /* Make sure that this is a RNDIS message. */
7084 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7085 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7090 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7091 if (__predict_false(hlen < sizeof(*pkt))) {
7092 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7095 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7097 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7098 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7103 count = pkt->cp_rxbuf_cnt;
7104 if (__predict_false(hlen <
7105 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7106 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7110 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7111 for (i = 0; i < count; ++i) {
7112 int ofs, len;
7114 ofs = pkt->cp_rxbuf[i].rb_ofs;
7115 len = pkt->cp_rxbuf[i].rb_len;
7116 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7117 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7118 "ofs %d, len %d\n", i, ofs, len);
7121 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7125 * Ack the consumed RXBUF associated w/ this channel packet,
7126 * so that this RXBUF can be recycled by the hypervisor.
7128 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7132 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7135 struct hn_nvs_rndis_ack ack;
7136 int retries = 0, error;
7138 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7139 ack.nvs_status = HN_NVS_STATUS_OK;
7142 again:
7143 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7144 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7145 if (__predict_false(error == EAGAIN)) {
7148 * This should _not_ happen in real world, since the
7149 * consumption of the TX bufring from the TX path is
7152 if (rxr->hn_ack_failed == 0)
7153 if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7154 rxr->hn_ack_failed++;
7155 retries++;
7156 if (retries < 10) {
7157 DELAY(100);
7158 goto again;
7159 }
7160 /* RXBUF leaks! */
7161 if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7166 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7168 struct hn_rx_ring *rxr = xrxr;
7169 struct hn_softc *sc = rxr->hn_ifp->if_softc;
7171 for (;;) {
7172 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7173 int error, pktlen;
7175 pktlen = rxr->hn_pktbuf_len;
7176 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7177 if (__predict_false(error == ENOBUFS)) {
7178 void *nbuf;
7179 int nlen;
7181 /*
7182 * Expand channel packet buffer.
7185 * Use M_WAITOK here, since allocation failure
7188 nlen = rxr->hn_pktbuf_len * 2;
7189 while (nlen < pktlen)
7190 nlen *= 2;
7191 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7193 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7194 rxr->hn_pktbuf_len, nlen);
7196 free(rxr->hn_pktbuf, M_DEVBUF);
7197 rxr->hn_pktbuf = nbuf;
7198 rxr->hn_pktbuf_len = nlen;
7199 continue;
7201 } else if (__predict_false(error == EAGAIN)) {
7202 /* No more channel packets; done! */
7203 break;
7204 }
7205 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7207 switch (pkt->cph_type) {
7208 case VMBUS_CHANPKT_TYPE_COMP:
7209 hn_nvs_handle_comp(sc, chan, pkt);
7210 break;
7212 case VMBUS_CHANPKT_TYPE_RXBUF:
7213 hn_nvs_handle_rxbuf(rxr, chan, pkt);
7214 break;
7216 case VMBUS_CHANPKT_TYPE_INBAND:
7217 hn_nvs_handle_notify(sc, pkt);
7218 break;
7220 default:
7221 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7222 pkt->cph_type);
7223 break;
7224 }
7225 }
7226 hn_chan_rollup(rxr, rxr->hn_txr);
7230 hn_sysinit(void *arg __unused)
7231 {
7232 int i;
7234 #ifdef HN_IFSTART_SUPPORT
7236 * Don't use ifnet.if_start if transparent VF mode is requested;
7237 * mainly due to the IFF_DRV_OACTIVE flag.
7239 if (hn_xpnt_vf && hn_use_if_start) {
7240 hn_use_if_start = 0;
7241 printf("hn: tranparent VF mode, if_transmit will be used, "
7242 "instead of if_start\n");
7245 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7246 printf("hn: invalid transparent VF attach routing "
7247 "wait timeout %d, reset to %d\n",
7248 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7249 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7253 * Initialize VF map.
7255 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7256 hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7257 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7258 M_WAITOK | M_ZERO);
7261 * Fix the # of TX taskqueues.
7263 if (hn_tx_taskq_cnt <= 0)
7264 hn_tx_taskq_cnt = 1;
7265 else if (hn_tx_taskq_cnt > mp_ncpus)
7266 hn_tx_taskq_cnt = mp_ncpus;
7269 * Fix the TX taskqueue mode.
7271 switch (hn_tx_taskq_mode) {
7272 case HN_TX_TASKQ_M_INDEP:
7273 case HN_TX_TASKQ_M_GLOBAL:
7274 case HN_TX_TASKQ_M_EVTTQ:
7275 break;
7276 default:
7277 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7278 break;
7279 }
7281 if (vm_guest != VM_GUEST_HV)
7282 return;
7284 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7285 return;
7287 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7288 M_DEVBUF, M_WAITOK);
7289 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7290 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7291 taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7292 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7293 "hn tx%d", i);
7294 }
7296 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7299 hn_sysuninit(void *arg __unused)
7302 if (hn_tx_taskque != NULL) {
7303 int i;
7305 for (i = 0; i < hn_tx_taskq_cnt; ++i)
7306 taskqueue_free(hn_tx_taskque[i]);
7307 free(hn_tx_taskque, M_DEVBUF);
7310 if (hn_vfmap != NULL)
7311 free(hn_vfmap, M_DEVBUF);
7312 rm_destroy(&hn_vfmap_lock);
7314 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);