/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/ethernet.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"
#define HN_IFSTART_SUPPORT

/* NOTE: M_HASHTYPE_RSS_UDP_IPV4 is not available on stable/10. */
#ifndef M_HASHTYPE_RSS_UDP_IPV4
#define M_HASHTYPE_RSS_UDP_IPV4		M_HASHTYPE_OPAQUE
#endif

#define HN_RING_CNT_DEF_MAX		8

#define HN_VFMAP_SIZE_DEF		8

#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
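/*
 * HN_RNDIS_PKT_LEN reserves space for the RNDIS packet message header
 * plus the four per-packet-info records this driver may attach to a TX
 * packet: hash value, VLAN tag, LSOv2 and TX checksum information.
 * Sizing it once here lets every txdesc pre-allocate a worst-case
 * RNDIS header instead of computing the size per packet.
 */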
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1
#define HN_LOCK_INIT(sc)					\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)	sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)	sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)						\
do {								\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)		\
		DELAY(1000);					\
} while (0)
#define HN_UNLOCK(sc)		sx_xunlock(&(sc)->hn_lock)
#define HN_CSUM_IP_MASK		(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK	(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
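/*
 * Worked example (sketch): with align = 32, a 1514-byte frame maps to
 * HN_PKTSIZE() = roundup2(1514 + HN_RNDIS_PKT_LEN, 32), so packets
 * packed back-to-back (e.g. into a chimney sending buffer) always
 * start on a 32-byte boundary.  HN_RING_IDX2CPU() simply spreads ring
 * indices across the CPUs, starting from the leader CPU sc->hn_cpu.
 */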
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004
struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static void			hn_ifnet_event(void *, struct ifnet *, int);
static void			hn_ifaddr_event(void *, struct ifnet *);
static void			hn_ifnet_attevent(void *, struct ifnet *);
static void			hn_ifnet_detevent(void *, struct ifnet *);
static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);

static bool			hn_ismyvf(const struct hn_softc *,
				    const struct ifnet *);
static void			hn_rxvf_change(struct hn_softc *,
				    struct ifnet *, bool);
static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void			hn_rxvf_set_task(void *, int);
static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
				    struct ifreq *);
static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
static bool			hn_xpnt_vf_isready(struct hn_softc *);
static void			hn_xpnt_vf_setready(struct hn_softc *);
static void			hn_xpnt_vf_init_taskfunc(void *, int);
static void			hn_xpnt_vf_init(struct hn_softc *);
static void			hn_xpnt_vf_setenable(struct hn_softc *);
static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
static void			hn_vf_rss_fixup(struct hn_softc *, bool);
static void			hn_vf_rss_restore(struct hn_softc *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);
static void			hn_rndis_init_fixat(struct hn_softc *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);

static void			hn_stop(struct hn_softc *, bool);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_disable_rx(struct hn_softc *);
static void			hn_drain_rxtx(struct hn_softc *, int);
static void			hn_polling(struct hn_softc *, u_int);
static void			hn_chan_polling(struct vmbus_channel *, u_int);
static void			hn_mtu_change_fixup(struct hn_softc *);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
static int			hn_rxfilter_config(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);
static uint32_t			hn_rss_type_fromndis(uint32_t);
static uint32_t			hn_rss_type_tondis(uint32_t);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_fixup_rx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on the host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust TCP segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on the host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust UDP datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on the host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust IP packet verification on host side, "
    "when csum info is missing (global setting)");
/*
 * Offload UDP/IPv4 checksum.
 */
static int hn_enable_udp4cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
    &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");

/*
 * Offload UDP/IPv6 checksum.
 */
static int hn_enable_udp6cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
    &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");

static counter_u64_t hn_udpcs_fixup;
SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
    &hn_udpcs_fixup, "# of UDP checksum fixups");

/*
 * This value is for Azure.  For Hyper-V, set this above
 * 65536 to disable UDP datagram checksum fixup.
 */
static int hn_udpcs_fixup_mtu = 1420;
SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
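/*
 * Example (sketch): on Hyper-V, where the Azure UDP checksum offload
 * issue does not apply, the fixup can effectively be disabled from
 * loader.conf:
 *
 *	hw.hn.udpcs_fixup_mtu="65537"
 *
 * Any value above 65536 keeps the length comparison in hn_set_hlen()
 * from ever matching, since an IP datagram cannot be that large.
 */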
/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif
static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
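/*
 * Example (sketch): to share one set of global TX taskqueues among all
 * hn(4) instances, instead of creating per-interface taskqueues, set
 * the tunable in loader.conf:
 *
 *	hw.hn.tx_taskq_mode="1"
 *
 * The mode is a boot-time tunable only (CTLFLAG_RDTUN); it cannot be
 * changed at runtime.
 */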
528 #ifndef HN_USE_TXDESC_BUFRING
529 static int hn_use_txdesc_bufring = 0;
531 static int hn_use_txdesc_bufring = 1;
533 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
534 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
536 #ifdef HN_IFSTART_SUPPORT
537 /* Use ifnet.if_start instead of ifnet.if_transmit */
538 static int hn_use_if_start = 0;
539 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
540 &hn_use_if_start, 0, "Use if_start TX method");
/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif
/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

/* VF list */
SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vflist_sysctl, "A", "VF list");

/* VF mapping */
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vfmap_sysctl, "A", "VF mapping");

/* Transparent VF */
static int hn_xpnt_vf = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");
static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;	/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;

static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
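/*
 * NOTE: the 40-byte key above is the widely used sample Toeplitz key
 * from Microsoft's RSS documentation (the same default many NICs ship
 * with), so the synthetic device hashes packets the same way as a
 * freshly initialized host-side RSS implementation.
 */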
static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};
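/*
 * Read in canonical GUID notation (first three fields little-endian),
 * the bytes above spell f8615163-df3e-46c5-913f-f2d2f965ed0e, the
 * class ID Hyper-V assigns to synthetic network adapters; hn_probe()
 * matches against it below.
 */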
static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};
static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif
static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}
static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
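/*
 * Usage sketch: the TX path pairs the two helpers above, e.g.
 *
 *	idx = hn_chim_alloc(sc);
 *	if (idx != HN_NVS_CHIM_IDX_INVALID) {
 *		copy the packet into chimney slot 'idx', and call
 *		hn_chim_free(sc, idx) once the send completes.
 *	}
 *
 * The bitmap is maintained purely with atomics (atomic_testandset_long
 * on allocation, atomic_clear_long on free), so no lock is taken on
 * this hot path.
 */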
#if defined(INET6) || defined(INET)

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
}
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_set_hlen(struct mbuf *m_head)
{
	const struct ether_vlan_header *evl;
	int ehlen;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		/*
		 * UDP checksum offload does not work in Azure, when the
		 * following conditions are met:
		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
		 * - IP_DF is not set in the IP hdr.
		 *
		 * Fall back to software checksum for these UDP datagrams.
		 */
		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
		    (ntohs(ip->ip_off) & IP_DF) == 0) {
			uint16_t off = ehlen + iphlen;

			counter_u64_add(hn_udpcs_fixup, 1);
			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
			*(uint16_t *)(m_head->m_data + off +
			    m_head->m_pkthdr.csum_data) = in_cksum_skip(
			    m_head, m_head->m_pkthdr.len, off);
			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
		}
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
	}
#endif
	return (m_head);
}
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct tcphdr *th;
	int ehlen, iphlen;

	*tcpsyn = 0;
	ehlen = m_head->m_pkthdr.l2hlen;
	iphlen = m_head->m_pkthdr.l3hlen;

	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
	th = mtodo(m_head, ehlen + iphlen);
	if (th->th_flags & TH_SYN)
		*tcpsyn = 1;
	return (m_head);
}

#endif	/* INET6 || INET */
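/*
 * Sketch of how the helpers above are used on the TX path (see
 * hn_encap() later in this file): hn_set_hlen() (or hn_tso_fixup() for
 * TSO packets) is called first to populate l2hlen/l3hlen, and
 * hn_check_tcpsyn() follows when the caller needs to know whether the
 * packet is a TCP SYN.  A NULL return from any of them means the mbuf
 * chain has already been freed.
 */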
static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * the promiscuous mode.
	 */
	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable aggregation. */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable aggregation. */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable aggregation. */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
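/*
 * NOTE: the per-ring limits above are published under hn_tx_lock so a
 * concurrent transmit path, which reads hn_agg_szmax/hn_agg_pktmax/
 * hn_agg_align with the same ring lock held, always sees a consistent
 * triple.
 */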
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return (txr->hn_txdesc_cnt);
	return (hn_tx_swq_depth);
}
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return (EOPNOTSUPP);
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}
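/*
 * The synthetic NIC has no real PHY; link state comes from NVS/RNDIS
 * status messages instead, so a fixed 10GbE full-duplex placeholder is
 * advertised whenever the link is up.
 */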
static void
hn_rxvf_set_task(void *xarg, int pending __unused)
{
	struct hn_rxvf_setarg *arg = xarg;

	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
}

static void
hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
{
	struct hn_rx_ring *rxr;
	struct hn_rxvf_setarg arg;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			arg.rxr = rxr;
			arg.vf_ifp = vf_ifp;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_rxvf_ifp = vf_ifp;
		}
	}
}
static bool
hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
{
	const struct ifnet *hn_ifp;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		return (false);

	if (ifp->if_alloctype != IFT_ETHER)
		return (false);

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		return (false);

	/*
	 * During detach events ifp->if_addr might be NULL.
	 * Make sure the bcmp() below doesn't panic on that:
	 */
	if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
		return (false);

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		return (false);

	return (true);
}
static void
hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	if (!hn_ismyvf(sc, ifp))
		goto out;
	hn_ifp = sc->hn_ifp;

	if (rxvf) {
		if (sc->hn_flags & HN_FLAG_RXVF)
			goto out;

		sc->hn_flags |= HN_FLAG_RXVF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_RXVF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_RXVF;
		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);

	hn_rxvf_set(sc, rxvf ? ifp : NULL);

	if (rxvf) {
		hn_vf_rss_fixup(sc, true);
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_vf_rss_restore(sc);
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
	    rxvf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "datapath is switched %s %s\n",
		    rxvf ? "to" : "from", ifp->if_xname);
	}
out:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
{

	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;
	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
}

static void
hn_ifaddr_event(void *arg, struct ifnet *ifp)
{

	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
}
static int
hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
{
	struct ifnet *ifp, *vf_ifp;
	uint64_t tmp;
	int error;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Fix up requested capabilities w/ supported capabilities,
	 * since the supported capabilities could have been changed.
	 */
	ifr->ifr_reqcap &= ifp->if_capabilities;
	/* Pass SIOCSIFCAP to VF. */
	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);

	/*
	 * NOTE:
	 * The error will be propagated to the callers, however, it
	 * is _not_ useful here.
	 */

	/*
	 * Merge VF's enabled capabilities.
	 */
	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
	if (ifp->if_capenable & IFCAP_TSO4)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
	if (ifp->if_capenable & IFCAP_TSO6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	return (error);
}
static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	struct ifnet *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
}

static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
}
static void
hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	struct ifnet *hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (vf_ifp->if_index < hn_vfmap_size)
		hn_ifp = hn_vfmap[vf_ifp->if_index];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/*
			 * Allow tapping on the VF.
			 */
			ETHER_BPF_MTAP(vf_ifp, mn);

			/*
			 * Update VF stats.
			 */
			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
				    mn->m_pkthdr.len);
			}
			/*
			 * XXX IFCOUNTER_IMCAST
			 * This stat updating is kinda invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check.  As of this writing,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF a try in vain.
			 */

			/*
			 * Fix up rcvif and increase hn(4)'s ipackets.
			 */
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		/*
		 * Go through hn(4)'s if_input.
		 */
		hn_ifp->if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * mbuf chain.
		 */
		while (m != NULL) {
			mn = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
			m = mn;
		}
	}
}
static void
hn_mtu_change_fixup(struct hn_softc *sc)
{
	struct ifnet *ifp;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;

	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
#endif
}
static uint32_t
hn_rss_type_fromndis(uint32_t rss_hash)
{
	uint32_t types = 0;

	if (rss_hash & NDIS_HASH_IPV4)
		types |= RSS_TYPE_IPV4;
	if (rss_hash & NDIS_HASH_TCP_IPV4)
		types |= RSS_TYPE_TCP_IPV4;
	if (rss_hash & NDIS_HASH_IPV6)
		types |= RSS_TYPE_IPV6;
	if (rss_hash & NDIS_HASH_IPV6_EX)
		types |= RSS_TYPE_IPV6_EX;
	if (rss_hash & NDIS_HASH_TCP_IPV6)
		types |= RSS_TYPE_TCP_IPV6;
	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
		types |= RSS_TYPE_TCP_IPV6_EX;
	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
		types |= RSS_TYPE_UDP_IPV4;
	return (types);
}

static uint32_t
hn_rss_type_tondis(uint32_t types)
{
	uint32_t rss_hash = 0;

	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
	    ("UDP6 and UDP6EX are not supported"));

	if (types & RSS_TYPE_IPV4)
		rss_hash |= NDIS_HASH_IPV4;
	if (types & RSS_TYPE_TCP_IPV4)
		rss_hash |= NDIS_HASH_TCP_IPV4;
	if (types & RSS_TYPE_IPV6)
		rss_hash |= NDIS_HASH_IPV6;
	if (types & RSS_TYPE_IPV6_EX)
		rss_hash |= NDIS_HASH_IPV6_EX;
	if (types & RSS_TYPE_TCP_IPV6)
		rss_hash |= NDIS_HASH_TCP_IPV6;
	if (types & RSS_TYPE_TCP_IPV6_EX)
		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
	if (types & RSS_TYPE_UDP_IPV4)
		rss_hash |= NDIS_HASH_UDP_IPV4_X;
	return (rss_hash);
}
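/*
 * hn_rss_type_fromndis() and hn_rss_type_tondis() are inverses over
 * the hash bits this driver supports.  NDIS_HASH_UDP_IPV4_X appears to
 * be the Hyper-V specific UDP/IPv4 4-tuple extension (see the
 * HN_CAP_UDPHASH notes in hn_vf_rss_fixup() below); plain NDIS defines
 * no UDP hash type.
 */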
static void
hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
}
static void
hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifrsshash ifrh;
	struct ifrsskey ifrk;
	int error;
	uint32_t my_types, diff_types, mbuf_types = 0;

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1) {
		/* No RSS on synthetic parts; done. */
		return;
	}
	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
		/* Synthetic parts do not support Toeplitz; done. */
		return;
	}

	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Extract VF's RSS key.  Only a 40-byte Toeplitz key is
	 * supported.
	 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrk.ifrk_func);
		goto done;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    vf_ifp->if_xname, ifrk.ifrk_keylen);
		goto done;
	}

	/*
	 * Extract VF's RSS hash.  Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
	if (error) {
		if_printf(ifp, "%s SIOCGIFRSSHASH failed: %d\n",
		    vf_ifp->if_xname, error);
		goto done;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrh.ifrh_func);
		goto done;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed.  "
		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
		    ifrh.ifrh_types, my_types);
		goto done;
	}

	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on RX path.
	 *
	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
	 * hash is delivered with type of TCP_IPV4.  This means if
	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
	 * least to hn_mbuf_hash.  However, given that _all_ of the
	 * NICs implement TCP_IPV4, this will _not_ impose any issues
	 * here.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
	}

	/*
	 * Indirect table does not matter.
	 */
	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
	    hn_rss_type_tondis(my_types);
	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
	if (reconf) {
		error = hn_rss_reconfig(sc);
		if (error) {
			/* XXX roll-back? */
			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
			/* XXX keep going. */
		}
	}
done:
	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
}
static void
hn_vf_rss_restore(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1)
		goto done;

	/*
	 * Restore hash types.  Key does _not_ matter.
	 */
	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
		int error;

		sc->hn_rss_hash = sc->hn_rss_hcap;
		error = hn_rss_reconfig(sc);
		if (error) {
			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
			    error);
			/* XXX keep going. */
		}
	}
done:
	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
}
static void
hn_xpnt_vf_setready(struct hn_softc *sc)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Mark the VF ready.
	 */
	sc->hn_vf_rdytick = 0;

	/*
	 * Save information for restoration.
	 */
	sc->hn_saved_caps = ifp->if_capabilities;
	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;

	/*
	 * Intersect supported/enabled capabilities.
	 *
	 * NOTE:
	 * if_hwassist is not changed here.
	 */
	ifp->if_capabilities &= vf_ifp->if_capabilities;
	ifp->if_capenable &= ifp->if_capabilities;

	/*
	 * Fix TSO settings.
	 */
	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;

	/*
	 * Change VF's enabled capabilities.
	 */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_reqcap = ifp->if_capenable;
	hn_xpnt_vf_iocsetcaps(sc, &ifr);

	if (ifp->if_mtu != ETHERMTU) {
		int error;

		/*
		 * Change VF's MTU.
		 */
		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
		ifr.ifr_mtu = ifp->if_mtu;
		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
		if (error) {
			if_printf(ifp, "%s SIOCSIFMTU %lu failed\n",
			    vf_ifp->if_xname, ifp->if_mtu);
			if (ifp->if_mtu > ETHERMTU) {
				if_printf(ifp, "change MTU to %d\n", ETHERMTU);

				/*
				 * XXX
				 * No need to adjust the synthetic parts' MTU;
				 * failure of the adjustment will cause us
				 * infinite headache.
				 */
				ifp->if_mtu = ETHERMTU;
				hn_mtu_change_fixup(sc);
			}
		}
	}
}
static bool
hn_xpnt_vf_isready(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);

	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
		return (false);

	if (sc->hn_vf_rdytick == 0)
		return (true);

	if (sc->hn_vf_rdytick > ticks)
		return (false);

	/* Mark VF as ready. */
	hn_xpnt_vf_setready(sc);
	return (true);
}
static void
hn_xpnt_vf_setenable(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
}

static void
hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
	if (clear_vf)
		sc->hn_vf_ifp = NULL;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
}
static void
hn_xpnt_vf_init(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));

	if (bootverbose) {
		if_printf(sc->hn_ifp, "try bringing up %s\n",
		    sc->hn_vf_ifp->if_xname);
	}

	/*
	 * Bring the VF up.
	 */
	hn_xpnt_vf_saveifflags(sc);
	sc->hn_vf_ifp->if_flags |= IFF_UP;
	error = hn_xpnt_vf_iocsetflags(sc);
	if (error) {
		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
		    sc->hn_vf_ifp->if_xname, error);
		return;
	}

	/*
	 * NOTE:
	 * Datapath setting must happen _after_ bringing the VF up.
	 */
	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);

	/*
	 * NOTE:
	 * Fix up RSS related bits _after_ the VF is brought up, since
	 * many VFs generate the RSS key during their initialization.
	 */
	hn_vf_rss_fixup(sc, true);

	/* Mark transparent mode VF as enabled. */
	hn_xpnt_vf_setenable(sc);
}
static void
hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		goto done;
	if (sc->hn_vf_ifp == NULL)
		goto done;
	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
		goto done;

	if (sc->hn_vf_rdytick != 0) {
		/* Mark VF as ready. */
		hn_xpnt_vf_setready(sc);
	}

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
		/*
		 * Delayed VF initialization.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "delayed initialize %s\n",
			    sc->hn_vf_ifp->if_xname);
		}
		hn_xpnt_vf_init(sc);
	}
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    sc->hn_vf_ifp->if_xname);
		goto done;
	}

	if (hn_xpnt_vf && ifp->if_start != NULL) {
		/*
		 * ifnet.if_start is _not_ supported by transparent
		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
		 */
		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
		    "in transparent VF mode.\n", ifp->if_xname);
		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	if (ifp->if_index >= hn_vfmap_size) {
		struct ifnet **newmap;
		int newsize;

		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);

		memcpy(newmap, hn_vfmap,
		    sizeof(struct ifnet *) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);

		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}
	KASSERT(hn_vfmap[ifp->if_index] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
	hn_vfmap[ifp->if_index] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
	sc->hn_vf_ifp = ifp;
	rm_wunlock(&sc->hn_vf_lock);

	if (hn_xpnt_vf) {
		int wait_ticks;

		/*
		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
		 * Save vf_ifp's current if_input for later restoration.
		 */
		sc->hn_vf_input = ifp->if_input;
		ifp->if_input = hn_xpnt_vf_input;

		/*
		 * Stop link status management; use the VF's.
		 */
		hn_suspend_mgmt(sc);

		/*
		 * Give the VF some time to complete its attach routine.
		 */
		wait_ticks = hn_xpnt_vf_attwait * hz;
		sc->hn_vf_rdytick = ticks + wait_ticks;

		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
		    wait_ticks);
	}
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (sc->hn_vf_ifp == NULL)
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (hn_xpnt_vf) {
		/*
		 * Make sure that the delayed initialization is not running.
		 *
		 * NOTE:
		 * - This lock _must_ be released, since the hn_vf_init task
		 *   will try holding this lock.
		 * - It is safe to release this lock here, since the
		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
		 *
		 * XXX racy, if hn(4) ever detached.
		 */
		HN_UNLOCK(sc);
		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
		HN_LOCK(sc);

		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
		    sc->hn_ifp->if_xname));
		ifp->if_input = sc->hn_vf_input;
		sc->hn_vf_input = NULL;

		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);

		if (sc->hn_vf_rdytick == 0) {
			/*
			 * The VF was ready; restore some settings.
			 */
			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
			/*
			 * NOTE:
			 * There is _no_ need to fixup if_capenable and
			 * if_hwassist, since the if_capabilities before
			 * restoration was an intersection of the VF's
			 * if_capabilities and the synthetic device's
			 * if_capabilities.
			 */
			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
			sc->hn_ifp->if_hw_tsomaxsegcount =
			    sc->hn_saved_tsosegcnt;
			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
		}

		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			/*
			 * Restore RSS settings.
			 */
			hn_vf_rss_restore(sc);

			/*
			 * Resume link status management, which was suspended
			 * by hn_ifnet_attevent().
			 */
			hn_resume_mgmt(sc);
		}
	}

	/* Mark transparent mode VF as disabled. */
	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);

	rm_wlock(&hn_vfmap_lock);

	KASSERT(ifp->if_index < hn_vfmap_size,
	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
	if (hn_vfmap[ifp->if_index] != NULL) {
		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
		    ("%s: ifindex %d was mapped to %s",
		     ifp->if_xname, ifp->if_index,
		     hn_vfmap[ifp->if_index]->if_xname));
		hn_vfmap[ifp->if_index] = NULL;
	}

	rm_wunlock(&hn_vfmap_lock);
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_vf_ifp == ifp)
		if_link_state_change(sc->hn_ifp, link_state);
}
static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return (BUS_PROBE_DEFAULT);
	}
	return (ENXIO);
}
2074 hn_attach(device_t dev)
2076 struct hn_softc *sc = device_get_softc(dev);
2077 struct sysctl_oid_list *child;
2078 struct sysctl_ctx_list *ctx;
2079 uint8_t eaddr[ETHER_ADDR_LEN];
2080 struct ifnet *ifp = NULL;
2081 int error, ring_cnt, tx_ring_cnt;
2085 sc->hn_prichan = vmbus_get_channel(dev);
2087 rm_init(&sc->hn_vf_lock, "hnvf");
2088 if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2089 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2092 * Initialize these tunables once.
2094 sc->hn_agg_size = hn_tx_agg_size;
2095 sc->hn_agg_pkts = hn_tx_agg_pkts;
2098 * Setup taskqueue for transmission.
2100 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2104 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2105 M_DEVBUF, M_WAITOK);
2106 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2107 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2108 M_WAITOK, taskqueue_thread_enqueue,
2109 &sc->hn_tx_taskqs[i]);
2110 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2111 "%s tx%d", device_get_nameunit(dev), i);
2113 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2114 sc->hn_tx_taskqs = hn_tx_taskque;
2118 * Setup taskqueue for mangement tasks, e.g. link status.
2120 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2121 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2122 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2123 device_get_nameunit(dev));
2124 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2125 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2126 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2127 hn_netchg_status_taskfunc, sc);
2131 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2133 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2134 taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2135 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2136 device_get_nameunit(dev));
2137 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2138 hn_xpnt_vf_init_taskfunc, sc);
2142 * Allocate ifnet and setup its name earlier, so that if_printf
2143 * can be used by functions, which will be called after
2146 ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
2148 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2151 * Initialize ifmedia earlier so that it can be unconditionally
2152 * destroyed, if error happened later on.
2154 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2157 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2158 * to use (tx_ring_cnt).
2161 * The # of RX rings to use is same as the # of channels to use.
2163 ring_cnt = hn_chan_cnt;
2164 if (ring_cnt <= 0) {
2166 ring_cnt = mp_ncpus;
2167 if (ring_cnt > HN_RING_CNT_DEF_MAX)
2168 ring_cnt = HN_RING_CNT_DEF_MAX;
2169 } else if (ring_cnt > mp_ncpus) {
2170 ring_cnt = mp_ncpus;
2173 tx_ring_cnt = hn_tx_ring_cnt;
2174 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2175 tx_ring_cnt = ring_cnt;
2176 #ifdef HN_IFSTART_SUPPORT
2177 if (hn_use_if_start) {
2178 /* ifnet.if_start only needs one TX ring. */
2184 * Set the leader CPU for channels.
2186 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2189 * Create enough TX/RX rings, even if only limited number of
2190 * channels can be allocated.
2192 error = hn_create_tx_data(sc, tx_ring_cnt);
2195 error = hn_create_rx_data(sc, ring_cnt);
2200 * Create transaction context for NVS and RNDIS transactions.
2202 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2203 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2204 if (sc->hn_xact == NULL) {
2210 * Install orphan handler for the revocation of this device's
2214 * The processing order is critical here:
2215 * Install the orphan handler, _before_ testing whether this
2216 * device's primary channel has been revoked or not.
2218 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2219 if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2225 * Attach the synthetic parts, i.e. NVS and RNDIS.
2227 error = hn_synth_attach(sc, ETHERMTU);
2231 error = hn_rndis_get_eaddr(sc, eaddr);
2235 error = hn_rndis_get_mtu(sc, &mtu);
2238 else if (bootverbose)
2239 device_printf(dev, "RNDIS mtu %u\n", mtu);
2241 #if __FreeBSD_version >= 1100099
2242 if (sc->hn_rx_ring_inuse > 1) {
2244 * Reduce TCP segment aggregation limit for multiple
2245 * RX rings to increase ACK timeliness.
2247 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2252 * Fix up TX/RX state after the synthetic parts are attached.
2254 hn_fixup_tx_data(sc);
2255 hn_fixup_rx_data(sc);
2257 ctx = device_get_sysctl_ctx(dev);
2258 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2259 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2260 &sc->hn_nvs_ver, 0, "NVS version");
2261 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2262 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2263 hn_ndis_version_sysctl, "A", "NDIS version");
2264 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2265 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2266 hn_caps_sysctl, "A", "capabilities");
2267 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2268 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2269 hn_hwassist_sysctl, "A", "hwassist");
2270 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2271 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2272 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2273 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2274 "max # of TSO segments");
2275 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2276 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2277 "max size of TSO segment");
2278 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2279 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2280 hn_rxfilter_sysctl, "A", "rxfilter");
2281 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2282 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2283 hn_rss_hash_sysctl, "A", "RSS hash");
2284 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2285 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2286 hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2287 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2288 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2289 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2290 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2291 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2292 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2293 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2294 hn_rss_key_sysctl, "IU", "RSS key");
2295 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2296 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2297 hn_rss_ind_sysctl, "IU", "RSS indirect table");
2298 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2299 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2300 "RNDIS offered packet transmission aggregation size limit");
2301 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2302 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2303 "RNDIS offered packet transmission aggregation count limit");
2304 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2305 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2306 "RNDIS packet transmission aggregation alignment");
2307 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2308 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2309 hn_txagg_size_sysctl, "I",
2310 "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2311 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2312 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2313 hn_txagg_pkts_sysctl, "I",
2314 "Packet transmission aggregation packets, "
2315 "0 -- disable, -1 -- auto");
2316 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2317 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2318 hn_polling_sysctl, "I",
2319 "Polling frequency: [100,1000000], 0 disable polling");
2320 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2321 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2322 hn_vf_sysctl, "A", "Virtual Function's name");
2324 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2325 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2326 hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2328 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2329 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2330 hn_xpnt_vf_enabled_sysctl, "I",
2331 "Transparent VF enabled");
2332 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2333 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2334 hn_xpnt_vf_accbpf_sysctl, "I",
2335 "Accurate BPF for transparent VF");
2339 * Setup the ifmedia, which has been initialized earlier.
2341 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2342 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2343 /* XXX ifmedia_set really should do this for us */
2344 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2347 * Setup the ifnet for this interface.
2351 ifp->if_baudrate = IF_Gbps(10);
2353 /* if_baudrate is 32 bits on 32-bit systems. */
2354 ifp->if_baudrate = IF_Gbps(1);
2356 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2357 ifp->if_ioctl = hn_ioctl;
2358 ifp->if_init = hn_init;
2359 #ifdef HN_IFSTART_SUPPORT
2360 if (hn_use_if_start) {
2361 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2363 ifp->if_start = hn_start;
2364 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2365 ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2366 IFQ_SET_READY(&ifp->if_snd);
2370 ifp->if_transmit = hn_transmit;
2371 ifp->if_qflush = hn_xmit_qflush;
2374 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2376 /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2377 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2379 if (sc->hn_caps & HN_CAP_VLAN) {
2380 /* XXX not sure about VLAN_MTU. */
2381 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2384 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2385 if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2386 ifp->if_capabilities |= IFCAP_TXCSUM;
2387 if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2388 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2389 if (sc->hn_caps & HN_CAP_TSO4) {
2390 ifp->if_capabilities |= IFCAP_TSO4;
2391 ifp->if_hwassist |= CSUM_IP_TSO;
2393 if (sc->hn_caps & HN_CAP_TSO6) {
2394 ifp->if_capabilities |= IFCAP_TSO6;
2395 ifp->if_hwassist |= CSUM_IP6_TSO;
2398 /* Enable all available capabilities by default. */
2399 ifp->if_capenable = ifp->if_capabilities;
2402 * Disable IPv6 TSO and TXCSUM by default; they can still
2403 * be enabled through SIOCSIFCAP.
2405 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2406 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2408 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2410 * Lock hn_set_tso_maxsize() to simplify its internal logic.
2414 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2416 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2417 ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2420 ether_ifattach(ifp, eaddr);
2422 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2423 if_printf(ifp, "TSO segcnt %u segsz %u\n",
2424 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2426 if (mtu < ETHERMTU) {
2427 if_printf(ifp, "fixup mtu %lu -> %u\n", ifp->if_mtu, mtu);
2431 /* Inform the upper layer about the long frame support. */
2432 ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2435 * Kick off link status check.
2437 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2438 hn_update_link_status(sc);
2441 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2442 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2443 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2444 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2446 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2447 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2452 * Subscribe to the ether_ifattach event, instead of the ifnet_arrival
2453 * event, since the interface's LLADDR is needed; the LLADDR is not
2454 * available when the ifnet_arrival event is triggered.
2456 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2457 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2458 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2459 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2463 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2464 hn_synth_detach(sc);
2470 hn_detach(device_t dev)
2472 struct hn_softc *sc = device_get_softc(dev);
2473 struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2475 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2477 * In case the vmbus missed the orphan handler installation.
2480 vmbus_xact_ctx_orphan(sc->hn_xact);
2483 if (sc->hn_ifaddr_evthand != NULL)
2484 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2485 if (sc->hn_ifnet_evthand != NULL)
2486 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2487 if (sc->hn_ifnet_atthand != NULL) {
2488 EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2489 sc->hn_ifnet_atthand);
2491 if (sc->hn_ifnet_dethand != NULL) {
2492 EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2493 sc->hn_ifnet_dethand);
2495 if (sc->hn_ifnet_lnkhand != NULL)
2496 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2498 vf_ifp = sc->hn_vf_ifp;
2499 __compiler_membar();
2501 hn_ifnet_detevent(sc, vf_ifp);
2503 if (device_is_attached(dev)) {
2505 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2506 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2510 * hn_stop() only suspends data, so management
2511 * tasks have to be suspended manually here.
2513 hn_suspend_mgmt(sc);
2514 hn_synth_detach(sc);
2517 ether_ifdetach(ifp);
2520 ifmedia_removeall(&sc->hn_media);
2521 hn_destroy_rx_data(sc);
2522 hn_destroy_tx_data(sc);
2524 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2527 for (i = 0; i < hn_tx_taskq_cnt; ++i)
2528 taskqueue_free(sc->hn_tx_taskqs[i]);
2529 free(sc->hn_tx_taskqs, M_DEVBUF);
2531 taskqueue_free(sc->hn_mgmt_taskq0);
2532 if (sc->hn_vf_taskq != NULL)
2533 taskqueue_free(sc->hn_vf_taskq);
2535 if (sc->hn_xact != NULL) {
2537 * Uninstall the orphan handler _before_ the xact is destroyed.
2540 vmbus_chan_unset_orphan(sc->hn_prichan);
2541 vmbus_xact_ctx_destroy(sc->hn_xact);
2546 HN_LOCK_DESTROY(sc);
2547 rm_destroy(&sc->hn_vf_lock);
2552 hn_shutdown(device_t dev)
2559 hn_link_status(struct hn_softc *sc)
2561 uint32_t link_status;
2564 error = hn_rndis_get_linkstatus(sc, &link_status);
2566 /* XXX what to do? */
2570 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2571 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2573 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2574 if_link_state_change(sc->hn_ifp,
2575 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2576 LINK_STATE_UP : LINK_STATE_DOWN);
2580 hn_link_taskfunc(void *xsc, int pending __unused)
2582 struct hn_softc *sc = xsc;
2584 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2590 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2592 struct hn_softc *sc = xsc;
2594 /* Prevent any link status checks from running. */
2595 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2598 * Fake up a [link down --> link up] state change; a 5 second
2599 * delay is used, which closely simulates the miibus reaction
2600 * to a link down event.
2602 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2603 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2604 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2605 &sc->hn_netchg_status, 5 * hz);
2609 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2611 struct hn_softc *sc = xsc;
2613 /* Re-allow link status checks. */
2614 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2619 hn_update_link_status(struct hn_softc *sc)
2622 if (sc->hn_mgmt_taskq != NULL)
2623 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2627 hn_change_network(struct hn_softc *sc)
2630 if (sc->hn_mgmt_taskq != NULL)
2631 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2635 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2636 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2638 struct mbuf *m = *m_head;
2641 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2643 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2644 m, segs, nsegs, BUS_DMA_NOWAIT);
2645 if (error == EFBIG) {
2648 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2652 *m_head = m = m_new;
2653 txr->hn_tx_collapsed++;
2655 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2656 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2659 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2660 BUS_DMASYNC_PREWRITE);
2661 txd->flags |= HN_TXD_FLAG_DMAMAP;
2667 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2670 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2671 ("put an onlist txd %#x", txd->flags));
2672 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2673 ("put an onagg txd %#x", txd->flags));
2675 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2676 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2679 if (!STAILQ_EMPTY(&txd->agg_list)) {
2680 struct hn_txdesc *tmp_txd;
2682 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2685 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2686 ("resursive aggregation on aggregated txdesc"));
2687 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2688 ("not aggregated txdesc"));
2689 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2690 ("aggregated txdesc uses dmamap"));
2691 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2692 ("aggregated txdesc consumes "
2693 "chimney sending buffer"));
2694 KASSERT(tmp_txd->chim_size == 0,
2695 ("aggregated txdesc has non-zero "
2696 "chimney sending size"));
2698 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2699 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2700 freed = hn_txdesc_put(txr, tmp_txd);
2701 KASSERT(freed, ("failed to free aggregated txdesc"));
2705 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2706 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2707 ("chim txd uses dmamap"));
2708 hn_chim_free(txr->hn_sc, txd->chim_index);
2709 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2711 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2712 bus_dmamap_sync(txr->hn_tx_data_dtag,
2713 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2714 bus_dmamap_unload(txr->hn_tx_data_dtag,
2716 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2719 if (txd->m != NULL) {
2724 txd->flags |= HN_TXD_FLAG_ONLIST;
2725 #ifndef HN_USE_TXDESC_BUFRING
2726 mtx_lock_spin(&txr->hn_txlist_spin);
2727 KASSERT(txr->hn_txdesc_avail >= 0 &&
2728 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2729 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2730 txr->hn_txdesc_avail++;
2731 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2732 mtx_unlock_spin(&txr->hn_txlist_spin);
2733 #else /* HN_USE_TXDESC_BUFRING */
2735 atomic_add_int(&txr->hn_txdesc_avail, 1);
2737 buf_ring_enqueue(txr->hn_txdesc_br, txd);
2738 #endif /* !HN_USE_TXDESC_BUFRING */
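/*
 * Design note, a sketch only: with HN_USE_TXDESC_BUFRING the free
 * descriptors live in a buf_ring(9), so recycling a txdesc is a
 * lock-free enqueue instead of the SLIST insert under the spin mutex
 * shown above.
 */
#if 0
	/* SLIST variant: spin-lock protected free list. */
	mtx_lock_spin(&txr->hn_txlist_spin);
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);

	/* buf_ring variant: multi-producer, single-consumer ring. */
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif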
2743 static __inline struct hn_txdesc *
2744 hn_txdesc_get(struct hn_tx_ring *txr)
2746 struct hn_txdesc *txd;
2748 #ifndef HN_USE_TXDESC_BUFRING
2749 mtx_lock_spin(&txr->hn_txlist_spin);
2750 txd = SLIST_FIRST(&txr->hn_txlist);
2752 KASSERT(txr->hn_txdesc_avail > 0,
2753 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2754 txr->hn_txdesc_avail--;
2755 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2757 mtx_unlock_spin(&txr->hn_txlist_spin);
2759 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2763 #ifdef HN_USE_TXDESC_BUFRING
2765 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2767 #endif /* HN_USE_TXDESC_BUFRING */
2768 KASSERT(txd->m == NULL && txd->refs == 0 &&
2769 STAILQ_EMPTY(&txd->agg_list) &&
2770 txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2771 txd->chim_size == 0 &&
2772 (txd->flags & HN_TXD_FLAG_ONLIST) &&
2773 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2774 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2775 txd->flags &= ~HN_TXD_FLAG_ONLIST;
2781 static __inline void
2782 hn_txdesc_hold(struct hn_txdesc *txd)
2785 /* 0->1 transition will never work */
2786 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2787 atomic_add_int(&txd->refs, 1);
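/*
 * Reference-count sketch, not compiled: a txdesc leaves the free list
 * with refs == 1 (see hn_txdesc_get() above); hn_txdesc_hold() pins it,
 * and each hn_txdesc_put() drops one reference, recycling the
 * descriptor only on the 1 -> 0 transition.
 */
#if 0
	txd = hn_txdesc_get(txr);	/* refs: 0 -> 1 */
	hn_txdesc_hold(txd);		/* refs: 1 -> 2, e.g. across BPF tap */
	hn_txdesc_put(txr, txd);	/* refs: 2 -> 1, txd still live */
	hn_txdesc_put(txr, txd);	/* refs: 1 -> 0, txd recycled */
#endif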
2790 static __inline void
2791 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2794 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2795 ("recursive aggregation on aggregating txdesc"));
2797 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2798 ("already aggregated"));
2799 KASSERT(STAILQ_EMPTY(&txd->agg_list),
2800 ("recursive aggregation on to-be-aggregated txdesc"));
2802 txd->flags |= HN_TXD_FLAG_ONAGG;
2803 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2807 hn_tx_ring_pending(struct hn_tx_ring *txr)
2809 bool pending = false;
2811 #ifndef HN_USE_TXDESC_BUFRING
2812 mtx_lock_spin(&txr->hn_txlist_spin);
2813 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2815 mtx_unlock_spin(&txr->hn_txlist_spin);
2817 if (!buf_ring_full(txr->hn_txdesc_br))
2823 static __inline void
2824 hn_txeof(struct hn_tx_ring *txr)
2826 txr->hn_has_txeof = 0;
2831 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2832 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2834 struct hn_txdesc *txd = sndc->hn_cbarg;
2835 struct hn_tx_ring *txr;
2838 KASSERT(txr->hn_chan == chan,
2839 ("channel mismatch, on chan%u, should be chan%u",
2840 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2842 txr->hn_has_txeof = 1;
2843 hn_txdesc_put(txr, txd);
2845 ++txr->hn_txdone_cnt;
2846 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2847 txr->hn_txdone_cnt = 0;
2848 if (txr->hn_oactive)
2854 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2856 #if defined(INET) || defined(INET6)
2857 struct lro_ctrl *lro = &rxr->hn_lro;
2858 struct lro_entry *queued;
2860 while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
2861 SLIST_REMOVE_HEAD(&lro->lro_active, next);
2862 tcp_lro_flush(lro, queued);
2868 * 'txr' can be NULL, if multiple channels and the
2869 * ifnet.if_start method are enabled.
2871 if (txr == NULL || !txr->hn_has_txeof)
2874 txr->hn_txdone_cnt = 0;
2878 static __inline uint32_t
2879 hn_rndis_pktmsg_offset(uint32_t ofs)
2882 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2883 ("invalid RNDIS packet msg offset %u", ofs));
2884 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
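/*
 * Worked example, assuming the net/rndis.h layout where rm_dataoffset
 * sits 8 bytes into struct rndis_packet_msg: RNDIS offsets are counted
 * from the rm_dataoffset field rather than from the message start, so
 * the conversion simply subtracts those 8 bytes.  pkt_hlen, pi_len and
 * dataofs are hypothetical locals.
 */
#if 0
	/* Data begins right after a pkt_hlen-byte RNDIS header ... */
	uint32_t pkt_hlen = sizeof(struct rndis_packet_msg) + pi_len;
	/* ... so the on-wire field holds pkt_hlen - 8. */
	uint32_t dataofs = hn_rndis_pktmsg_offset(pkt_hlen);
#endif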
2887 static __inline void *
2888 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2889 size_t pi_dlen, uint32_t pi_type)
2891 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2892 struct rndis_pktinfo *pi;
2894 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2895 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2898 * Per-packet-info does not move; it only grows.
2901 * rm_pktinfooffset in this phase counts from the beginning
2902 * of rndis_packet_msg.
2904 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2905 ("%u pktinfo overflows RNDIS packet msg", pi_type));
2906 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2907 pkt->rm_pktinfolen);
2908 pkt->rm_pktinfolen += pi_size;
2910 pi->rm_size = pi_size;
2911 pi->rm_type = pi_type;
2912 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2914 return (pi->rm_data);
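/*
 * Usage sketch, not compiled (it mirrors hn_encap() below): reserve one
 * 32-bit pktinfo and fill in its payload through the returned pointer.
 */
#if 0
	uint32_t *pi_data;

	pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
	    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
	*pi_data = txr->hn_tx_idx;	/* e.g. an RSS hash value */
#endif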
2918 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2920 struct hn_txdesc *txd;
2924 txd = txr->hn_agg_txd;
2925 KASSERT(txd != NULL, ("no aggregate txdesc"));
2928 * Since hn_txpkt() will reset this temporary stat, save
2929 * it now so that oerrors can be updated properly if
2930 * hn_txpkt() ever fails.
2932 pkts = txr->hn_stat_pkts;
2935 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2936 * failure, save it for later freeing if hn_txpkt() ever fails.
2940 error = hn_txpkt(ifp, txr, txd);
2941 if (__predict_false(error)) {
2942 /* txd is freed, but m is not. */
2945 txr->hn_flush_failed++;
2946 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2949 /* Reset all aggregation states. */
2950 txr->hn_agg_txd = NULL;
2951 txr->hn_agg_szleft = 0;
2952 txr->hn_agg_pktleft = 0;
2953 txr->hn_agg_prevpkt = NULL;
2959 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2964 if (txr->hn_agg_txd != NULL) {
2965 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2966 struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2967 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2971 * Update the previous RNDIS packet's total length;
2972 * it can increase due to the mandatory alignment
2973 * padding for this RNDIS packet. Also update the
2974 * aggregating txdesc's chimney sending buffer size
2978 * Zero-out the padding, as required by the RNDIS spec.
2981 pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2982 agg_txd->chim_size += pkt->rm_len - olen;
2984 /* Link this txdesc to the parent. */
2985 hn_txdesc_agg(agg_txd, txd);
2987 chim = (uint8_t *)pkt + pkt->rm_len;
2988 /* Save the current packet for later fixup. */
2989 txr->hn_agg_prevpkt = chim;
2991 txr->hn_agg_pktleft--;
2992 txr->hn_agg_szleft -= pktsize;
2993 if (txr->hn_agg_szleft <=
2994 HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2996 * We probably can't aggregate more packets;
2997 * flush this aggregating txdesc proactively.
2999 txr->hn_agg_pktleft = 0;
3004 hn_flush_txagg(ifp, txr);
3006 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3008 txr->hn_tx_chimney_tried++;
3009 txd->chim_index = hn_chim_alloc(txr->hn_sc);
3010 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3012 txr->hn_tx_chimney++;
3014 chim = txr->hn_sc->hn_chim +
3015 (txd->chim_index * txr->hn_sc->hn_chim_szmax);
3017 if (txr->hn_agg_pktmax > 1 &&
3018 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3019 txr->hn_agg_txd = txd;
3020 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3021 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3022 txr->hn_agg_prevpkt = chim;
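/*
 * Worked example, not compiled: with an 8-byte aggregation alignment, a
 * 123-byte RNDIS packet is padded to 128 bytes; the 5 pad bytes are
 * zeroed and charged against the aggregating txdesc's chimney size.
 * olen, newlen and pad are hypothetical locals.
 */
#if 0
	uint32_t olen = 123;
	uint32_t newlen = roundup2(olen, 8);	/* 128 */
	uint32_t pad = newlen - olen;		/* 5 bytes of zeroed padding */
#endif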
3029 * If this function fails, then both txd and m_head0 will be freed.
3032 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3033 struct mbuf **m_head0)
3035 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3036 int error, nsegs, i;
3037 struct mbuf *m_head = *m_head0;
3038 struct rndis_packet_msg *pkt;
3041 int pkt_hlen, pkt_size;
3043 pkt = txd->rndis_pkt;
3044 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3045 if (pkt_size < txr->hn_chim_size) {
3046 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3050 if (txr->hn_agg_txd != NULL)
3051 hn_flush_txagg(ifp, txr);
3054 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3055 pkt->rm_len = m_head->m_pkthdr.len;
3056 pkt->rm_dataoffset = 0;
3057 pkt->rm_datalen = m_head->m_pkthdr.len;
3058 pkt->rm_oobdataoffset = 0;
3059 pkt->rm_oobdatalen = 0;
3060 pkt->rm_oobdataelements = 0;
3061 pkt->rm_pktinfooffset = sizeof(*pkt);
3062 pkt->rm_pktinfolen = 0;
3063 pkt->rm_vchandle = 0;
3064 pkt->rm_reserved = 0;
3066 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3068 * Set the hash value for this packet, so that the host could
3069 * dispatch the TX done event for this packet back to this TX
3072 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3073 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3074 *pi_data = txr->hn_tx_idx;
3077 if (m_head->m_flags & M_VLANTAG) {
3078 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3079 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3080 *pi_data = NDIS_VLAN_INFO_MAKE(
3081 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3082 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3083 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3086 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3087 #if defined(INET6) || defined(INET)
3088 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3089 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3091 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3092 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3093 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3094 m_head->m_pkthdr.tso_segsz);
3097 #if defined(INET6) && defined(INET)
3102 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3103 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3104 m_head->m_pkthdr.tso_segsz);
3107 #endif /* INET6 || INET */
3108 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3109 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3110 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3111 if (m_head->m_pkthdr.csum_flags &
3112 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3113 *pi_data = NDIS_TXCSUM_INFO_IPV6;
3115 *pi_data = NDIS_TXCSUM_INFO_IPV4;
3116 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3117 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
3120 if (m_head->m_pkthdr.csum_flags &
3121 (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3122 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3123 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3124 } else if (m_head->m_pkthdr.csum_flags &
3125 (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3126 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3127 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3131 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3132 /* Fixup RNDIS packet message total length */
3133 pkt->rm_len += pkt_hlen;
3134 /* Convert RNDIS packet message offsets */
3135 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3136 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3139 * Fast path: Chimney sending.
3142 struct hn_txdesc *tgt_txd = txd;
3144 if (txr->hn_agg_txd != NULL) {
3145 tgt_txd = txr->hn_agg_txd;
3151 KASSERT(pkt == chim,
3152 ("RNDIS pkt not in chimney sending buffer"));
3153 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3154 ("chimney sending buffer is not used"));
3155 tgt_txd->chim_size += pkt->rm_len;
3157 m_copydata(m_head, 0, m_head->m_pkthdr.len,
3158 ((uint8_t *)chim) + pkt_hlen);
3160 txr->hn_gpa_cnt = 0;
3161 txr->hn_sendpkt = hn_txpkt_chim;
3165 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3166 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3167 ("chimney buffer is used"));
3168 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3170 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3171 if (__predict_false(error)) {
3175 * This mbuf is not linked w/ the txd yet, so free it now.
3180 freed = hn_txdesc_put(txr, txd);
3182 ("fail to free txd upon txdma error"));
3184 txr->hn_txdma_failed++;
3185 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3190 /* +1 RNDIS packet message */
3191 txr->hn_gpa_cnt = nsegs + 1;
3193 /* send packet with page buffer */
3194 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3195 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3196 txr->hn_gpa[0].gpa_len = pkt_hlen;
3199 * Fill the page buffers with mbuf info after the page
3200 * buffer for RNDIS packet message.
3202 for (i = 0; i < nsegs; ++i) {
3203 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3205 gpa->gpa_page = atop(segs[i].ds_addr);
3206 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3207 gpa->gpa_len = segs[i].ds_len;
3210 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3212 txr->hn_sendpkt = hn_txpkt_sglist;
3216 /* Set the completion routine */
3217 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3219 /* Update temporary stats for later use. */
3220 txr->hn_stat_pkts++;
3221 txr->hn_stat_size += m_head->m_pkthdr.len;
3222 if (m_head->m_flags & M_MCAST)
3223 txr->hn_stat_mcasts++;
3230 * If this function fails, then txd will be freed, but the mbuf
3231 * associated w/ the txd will _not_ be freed.
3234 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3236 int error, send_failed = 0, has_bpf;
3239 has_bpf = bpf_peers_present(ifp->if_bpf);
3242 * Make sure that this txd and any aggregated txds are not
3243 * freed before ETHER_BPF_MTAP.
3245 hn_txdesc_hold(txd);
3247 error = txr->hn_sendpkt(txr, txd);
3250 const struct hn_txdesc *tmp_txd;
3252 ETHER_BPF_MTAP(ifp, txd->m);
3253 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3254 ETHER_BPF_MTAP(ifp, tmp_txd->m);
3257 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3258 #ifdef HN_IFSTART_SUPPORT
3259 if (!hn_use_if_start)
3262 if_inc_counter(ifp, IFCOUNTER_OBYTES,
3264 if (txr->hn_stat_mcasts != 0) {
3265 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3266 txr->hn_stat_mcasts);
3269 txr->hn_pkts += txr->hn_stat_pkts;
3273 hn_txdesc_put(txr, txd);
3275 if (__predict_false(error)) {
3279 * This should "really rarely" happen.
3281 * XXX Too many RX to be acked or too many sideband
3282 * commands to run? Ask netvsc_channel_rollup()
3283 * to kick start later.
3285 txr->hn_has_txeof = 1;
3287 txr->hn_send_failed++;
3290 * Try sending again after setting hn_has_txeof,
3291 * in case we missed the last
3292 * netvsc_channel_rollup().
3296 if_printf(ifp, "send failed\n");
3299 * Caller will perform further processing on the
3300 * associated mbuf, so don't free it in hn_txdesc_put();
3301 * only unload it from the DMA map in hn_txdesc_put(),
3305 freed = hn_txdesc_put(txr, txd);
3307 ("fail to free txd upon send error"));
3309 txr->hn_send_failed++;
3312 /* Reset temporary stats, after this sending is done. */
3313 txr->hn_stat_size = 0;
3314 txr->hn_stat_pkts = 0;
3315 txr->hn_stat_mcasts = 0;
3321 * Append the specified data to the indicated mbuf chain.
3322 * Extend the mbuf chain if the new data does not fit in the existing space.
3325 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3326 * There should be an equivalent in the kernel mbuf code,
3327 * but there does not appear to be one yet.
3329 * Differs from m_append() in that additional mbufs are
3330 * allocated with cluster size MJUMPAGESIZE, and filled accordingly.
3333 * Return 1 if able to complete the job; otherwise 0.
3336 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3339 int remainder, space;
3341 for (m = m0; m->m_next != NULL; m = m->m_next)
3344 space = M_TRAILINGSPACE(m);
3347 * Copy into available space.
3349 if (space > remainder)
3351 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3356 while (remainder > 0) {
3358 * Allocate a new mbuf; could check space
3359 * and allocate a cluster instead.
3361 n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
3364 n->m_len = min(MJUMPAGESIZE, remainder);
3365 bcopy(cp, mtod(n, caddr_t), n->m_len);
3367 remainder -= n->m_len;
3371 if (m0->m_flags & M_PKTHDR)
3372 m0->m_pkthdr.len += len - remainder;
3374 return (remainder == 0);
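/*
 * Usage sketch, not compiled: copying a received buffer into a fresh
 * jumbo-cluster mbuf, as the large-packet path of hn_rxpkt() below
 * does; 'data' and 'dlen' stand for the RX payload and are hypothetical
 * here.
 */
#if 0
	struct mbuf *m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUMPAGESIZE);

	if (m != NULL && !hv_m_append(m, dlen, data)) {
		/* Ran out of mbufs mid-copy; drop the partial chain. */
		m_freem(m);
		m = NULL;
	}
#endif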
3377 #if defined(INET) || defined(INET6)
3379 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3381 #if __FreeBSD_version >= 1100095
3382 if (hn_lro_mbufq_depth) {
3383 tcp_lro_queue_mbuf(lc, m);
3387 return tcp_lro_rx(lc, m, 0);
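/*
 * Usage sketch, not compiled (it mirrors hn_rxpkt() below): a zero
 * return means LRO consumed the mbuf, either queued for a later batched
 * flush or aggregated immediately; otherwise the mbuf is handed to
 * if_input directly.
 */
#if 0
	if (hn_lro_rx(&rxr->hn_lro, m_new) != 0)
		ifp->if_input(ifp, m_new);	/* LRO rejected the mbuf */
#endif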
3392 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3393 const struct hn_rxinfo *info)
3395 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3397 int size, do_lro = 0, do_csum = 1, is_vf = 0;
3398 int hash_type = M_HASHTYPE_NONE;
3399 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3402 if (rxr->hn_rxvf_ifp != NULL) {
3404 * Non-transparent mode VF; pretend this packet is from
3407 ifp = rxr->hn_rxvf_ifp;
3409 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3410 /* Transparent mode VF. */
3414 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3417 * See the NOTE of hn_rndis_init_fixat(). This
3418 * function can be reached immediately after the
3419 * RNDIS is initialized but before the ifnet is
3420 * set up on the hn_attach() path; drop the unexpected packets.
3426 if (__predict_false(dlen < ETHER_HDR_LEN)) {
3427 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3431 if (dlen <= MHLEN) {
3432 m_new = m_gethdr(M_NOWAIT, MT_DATA);
3433 if (m_new == NULL) {
3434 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3437 memcpy(mtod(m_new, void *), data, dlen);
3438 m_new->m_pkthdr.len = m_new->m_len = dlen;
3439 rxr->hn_small_pkts++;
3442 * Get an mbuf with a cluster. For packets 2K or less,
3443 * get a standard 2K cluster. For anything larger, get a
3444 * 4K cluster. Any buffers larger than 4K can cause problems
3445 * if looped around to the Hyper-V TX channel, so avoid them.
3448 if (dlen > MCLBYTES) {
3450 size = MJUMPAGESIZE;
3453 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3454 if (m_new == NULL) {
3455 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3459 hv_m_append(m_new, dlen, data);
3461 m_new->m_pkthdr.rcvif = ifp;
3463 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3466 /* receive side checksum offload */
3467 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3468 /* IP csum offload */
3469 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3470 m_new->m_pkthdr.csum_flags |=
3471 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3475 /* TCP/UDP csum offload */
3476 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3477 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3478 m_new->m_pkthdr.csum_flags |=
3479 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3480 m_new->m_pkthdr.csum_data = 0xffff;
3481 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3489 * As of this writing (Oct 28th, 2016), the host side will turn
3490 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3491 * the do_lro setting here is actually _not_ accurate. We
3492 * depend on the RSS hash type check to reset do_lro.
3494 if ((info->csum_info &
3495 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3496 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3499 hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3500 if (l3proto == ETHERTYPE_IP) {
3501 if (l4proto == IPPROTO_TCP) {
3503 (rxr->hn_trust_hcsum &
3504 HN_TRUST_HCSUM_TCP)) {
3505 rxr->hn_csum_trusted++;
3506 m_new->m_pkthdr.csum_flags |=
3507 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3508 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3509 m_new->m_pkthdr.csum_data = 0xffff;
3512 } else if (l4proto == IPPROTO_UDP) {
3514 (rxr->hn_trust_hcsum &
3515 HN_TRUST_HCSUM_UDP)) {
3516 rxr->hn_csum_trusted++;
3517 m_new->m_pkthdr.csum_flags |=
3518 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3519 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3520 m_new->m_pkthdr.csum_data = 0xffff;
3522 } else if (l4proto != IPPROTO_DONE && do_csum &&
3523 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3524 rxr->hn_csum_trusted++;
3525 m_new->m_pkthdr.csum_flags |=
3526 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3531 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3532 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3533 NDIS_VLAN_INFO_ID(info->vlan_info),
3534 NDIS_VLAN_INFO_PRI(info->vlan_info),
3535 NDIS_VLAN_INFO_CFI(info->vlan_info));
3536 m_new->m_flags |= M_VLANTAG;
3540 * If VF is activated (transparent/non-transparent mode does not
3545 * hn(4) will only receive broadcast packets, multicast packets,
3546 * TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3549 * For non-transparent mode, we definitely _cannot_ enable LRO at
3550 * all, since the LRO flush will use hn(4) as the receiving
3551 * interface; i.e. hn_ifp->if_input(hn_ifp, m).
3557 * If VF is activated (transparent/non-transparent mode does not
3558 * matter here), do _not_ mess with unsupported hash types or
3561 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3563 m_new->m_pkthdr.flowid = info->hash_value;
3565 hash_type = M_HASHTYPE_OPAQUE;
3566 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3567 NDIS_HASH_FUNCTION_TOEPLITZ) {
3568 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3573 * do_lro is reset if the hash types are not TCP
3574 * related. See the comment in the above csum_flags
3578 case NDIS_HASH_IPV4:
3579 hash_type = M_HASHTYPE_RSS_IPV4;
3583 case NDIS_HASH_TCP_IPV4:
3584 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3585 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3586 int def_htype = M_HASHTYPE_OPAQUE;
3589 def_htype = M_HASHTYPE_NONE;
3592 * UDP 4-tuple hash is delivered as TCP 4-tuple hash.
3595 if (l3proto == ETHERTYPE_MAX) {
3596 hn_rxpkt_proto(m_new,
3597 &l3proto, &l4proto);
3599 if (l3proto == ETHERTYPE_IP) {
3600 if (l4proto == IPPROTO_UDP &&
3601 (rxr->hn_mbuf_hash &
3602 NDIS_HASH_UDP_IPV4_X)) {
3604 M_HASHTYPE_RSS_UDP_IPV4;
3606 } else if (l4proto !=
3608 hash_type = def_htype;
3612 hash_type = def_htype;
3618 case NDIS_HASH_IPV6:
3619 hash_type = M_HASHTYPE_RSS_IPV6;
3623 case NDIS_HASH_IPV6_EX:
3624 hash_type = M_HASHTYPE_RSS_IPV6_EX;
3628 case NDIS_HASH_TCP_IPV6:
3629 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3632 case NDIS_HASH_TCP_IPV6_EX:
3633 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3637 } else if (!is_vf) {
3638 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3640 M_HASHTYPE_SET(m_new, hash_type);
3642 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3643 if (hn_ifp != ifp) {
3644 const struct ether_header *eh;
3647 * Non-transparent mode VF is activated.
3651 * Allow tapping on hn(4).
3653 ETHER_BPF_MTAP(hn_ifp, m_new);
3656 * Update hn(4)'s stats.
3658 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3659 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3660 /* Checked at the beginning of this function. */
3661 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3662 eh = mtod(m_new, struct ether_header *);
3663 if (ETHER_IS_MULTICAST(eh->ether_dhost))
3664 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3668 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3669 #if defined(INET) || defined(INET6)
3670 struct lro_ctrl *lro = &rxr->hn_lro;
3673 rxr->hn_lro_tried++;
3674 if (hn_lro_rx(lro, m_new) == 0) {
3681 ifp->if_input(ifp, m_new);
3687 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3689 struct hn_softc *sc = ifp->if_softc;
3690 struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3691 struct ifnet *vf_ifp;
3692 int mask, error = 0;
3693 struct ifrsskey *ifrk;
3694 struct ifrsshash *ifrh;
3699 if (ifr->ifr_mtu > HN_MTU_MAX) {
3706 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3711 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3712 /* Can't change MTU */
3718 if (ifp->if_mtu == ifr->ifr_mtu) {
3723 if (hn_xpnt_vf_isready(sc)) {
3724 vf_ifp = sc->hn_vf_ifp;
3726 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3727 sizeof(ifr_vf.ifr_name));
3728 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3732 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3733 vf_ifp->if_xname, ifr->ifr_mtu, error);
3739 * Suspend this interface before the synthetic parts
3745 * Detach the synthetics parts, i.e. NVS and RNDIS.
3747 hn_synth_detach(sc);
3750 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3751 * with the new MTU setting.
3753 error = hn_synth_attach(sc, ifr->ifr_mtu);
3759 error = hn_rndis_get_mtu(sc, &mtu);
3762 else if (bootverbose)
3763 if_printf(ifp, "RNDIS mtu %u\n", mtu);
3766 * Commit the requested MTU, after the synthetic parts
3767 * have been successfully attached.
3769 if (mtu >= ifr->ifr_mtu) {
3772 if_printf(ifp, "fixup mtu %d -> %u\n",
3778 * Synthetic parts' reattach may change the chimney
3779 * sending size; update it.
3781 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3782 hn_set_chim_size(sc, sc->hn_chim_szmax);
3785 * Make sure that various parameters based on MTU are
3786 * still valid, after the MTU change.
3788 hn_mtu_change_fixup(sc);
3791 * All done! Resume the interface now.
3795 if ((sc->hn_flags & HN_FLAG_RXVF) ||
3796 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3798 * Since we have reattached the NVS part,
3799 * change the datapath to VF again, in case
3800 * it was lost when the NVS was detached.
3802 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3811 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3816 if (hn_xpnt_vf_isready(sc))
3817 hn_xpnt_vf_saveifflags(sc);
3819 if (ifp->if_flags & IFF_UP) {
3820 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3822 * Caller might hold a mutex, e.g.
3823 * bpf; use busy-wait for the RNDIS reply.
3827 hn_rxfilter_config(sc);
3830 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3831 error = hn_xpnt_vf_iocsetflags(sc);
3836 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3839 sc->hn_if_flags = ifp->if_flags;
3847 if (hn_xpnt_vf_isready(sc)) {
3849 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3850 sizeof(ifr_vf.ifr_name));
3851 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3857 * Fix up requested capabilities w/ supported capabilities,
3858 * since the supported capabilities could have been changed.
3860 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3863 if (mask & IFCAP_TXCSUM) {
3864 ifp->if_capenable ^= IFCAP_TXCSUM;
3865 if (ifp->if_capenable & IFCAP_TXCSUM)
3866 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3868 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3870 if (mask & IFCAP_TXCSUM_IPV6) {
3871 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3872 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3873 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3875 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3878 /* TODO: flip RNDIS offload parameters for RXCSUM. */
3879 if (mask & IFCAP_RXCSUM)
3880 ifp->if_capenable ^= IFCAP_RXCSUM;
3882 /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
3883 if (mask & IFCAP_RXCSUM_IPV6)
3884 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3887 if (mask & IFCAP_LRO)
3888 ifp->if_capenable ^= IFCAP_LRO;
3890 if (mask & IFCAP_TSO4) {
3891 ifp->if_capenable ^= IFCAP_TSO4;
3892 if (ifp->if_capenable & IFCAP_TSO4)
3893 ifp->if_hwassist |= CSUM_IP_TSO;
3895 ifp->if_hwassist &= ~CSUM_IP_TSO;
3897 if (mask & IFCAP_TSO6) {
3898 ifp->if_capenable ^= IFCAP_TSO6;
3899 if (ifp->if_capenable & IFCAP_TSO6)
3900 ifp->if_hwassist |= CSUM_IP6_TSO;
3902 ifp->if_hwassist &= ~CSUM_IP6_TSO;
3912 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3916 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3918 * Multicast uses mutex; use busy-wait for the RNDIS reply.
3922 hn_rxfilter_config(sc);
3926 /* XXX vlan(4) style mcast addr maintenance */
3927 if (hn_xpnt_vf_isready(sc)) {
3930 old_if_flags = sc->hn_vf_ifp->if_flags;
3931 hn_xpnt_vf_saveifflags(sc);
3933 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3934 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3936 error = hn_xpnt_vf_iocsetflags(sc);
3945 if (hn_xpnt_vf_isready(sc)) {
3947 * SIOCGIFMEDIA expects ifmediareq, so don't
3948 * create and pass ifr_vf to the VF here; just
3949 * replace the ifr_name.
3951 vf_ifp = sc->hn_vf_ifp;
3952 strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3953 sizeof(ifr->ifr_name));
3954 error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3955 /* Restore the ifr_name. */
3956 strlcpy(ifr->ifr_name, ifp->if_xname,
3957 sizeof(ifr->ifr_name));
3962 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3965 case SIOCGIFRSSHASH:
3966 ifrh = (struct ifrsshash *)data;
3968 if (sc->hn_rx_ring_inuse == 1) {
3970 ifrh->ifrh_func = RSS_FUNC_NONE;
3971 ifrh->ifrh_types = 0;
3975 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3976 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3978 ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3979 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3984 ifrk = (struct ifrsskey *)data;
3986 if (sc->hn_rx_ring_inuse == 1) {
3988 ifrk->ifrk_func = RSS_FUNC_NONE;
3989 ifrk->ifrk_keylen = 0;
3992 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3993 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3995 ifrk->ifrk_func = RSS_FUNC_PRIVATE;
3996 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
3997 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
3998 NDIS_HASH_KEYSIZE_TOEPLITZ);
4003 error = ether_ioctl(ifp, cmd, data);
4010 hn_stop(struct hn_softc *sc, bool detaching)
4012 struct ifnet *ifp = sc->hn_ifp;
4017 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4018 ("synthetic parts were not attached"));
4020 /* Clear RUNNING bit ASAP. */
4021 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4023 /* Disable polling. */
4026 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4027 KASSERT(sc->hn_vf_ifp != NULL,
4028 ("%s: VF is not attached", ifp->if_xname));
4030 /* Mark transparent mode VF as disabled. */
4031 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4035 * Datapath setting must happen _before_ bringing
4038 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4041 * Bring the VF down.
4043 hn_xpnt_vf_saveifflags(sc);
4044 sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4045 hn_xpnt_vf_iocsetflags(sc);
4048 /* Suspend data transfers. */
4049 hn_suspend_data(sc);
4051 /* Clear OACTIVE bit. */
4052 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4053 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4054 sc->hn_tx_ring[i].hn_oactive = 0;
4057 * If the non-transparent mode VF is active, make sure
4058 * that the RX filter still allows packet reception.
4060 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4061 hn_rxfilter_config(sc);
4065 hn_init_locked(struct hn_softc *sc)
4067 struct ifnet *ifp = sc->hn_ifp;
4072 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4075 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4078 /* Configure RX filter */
4079 hn_rxfilter_config(sc);
4081 /* Clear OACTIVE bit. */
4082 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4083 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4084 sc->hn_tx_ring[i].hn_oactive = 0;
4086 /* Clear TX 'suspended' bit. */
4087 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4089 if (hn_xpnt_vf_isready(sc)) {
4090 /* Initialize transparent VF. */
4091 hn_xpnt_vf_init(sc);
4094 /* Everything is ready; unleash! */
4095 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4097 /* Re-enable polling if requested. */
4098 if (sc->hn_pollhz > 0)
4099 hn_polling(sc, sc->hn_pollhz);
4105 struct hn_softc *sc = xsc;
4112 #if __FreeBSD_version >= 1100099
4115 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4117 struct hn_softc *sc = arg1;
4118 unsigned int lenlim;
4121 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4122 error = sysctl_handle_int(oidp, &lenlim, 0, req);
4123 if (error || req->newptr == NULL)
4127 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4128 lenlim > TCP_LRO_LENGTH_MAX) {
4132 hn_set_lro_lenlim(sc, lenlim);
4139 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4141 struct hn_softc *sc = arg1;
4142 int ackcnt, error, i;
4145 * lro_ackcnt_lim is the append count limit;
4146 * +1 turns it into the aggregation limit.
4148 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4149 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4150 if (error || req->newptr == NULL)
4153 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4157 * Convert the aggregation limit back to the append count limit.
4162 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4163 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
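/*
 * Worked example, not compiled: a stored append limit of 1 is reported
 * as an aggregation limit of 2; writing an aggregation limit of 4
 * stores an append limit of 3 into every RX ring.  'rxr' stands for
 * one of the sc->hn_rx_ring entries.
 */
#if 0
	int report = rxr->hn_lro.lro_ackcnt_lim + 1;	/* 1 -> reports 2 */
	rxr->hn_lro.lro_ackcnt_lim = 4 - 1;		/* write 4 -> stores 3 */
#endif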
4171 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4173 struct hn_softc *sc = arg1;
4178 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4181 error = sysctl_handle_int(oidp, &on, 0, req);
4182 if (error || req->newptr == NULL)
4186 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4187 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4190 rxr->hn_trust_hcsum |= hcsum;
4192 rxr->hn_trust_hcsum &= ~hcsum;
4199 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4201 struct hn_softc *sc = arg1;
4202 int chim_size, error;
4204 chim_size = sc->hn_tx_ring[0].hn_chim_size;
4205 error = sysctl_handle_int(oidp, &chim_size, 0, req);
4206 if (error || req->newptr == NULL)
4209 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4213 hn_set_chim_size(sc, chim_size);
4218 #if __FreeBSD_version < 1100095
4220 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4222 struct hn_softc *sc = arg1;
4223 int ofs = arg2, i, error;
4224 struct hn_rx_ring *rxr;
4228 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
4229 rxr = &sc->hn_rx_ring[i];
4230 stat += *((int *)((uint8_t *)rxr + ofs));
4233 error = sysctl_handle_64(oidp, &stat, 0, req);
4234 if (error || req->newptr == NULL)
4237 /* Zero out this stat. */
4238 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
4239 rxr = &sc->hn_rx_ring[i];
4240 *((int *)((uint8_t *)rxr + ofs)) = 0;
4246 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4248 struct hn_softc *sc = arg1;
4249 int ofs = arg2, i, error;
4250 struct hn_rx_ring *rxr;
4254 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4255 rxr = &sc->hn_rx_ring[i];
4256 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4259 error = sysctl_handle_64(oidp, &stat, 0, req);
4260 if (error || req->newptr == NULL)
4263 /* Zero out this stat. */
4264 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4265 rxr = &sc->hn_rx_ring[i];
4266 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4274 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4276 struct hn_softc *sc = arg1;
4277 int ofs = arg2, i, error;
4278 struct hn_rx_ring *rxr;
4282 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4283 rxr = &sc->hn_rx_ring[i];
4284 stat += *((u_long *)((uint8_t *)rxr + ofs));
4287 error = sysctl_handle_long(oidp, &stat, 0, req);
4288 if (error || req->newptr == NULL)
4291 /* Zero out this stat. */
4292 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4293 rxr = &sc->hn_rx_ring[i];
4294 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
4300 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4302 struct hn_softc *sc = arg1;
4303 int ofs = arg2, i, error;
4304 struct hn_tx_ring *txr;
4308 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4309 txr = &sc->hn_tx_ring[i];
4310 stat += *((u_long *)((uint8_t *)txr + ofs));
4313 error = sysctl_handle_long(oidp, &stat, 0, req);
4314 if (error || req->newptr == NULL)
4317 /* Zero out this stat. */
4318 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4319 txr = &sc->hn_tx_ring[i];
4320 *((u_long *)((uint8_t *)txr + ofs)) = 0;
4326 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4328 struct hn_softc *sc = arg1;
4329 int ofs = arg2, i, error, conf;
4330 struct hn_tx_ring *txr;
4332 txr = &sc->hn_tx_ring[0];
4333 conf = *((int *)((uint8_t *)txr + ofs));
4335 error = sysctl_handle_int(oidp, &conf, 0, req);
4336 if (error || req->newptr == NULL)
4340 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4341 txr = &sc->hn_tx_ring[i];
4342 *((int *)((uint8_t *)txr + ofs)) = conf;
4350 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4352 struct hn_softc *sc = arg1;
4355 size = sc->hn_agg_size;
4356 error = sysctl_handle_int(oidp, &size, 0, req);
4357 if (error || req->newptr == NULL)
4361 sc->hn_agg_size = size;
4369 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4371 struct hn_softc *sc = arg1;
4374 pkts = sc->hn_agg_pkts;
4375 error = sysctl_handle_int(oidp, &pkts, 0, req);
4376 if (error || req->newptr == NULL)
4380 sc->hn_agg_pkts = pkts;
4388 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4390 struct hn_softc *sc = arg1;
4393 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4394 return (sysctl_handle_int(oidp, &pkts, 0, req));
4398 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4400 struct hn_softc *sc = arg1;
4403 align = sc->hn_tx_ring[0].hn_agg_align;
4404 return (sysctl_handle_int(oidp, &align, 0, req));
4408 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4411 vmbus_chan_poll_disable(chan);
4413 vmbus_chan_poll_enable(chan, pollhz);
4417 hn_polling(struct hn_softc *sc, u_int pollhz)
4419 int nsubch = sc->hn_rx_ring_inuse - 1;
4424 struct vmbus_channel **subch;
4427 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4428 for (i = 0; i < nsubch; ++i)
4429 hn_chan_polling(subch[i], pollhz);
4430 vmbus_subchan_rel(subch, nsubch);
4432 hn_chan_polling(sc->hn_prichan, pollhz);
4436 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4438 struct hn_softc *sc = arg1;
4441 pollhz = sc->hn_pollhz;
4442 error = sysctl_handle_int(oidp, &pollhz, 0, req);
4443 if (error || req->newptr == NULL)
4447 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4451 if (sc->hn_pollhz != pollhz) {
4452 sc->hn_pollhz = pollhz;
4453 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4454 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4455 hn_polling(sc, sc->hn_pollhz);
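/*
 * Usage sketch, not compiled: flipping a running interface into polling
 * mode under HN_LOCK; 0 disables polling, any other rate is bounded by
 * VMBUS_CHAN_POLLHZ_MIN/MAX.  From userland this is e.g.
 * "sysctl dev.hn.0.polling=10000".
 */
#if 0
	HN_LOCK(sc);
	sc->hn_pollhz = 10000;		/* poll at 10 kHz */
	hn_polling(sc, sc->hn_pollhz);	/* apply to every channel */
	HN_UNLOCK(sc);
#endif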
4463 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4465 struct hn_softc *sc = arg1;
4468 snprintf(verstr, sizeof(verstr), "%u.%u",
4469 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4470 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4471 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4475 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4477 struct hn_softc *sc = arg1;
4484 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4485 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4489 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4491 struct hn_softc *sc = arg1;
4492 char assist_str[128];
4496 hwassist = sc->hn_ifp->if_hwassist;
4498 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4499 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4503 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4505 struct hn_softc *sc = arg1;
4506 char filter_str[128];
4510 filter = sc->hn_rx_filter;
4512 snprintf(filter_str, sizeof(filter_str), "%b", filter,
4514 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4518 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4520 struct hn_softc *sc = arg1;
4525 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4526 if (error || req->newptr == NULL)
4529 if ((sc->hn_flags & HN_FLAG_RXVF) ||
4530 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4532 * RSS key is synchronized w/ the VF's; don't allow users
4539 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4542 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4544 if (sc->hn_rx_ring_inuse > 1) {
4545 error = hn_rss_reconfig(sc);
4547 /* Not RSS capable, at least for now; just save the RSS key. */
4556 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4558 struct hn_softc *sc = arg1;
4563 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4564 if (error || req->newptr == NULL)
4568 * Don't allow RSS indirect table changes if this interface is not
4569 * currently RSS capable.
4571 if (sc->hn_rx_ring_inuse == 1) {
4576 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4579 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4581 hn_rss_ind_fixup(sc);
4582 error = hn_rss_reconfig(sc);
4589 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4591 struct hn_softc *sc = arg1;
4596 hash = sc->hn_rss_hash;
4598 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4599 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
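/*
 * Format note, a sketch only: the kernel's "%b" conversion renders a
 * bit field against a description string whose first byte is the
 * numeric base; NDIS_HASH_BITS used above is such a string.  buf and
 * the bit names here are hypothetical.
 */
#if 0
	char buf[32];

	snprintf(buf, sizeof(buf), "%b", 3, "\20\1IPV4\2TCP_IPV4");
	/* buf now holds "3<IPV4,TCP_IPV4>" */
#endif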
4603 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4605 struct hn_softc *sc = arg1;
4610 hash = sc->hn_rss_hcap;
4612 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4613 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4617 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4619 struct hn_softc *sc = arg1;
4624 hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4626 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4627 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4631 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4633 struct hn_softc *sc = arg1;
4634 char vf_name[IFNAMSIZ + 1];
4635 struct ifnet *vf_ifp;
4639 vf_ifp = sc->hn_vf_ifp;
4641 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4643 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4647 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4649 struct hn_softc *sc = arg1;
4650 char vf_name[IFNAMSIZ + 1];
4651 struct ifnet *vf_ifp;
4655 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4657 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4659 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4663 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4665 struct rm_priotracker pt;
4670 error = sysctl_wire_old_buffer(req, 0);
4674 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4678 rm_rlock(&hn_vfmap_lock, &pt);
4681 for (i = 0; i < hn_vfmap_size; ++i) {
4684 if (hn_vfmap[i] == NULL)
4687 ifp = ifnet_byindex(i);
4690 sbuf_printf(sb, "%s", ifp->if_xname);
4692 sbuf_printf(sb, " %s", ifp->if_xname);
4697 rm_runlock(&hn_vfmap_lock, &pt);
4699 error = sbuf_finish(sb);
4705 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4707 struct rm_priotracker pt;
4712 error = sysctl_wire_old_buffer(req, 0);
4716 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4720 rm_rlock(&hn_vfmap_lock, &pt);
4723 for (i = 0; i < hn_vfmap_size; ++i) {
4724 struct ifnet *ifp, *hn_ifp;
4726 hn_ifp = hn_vfmap[i];
4730 ifp = ifnet_byindex(i);
4733 sbuf_printf(sb, "%s:%s", ifp->if_xname,
4736 sbuf_printf(sb, " %s:%s", ifp->if_xname,
4743 rm_runlock(&hn_vfmap_lock, &pt);
4745 error = sbuf_finish(sb);
4751 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4753 struct hn_softc *sc = arg1;
4754 int error, onoff = 0;
4756 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4758 error = sysctl_handle_int(oidp, &onoff, 0, req);
4759 if (error || req->newptr == NULL)
4763 /* NOTE: hn_vf_lock for hn_transmit() */
4764 rm_wlock(&sc->hn_vf_lock);
4766 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4768 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4769 rm_wunlock(&sc->hn_vf_lock);
4776 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4778 struct hn_softc *sc = arg1;
4781 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4783 return (sysctl_handle_int(oidp, &enabled, 0, req));
4787 hn_check_iplen(const struct mbuf *m, int hoff)
4789 const struct ip *ip;
4790 int len, iphlen, iplen;
4791 const struct tcphdr *th;
4792 int thoff; /* TCP data offset */
4794 len = hoff + sizeof(struct ip);
4796 /* The packet must be at least the size of an IP header. */
4797 if (m->m_pkthdr.len < len)
4798 return IPPROTO_DONE;
4800 /* The fixed IP header must reside completely in the first mbuf. */
4802 return IPPROTO_DONE;
4804 ip = mtodo(m, hoff);
4806 /* Bound check the packet's stated IP header length. */
4807 iphlen = ip->ip_hl << 2;
4808 if (iphlen < sizeof(struct ip)) /* minimum header length */
4809 return IPPROTO_DONE;
4811 /* The full IP header must reside completely in one mbuf. */
4812 if (m->m_len < hoff + iphlen)
4813 return IPPROTO_DONE;
4815 iplen = ntohs(ip->ip_len);
4818 	 * Check that the amount of data in the buffers is at least
4819 	 * as much as the IP header would have us expect.
4821 if (m->m_pkthdr.len < hoff + iplen)
4822 return IPPROTO_DONE;
4825 * Ignore IP fragments.
4827 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4828 return IPPROTO_DONE;
4831 * The TCP/IP or UDP/IP header must be entirely contained within
4832 * the first fragment of a packet.
4836 if (iplen < iphlen + sizeof(struct tcphdr))
4837 return IPPROTO_DONE;
4838 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4839 return IPPROTO_DONE;
4840 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4841 thoff = th->th_off << 2;
4842 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4843 return IPPROTO_DONE;
4844 if (m->m_len < hoff + iphlen + thoff)
4845 return IPPROTO_DONE;
4848 if (iplen < iphlen + sizeof(struct udphdr))
4849 return IPPROTO_DONE;
4850 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4851 return IPPROTO_DONE;
4855 return IPPROTO_DONE;
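/*
 * Illustrative sketch (an assumption about the elided callers, not
 * verbatim driver code): hn_check_iplen() returns the L4 protocol on
 * success and IPPROTO_DONE when the headers cannot be fully verified,
 * so a caller would branch roughly as:
 *
 *	switch (hn_check_iplen(m, hoff)) {
 *	case IPPROTO_TCP:
 *	case IPPROTO_UDP:
 *		break;		-- layout verified; csum may be trusted
 *	default:
 *		break;		-- IPPROTO_DONE: leave csum flags alone
 *	}
 */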
4862 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4864 const struct ether_header *eh;
4869 /* Checked at the beginning of this function. */
4870 KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4872 eh = mtod(m_new, const struct ether_header *);
4873 etype = ntohs(eh->ether_type);
4874 if (etype == ETHERTYPE_VLAN) {
4875 const struct ether_vlan_header *evl;
4877 hoff = sizeof(*evl);
4878 if (m_new->m_len < hoff)
4880 evl = mtod(m_new, const struct ether_vlan_header *);
4881 etype = ntohs(evl->evl_proto);
4885 if (etype == ETHERTYPE_IP)
4886 *l4proto = hn_check_iplen(m_new, hoff);
4888 *l4proto = IPPROTO_DONE;
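/*
 * Illustrative note: for an 802.1Q tagged frame the code above grows
 * hoff from sizeof(struct ether_header) (14 bytes) to
 * sizeof(struct ether_vlan_header) (18 bytes) before the inner
 * EtherType and the IP header are inspected.
 */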
4892 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4894 struct sysctl_oid_list *child;
4895 struct sysctl_ctx_list *ctx;
4896 device_t dev = sc->hn_dev;
4897 #if defined(INET) || defined(INET6)
4898 #if __FreeBSD_version >= 1100095
4905 * Create RXBUF for reception.
4908 * - It is shared by all channels.
4909 	 * - A large enough buffer is allocated; certain versions of NVS
4910 	 *   may further limit the usable space.
4912 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4913 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4914 BUS_DMA_WAITOK | BUS_DMA_ZERO);
4915 if (sc->hn_rxbuf == NULL) {
4916 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4920 sc->hn_rx_ring_cnt = ring_cnt;
4921 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4923 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4924 M_DEVBUF, M_WAITOK | M_ZERO);
4926 #if defined(INET) || defined(INET6)
4927 #if __FreeBSD_version >= 1100095
4928 lroent_cnt = hn_lro_entry_count;
4929 if (lroent_cnt < TCP_LRO_ENTRIES)
4930 lroent_cnt = TCP_LRO_ENTRIES;
4932 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4934 #endif /* INET || INET6 */
4936 ctx = device_get_sysctl_ctx(dev);
4937 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4939 /* Create dev.hn.UNIT.rx sysctl tree */
4940 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4941 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4943 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4944 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4946 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4947 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4948 &rxr->hn_br_dma, BUS_DMA_WAITOK);
4949 if (rxr->hn_br == NULL) {
4950 device_printf(dev, "allocate bufring failed\n");
4954 if (hn_trust_hosttcp)
4955 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4956 if (hn_trust_hostudp)
4957 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4958 if (hn_trust_hostip)
4959 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4960 rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4961 rxr->hn_ifp = sc->hn_ifp;
4962 if (i < sc->hn_tx_ring_cnt)
4963 rxr->hn_txr = &sc->hn_tx_ring[i];
4964 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4965 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4967 rxr->hn_rxbuf = sc->hn_rxbuf;
4972 #if defined(INET) || defined(INET6)
4973 #if __FreeBSD_version >= 1100095
4974 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4975 hn_lro_mbufq_depth);
4977 tcp_lro_init(&rxr->hn_lro);
4978 rxr->hn_lro.ifp = sc->hn_ifp;
4980 #if __FreeBSD_version >= 1100099
4981 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4982 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4984 #endif /* INET || INET6 */
4986 if (sc->hn_rx_sysctl_tree != NULL) {
4990 * Create per RX ring sysctl tree:
4991 * dev.hn.UNIT.rx.RINGID
4993 snprintf(name, sizeof(name), "%d", i);
4994 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
4995 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
4996 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4998 if (rxr->hn_rx_sysctl_tree != NULL) {
4999 SYSCTL_ADD_ULONG(ctx,
5000 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5001 OID_AUTO, "packets", CTLFLAG_RW,
5002 &rxr->hn_pkts, "# of packets received");
5003 SYSCTL_ADD_ULONG(ctx,
5004 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5005 OID_AUTO, "rss_pkts", CTLFLAG_RW,
5007 "# of packets w/ RSS info received");
5009 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5010 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5011 &rxr->hn_pktbuf_len, 0,
5012 "Temporary channel packet buffer length");
5017 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5018 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5019 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5020 #if __FreeBSD_version < 1100095
5021 hn_rx_stat_int_sysctl,
5023 hn_rx_stat_u64_sysctl,
5025 "LU", "LRO queued");
5026 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5027 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5028 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5029 #if __FreeBSD_version < 1100095
5030 hn_rx_stat_int_sysctl,
5032 hn_rx_stat_u64_sysctl,
5034 "LU", "LRO flushed");
5035 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5036 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5037 __offsetof(struct hn_rx_ring, hn_lro_tried),
5038 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5039 #if __FreeBSD_version >= 1100099
5040 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5041 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5042 hn_lro_lenlim_sysctl, "IU",
5043 "Max # of data bytes to be aggregated by LRO");
5044 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5045 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5046 hn_lro_ackcnt_sysctl, "I",
5047 "Max # of ACKs to be aggregated by LRO");
5049 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5050 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5051 hn_trust_hcsum_sysctl, "I",
5052 "Trust tcp segement verification on host side, "
5053 "when csum info is missing");
5054 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5055 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5056 hn_trust_hcsum_sysctl, "I",
5057 "Trust udp datagram verification on host side, "
5058 "when csum info is missing");
5059 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5060 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5061 hn_trust_hcsum_sysctl, "I",
5062 "Trust ip packet verification on host side, "
5063 "when csum info is missing");
5064 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5065 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5066 __offsetof(struct hn_rx_ring, hn_csum_ip),
5067 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5068 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5069 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5070 __offsetof(struct hn_rx_ring, hn_csum_tcp),
5071 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5072 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5073 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5074 __offsetof(struct hn_rx_ring, hn_csum_udp),
5075 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5076 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5077 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5078 __offsetof(struct hn_rx_ring, hn_csum_trusted),
5079 hn_rx_stat_ulong_sysctl, "LU",
5080 "# of packets that we trust host's csum verification");
5081 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5082 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5083 __offsetof(struct hn_rx_ring, hn_small_pkts),
5084 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5085 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5086 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5087 __offsetof(struct hn_rx_ring, hn_ack_failed),
5088 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5089 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5090 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5091 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5092 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
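/*
 * Usage note (illustrative): the nodes created above live under
 * dev.hn.UNIT, so from userland one may read, e.g.:
 *
 *	sysctl dev.hn.0.rx_ring_cnt
 *	sysctl dev.hn.0.rx.0.packets
 */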
5098 hn_destroy_rx_data(struct hn_softc *sc)
5102 if (sc->hn_rxbuf != NULL) {
5103 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5104 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5106 device_printf(sc->hn_dev, "RXBUF is referenced\n");
5107 sc->hn_rxbuf = NULL;
5110 if (sc->hn_rx_ring_cnt == 0)
5113 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5114 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5116 if (rxr->hn_br == NULL)
5118 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5119 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5121 device_printf(sc->hn_dev,
5122 "%dth channel bufring is referenced", i);
5126 #if defined(INET) || defined(INET6)
5127 tcp_lro_free(&rxr->hn_lro);
5129 free(rxr->hn_pktbuf, M_DEVBUF);
5131 free(sc->hn_rx_ring, M_DEVBUF);
5132 sc->hn_rx_ring = NULL;
5134 sc->hn_rx_ring_cnt = 0;
5135 sc->hn_rx_ring_inuse = 0;
5139 hn_tx_ring_create(struct hn_softc *sc, int id)
5141 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5142 device_t dev = sc->hn_dev;
5143 bus_dma_tag_t parent_dtag;
5147 txr->hn_tx_idx = id;
5149 #ifndef HN_USE_TXDESC_BUFRING
5150 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5152 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5154 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5155 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5156 M_DEVBUF, M_WAITOK | M_ZERO);
5157 #ifndef HN_USE_TXDESC_BUFRING
5158 SLIST_INIT(&txr->hn_txlist);
5160 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5161 M_WAITOK, &txr->hn_tx_lock);
5164 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5165 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5166 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5168 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5171 #ifdef HN_IFSTART_SUPPORT
5172 if (hn_use_if_start) {
5173 txr->hn_txeof = hn_start_txeof;
5174 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5175 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5181 txr->hn_txeof = hn_xmit_txeof;
5182 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5183 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5185 br_depth = hn_get_txswq_depth(txr);
5186 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5187 M_WAITOK, &txr->hn_tx_lock);
5190 txr->hn_direct_tx_size = hn_direct_tx_size;
5193 * Always schedule transmission instead of trying to do direct
5194 * transmission. This one gives the best performance so far.
5196 txr->hn_sched_tx = 1;
5198 parent_dtag = bus_get_dma_tag(dev);
5200 /* DMA tag for RNDIS packet messages. */
5201 error = bus_dma_tag_create(parent_dtag, /* parent */
5202 HN_RNDIS_PKT_ALIGN, /* alignment */
5203 HN_RNDIS_PKT_BOUNDARY, /* boundary */
5204 BUS_SPACE_MAXADDR, /* lowaddr */
5205 BUS_SPACE_MAXADDR, /* highaddr */
5206 NULL, NULL, /* filter, filterarg */
5207 HN_RNDIS_PKT_LEN, /* maxsize */
5209 HN_RNDIS_PKT_LEN, /* maxsegsize */
5211 NULL, /* lockfunc */
5212 NULL, /* lockfuncarg */
5213 &txr->hn_tx_rndis_dtag);
5215 device_printf(dev, "failed to create rndis dmatag\n");
5219 /* DMA tag for data. */
5220 error = bus_dma_tag_create(parent_dtag, /* parent */
5222 HN_TX_DATA_BOUNDARY, /* boundary */
5223 BUS_SPACE_MAXADDR, /* lowaddr */
5224 BUS_SPACE_MAXADDR, /* highaddr */
5225 NULL, NULL, /* filter, filterarg */
5226 HN_TX_DATA_MAXSIZE, /* maxsize */
5227 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
5228 HN_TX_DATA_SEGSIZE, /* maxsegsize */
5230 NULL, /* lockfunc */
5231 NULL, /* lockfuncarg */
5232 &txr->hn_tx_data_dtag);
5234 device_printf(dev, "failed to create data dmatag\n");
5238 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5239 struct hn_txdesc *txd = &txr->hn_txdesc[i];
5242 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5243 STAILQ_INIT(&txd->agg_list);
5246 * Allocate and load RNDIS packet message.
5248 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5249 (void **)&txd->rndis_pkt,
5250 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5251 &txd->rndis_pkt_dmap);
5254 "failed to allocate rndis_packet_msg, %d\n", i);
5258 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5259 txd->rndis_pkt_dmap,
5260 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5261 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5265 "failed to load rndis_packet_msg, %d\n", i);
5266 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5267 txd->rndis_pkt, txd->rndis_pkt_dmap);
5271 /* DMA map for TX data. */
5272 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5276 "failed to allocate tx data dmamap\n");
5277 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5278 txd->rndis_pkt_dmap);
5279 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5280 txd->rndis_pkt, txd->rndis_pkt_dmap);
5284 		/* All set; put it on the list. */
5285 txd->flags |= HN_TXD_FLAG_ONLIST;
5286 #ifndef HN_USE_TXDESC_BUFRING
5287 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5289 buf_ring_enqueue(txr->hn_txdesc_br, txd);
5292 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5294 if (sc->hn_tx_sysctl_tree != NULL) {
5295 struct sysctl_oid_list *child;
5296 struct sysctl_ctx_list *ctx;
5300 * Create per TX ring sysctl tree:
5301 * dev.hn.UNIT.tx.RINGID
5303 ctx = device_get_sysctl_ctx(dev);
5304 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5306 snprintf(name, sizeof(name), "%d", id);
5307 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5308 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5310 if (txr->hn_tx_sysctl_tree != NULL) {
5311 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5314 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5315 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5316 "# of available TX descs");
5318 #ifdef HN_IFSTART_SUPPORT
5319 if (!hn_use_if_start)
5322 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5323 CTLFLAG_RD, &txr->hn_oactive, 0,
5326 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5327 CTLFLAG_RW, &txr->hn_pkts,
5328 "# of packets transmitted");
5329 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5330 CTLFLAG_RW, &txr->hn_sends, "# of sends");
5338 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5340 struct hn_tx_ring *txr = txd->txr;
5342 KASSERT(txd->m == NULL, ("still has mbuf installed"));
5343 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5345 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5346 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5347 txd->rndis_pkt_dmap);
5348 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5352 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5355 KASSERT(txd->refs == 0 || txd->refs == 1,
5356 ("invalid txd refs %d", txd->refs));
5358 /* Aggregated txds will be freed by their aggregating txd. */
5359 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5362 freed = hn_txdesc_put(txr, txd);
5363 KASSERT(freed, ("can't free txdesc"));
5368 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5372 if (txr->hn_txdesc == NULL)
5377 * Because the freeing of aggregated txds will be deferred
5378 * to the aggregating txd, two passes are used here:
5379 * - The first pass GCes any pending txds. This GC is necessary,
5380 	 *   since, if the channels are revoked, the hypervisor will not
5381 * deliver send-done for all pending txds.
5382 	 * - The second pass frees the busdma resources, i.e. after all txds
5385 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5386 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5387 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5388 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5390 if (txr->hn_tx_data_dtag != NULL)
5391 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5392 if (txr->hn_tx_rndis_dtag != NULL)
5393 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5395 #ifdef HN_USE_TXDESC_BUFRING
5396 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5399 free(txr->hn_txdesc, M_DEVBUF);
5400 txr->hn_txdesc = NULL;
5402 if (txr->hn_mbuf_br != NULL)
5403 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5405 #ifndef HN_USE_TXDESC_BUFRING
5406 mtx_destroy(&txr->hn_txlist_spin);
5408 mtx_destroy(&txr->hn_tx_lock);
5412 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5414 struct sysctl_oid_list *child;
5415 struct sysctl_ctx_list *ctx;
5419 * Create TXBUF for chimney sending.
5421 * NOTE: It is shared by all channels.
5423 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5424 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5425 BUS_DMA_WAITOK | BUS_DMA_ZERO);
5426 if (sc->hn_chim == NULL) {
5427 device_printf(sc->hn_dev, "allocate txbuf failed\n");
5431 sc->hn_tx_ring_cnt = ring_cnt;
5432 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5434 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5435 M_DEVBUF, M_WAITOK | M_ZERO);
5437 ctx = device_get_sysctl_ctx(sc->hn_dev);
5438 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5440 /* Create dev.hn.UNIT.tx sysctl tree */
5441 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5442 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5444 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5447 error = hn_tx_ring_create(sc, i);
5452 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5453 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5454 __offsetof(struct hn_tx_ring, hn_no_txdescs),
5455 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5456 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5457 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5458 __offsetof(struct hn_tx_ring, hn_send_failed),
5459 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5460 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5461 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5462 __offsetof(struct hn_tx_ring, hn_txdma_failed),
5463 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5464 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5465 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5466 __offsetof(struct hn_tx_ring, hn_flush_failed),
5467 hn_tx_stat_ulong_sysctl, "LU",
5468 "# of packet transmission aggregation flush failure");
5469 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5470 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5471 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5472 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5473 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5474 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5475 __offsetof(struct hn_tx_ring, hn_tx_chimney),
5476 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5477 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5478 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5479 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5480 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5481 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5482 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5483 "# of total TX descs");
5484 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5485 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5486 "Chimney send packet size upper boundary");
5487 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5488 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5489 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5490 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5491 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5492 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5493 hn_tx_conf_int_sysctl, "I",
5494 "Size of the packet for direct transmission");
5495 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5496 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5497 __offsetof(struct hn_tx_ring, hn_sched_tx),
5498 hn_tx_conf_int_sysctl, "I",
5499 "Always schedule transmission "
5500 "instead of doing direct transmission");
5501 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5502 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5503 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5504 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5505 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5506 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5507 "Applied packet transmission aggregation size");
5508 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5509 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5510 hn_txagg_pktmax_sysctl, "I",
5511 "Applied packet transmission aggregation packets");
5512 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5513 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5514 hn_txagg_align_sysctl, "I",
5515 "Applied packet transmission aggregation alignment");
5521 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5525 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5526 sc->hn_tx_ring[i].hn_chim_size = chim_size;
5530 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5532 struct ifnet *ifp = sc->hn_ifp;
5538 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5541 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5542 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5543 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5545 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5546 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5547 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5549 if (tso_maxlen < tso_minlen)
5550 tso_maxlen = tso_minlen;
5551 else if (tso_maxlen > IP_MAXPACKET)
5552 tso_maxlen = IP_MAXPACKET;
5553 if (tso_maxlen > sc->hn_ndis_tso_szmax)
5554 tso_maxlen = sc->hn_ndis_tso_szmax;
5555 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5557 if (hn_xpnt_vf_isready(sc)) {
5558 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5559 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5561 ifp->if_hw_tsomax = hw_tsomax;
5563 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5567 hn_fixup_tx_data(struct hn_softc *sc)
5569 uint64_t csum_assist;
5572 hn_set_chim_size(sc, sc->hn_chim_szmax);
5573 if (hn_tx_chimney_size > 0 &&
5574 hn_tx_chimney_size < sc->hn_chim_szmax)
5575 hn_set_chim_size(sc, hn_tx_chimney_size);
5578 if (sc->hn_caps & HN_CAP_IPCS)
5579 csum_assist |= CSUM_IP;
5580 if (sc->hn_caps & HN_CAP_TCP4CS)
5581 csum_assist |= CSUM_IP_TCP;
5582 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5583 csum_assist |= CSUM_IP_UDP;
5584 if (sc->hn_caps & HN_CAP_TCP6CS)
5585 csum_assist |= CSUM_IP6_TCP;
5586 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5587 csum_assist |= CSUM_IP6_UDP;
5588 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5589 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5591 if (sc->hn_caps & HN_CAP_HASHVAL) {
5593 * Support HASHVAL pktinfo on TX path.
5596 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5597 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5598 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5603 hn_fixup_rx_data(struct hn_softc *sc)
5606 if (sc->hn_caps & HN_CAP_UDPHASH) {
5609 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5610 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5615 hn_destroy_tx_data(struct hn_softc *sc)
5619 if (sc->hn_chim != NULL) {
5620 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5621 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5623 device_printf(sc->hn_dev,
5624 "chimney sending buffer is referenced");
5629 if (sc->hn_tx_ring_cnt == 0)
5632 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5633 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5635 free(sc->hn_tx_ring, M_DEVBUF);
5636 sc->hn_tx_ring = NULL;
5638 sc->hn_tx_ring_cnt = 0;
5639 sc->hn_tx_ring_inuse = 0;
5642 #ifdef HN_IFSTART_SUPPORT
5645 hn_start_taskfunc(void *xtxr, int pending __unused)
5647 struct hn_tx_ring *txr = xtxr;
5649 mtx_lock(&txr->hn_tx_lock);
5650 hn_start_locked(txr, 0);
5651 mtx_unlock(&txr->hn_tx_lock);
5655 hn_start_locked(struct hn_tx_ring *txr, int len)
5657 struct hn_softc *sc = txr->hn_sc;
5658 struct ifnet *ifp = sc->hn_ifp;
5661 KASSERT(hn_use_if_start,
5662 ("hn_start_locked is called, when if_start is disabled"));
5663 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5664 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5665 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5667 if (__predict_false(txr->hn_suspended))
5670 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5674 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5675 struct hn_txdesc *txd;
5676 struct mbuf *m_head;
5679 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5683 if (len > 0 && m_head->m_pkthdr.len > len) {
5685 			 * This sending could be time-consuming; let callers
5686 			 * dispatch this packet sending (and sending of any
5687 			 * follow-up packets) to the tx taskqueue.
5689 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5694 #if defined(INET6) || defined(INET)
5695 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5696 m_head = hn_tso_fixup(m_head);
5697 if (__predict_false(m_head == NULL)) {
5698 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5701 } else if (m_head->m_pkthdr.csum_flags &
5702 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5703 m_head = hn_set_hlen(m_head);
5704 if (__predict_false(m_head == NULL)) {
5705 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5711 txd = hn_txdesc_get(txr);
5713 txr->hn_no_txdescs++;
5714 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5715 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5719 error = hn_encap(ifp, txr, txd, &m_head);
5721 /* Both txd and m_head are freed */
5722 KASSERT(txr->hn_agg_txd == NULL,
5723 ("encap failed w/ pending aggregating txdesc"));
5727 if (txr->hn_agg_pktleft == 0) {
5728 if (txr->hn_agg_txd != NULL) {
5729 KASSERT(m_head == NULL,
5730 ("pending mbuf for aggregating txdesc"));
5731 error = hn_flush_txagg(ifp, txr);
5732 if (__predict_false(error)) {
5733 atomic_set_int(&ifp->if_drv_flags,
5738 KASSERT(m_head != NULL, ("mbuf was freed"));
5739 error = hn_txpkt(ifp, txr, txd);
5740 if (__predict_false(error)) {
5741 /* txd is freed, but m_head is not */
5742 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5743 atomic_set_int(&ifp->if_drv_flags,
5751 KASSERT(txr->hn_agg_txd != NULL,
5752 ("no aggregating txdesc"));
5753 KASSERT(m_head == NULL,
5754 ("pending mbuf for aggregating txdesc"));
5759 	/* Flush pending aggregated transmission. */
5760 if (txr->hn_agg_txd != NULL)
5761 hn_flush_txagg(ifp, txr);
5766 hn_start(struct ifnet *ifp)
5768 struct hn_softc *sc = ifp->if_softc;
5769 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5771 if (txr->hn_sched_tx)
5774 if (mtx_trylock(&txr->hn_tx_lock)) {
5777 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5778 mtx_unlock(&txr->hn_tx_lock);
5783 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5787 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5789 struct hn_tx_ring *txr = xtxr;
5791 mtx_lock(&txr->hn_tx_lock);
5792 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5793 hn_start_locked(txr, 0);
5794 mtx_unlock(&txr->hn_tx_lock);
5798 hn_start_txeof(struct hn_tx_ring *txr)
5800 struct hn_softc *sc = txr->hn_sc;
5801 struct ifnet *ifp = sc->hn_ifp;
5803 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5805 if (txr->hn_sched_tx)
5808 if (mtx_trylock(&txr->hn_tx_lock)) {
5811 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5812 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5813 mtx_unlock(&txr->hn_tx_lock);
5815 taskqueue_enqueue(txr->hn_tx_taskq,
5821 		 * Release the OACTIVE earlier, in the hope that
5822 		 * others can catch up.  The task will clear the
5823 * flag again with the hn_tx_lock to avoid possible
5826 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5827 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5831 #endif /* HN_IFSTART_SUPPORT */
5834 hn_xmit(struct hn_tx_ring *txr, int len)
5836 struct hn_softc *sc = txr->hn_sc;
5837 struct ifnet *ifp = sc->hn_ifp;
5838 struct mbuf *m_head;
5841 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5842 #ifdef HN_IFSTART_SUPPORT
5843 KASSERT(hn_use_if_start == 0,
5844 ("hn_xmit is called, when if_start is enabled"));
5846 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5848 if (__predict_false(txr->hn_suspended))
5851 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5854 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5855 struct hn_txdesc *txd;
5858 if (len > 0 && m_head->m_pkthdr.len > len) {
5860 			 * This sending could be time-consuming; let callers
5861 			 * dispatch this packet sending (and sending of any
5862 			 * follow-up packets) to the tx taskqueue.
5864 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5869 txd = hn_txdesc_get(txr);
5871 txr->hn_no_txdescs++;
5872 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5873 txr->hn_oactive = 1;
5877 error = hn_encap(ifp, txr, txd, &m_head);
5879 /* Both txd and m_head are freed; discard */
5880 KASSERT(txr->hn_agg_txd == NULL,
5881 ("encap failed w/ pending aggregating txdesc"));
5882 drbr_advance(ifp, txr->hn_mbuf_br);
5886 if (txr->hn_agg_pktleft == 0) {
5887 if (txr->hn_agg_txd != NULL) {
5888 KASSERT(m_head == NULL,
5889 ("pending mbuf for aggregating txdesc"));
5890 error = hn_flush_txagg(ifp, txr);
5891 if (__predict_false(error)) {
5892 txr->hn_oactive = 1;
5896 KASSERT(m_head != NULL, ("mbuf was freed"));
5897 error = hn_txpkt(ifp, txr, txd);
5898 if (__predict_false(error)) {
5899 /* txd is freed, but m_head is not */
5900 drbr_putback(ifp, txr->hn_mbuf_br,
5902 txr->hn_oactive = 1;
5909 KASSERT(txr->hn_agg_txd != NULL,
5910 ("no aggregating txdesc"));
5911 KASSERT(m_head == NULL,
5912 ("pending mbuf for aggregating txdesc"));
5917 drbr_advance(ifp, txr->hn_mbuf_br);
5920 	/* Flush pending aggregated transmission. */
5921 if (txr->hn_agg_txd != NULL)
5922 hn_flush_txagg(ifp, txr);
5927 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5929 struct hn_softc *sc = ifp->if_softc;
5930 struct hn_tx_ring *txr;
5933 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5934 struct rm_priotracker pt;
5936 rm_rlock(&sc->hn_vf_lock, &pt);
5937 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5938 struct mbuf *m_bpf = NULL;
5941 obytes = m->m_pkthdr.len;
5942 if (m->m_flags & M_MCAST)
5945 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5946 if (bpf_peers_present(ifp->if_bpf)) {
5947 m_bpf = m_copypacket(m, M_NOWAIT);
5948 if (m_bpf == NULL) {
5950 					 * Failed to grab a shallow copy; tap now.
5953 ETHER_BPF_MTAP(ifp, m);
5957 ETHER_BPF_MTAP(ifp, m);
5960 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5961 rm_runlock(&sc->hn_vf_lock, &pt);
5963 if (m_bpf != NULL) {
5965 ETHER_BPF_MTAP(ifp, m_bpf);
5969 if (error == ENOBUFS) {
5970 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5972 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5974 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5975 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5977 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5983 rm_runlock(&sc->hn_vf_lock, &pt);
5986 #if defined(INET6) || defined(INET)
5988 * Perform TSO packet header fixup or get l2/l3 header length now,
5989 * since packet headers should be cache-hot.
5991 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5992 m = hn_tso_fixup(m);
5993 if (__predict_false(m == NULL)) {
5994 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5997 } else if (m->m_pkthdr.csum_flags &
5998 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6000 if (__predict_false(m == NULL)) {
6001 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6008 * Select the TX ring based on flowid
6010 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6011 #if defined(INET6) || defined(INET)
6014 if (m->m_pkthdr.len < 128 &&
6015 (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6016 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6017 m = hn_check_tcpsyn(m, &tcpsyn);
6018 if (__predict_false(m == NULL)) {
6020 IFCOUNTER_OERRORS, 1);
6025 const int tcpsyn = 0;
6030 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6032 txr = &sc->hn_tx_ring[idx];
6034 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6036 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6040 if (txr->hn_oactive)
6043 if (txr->hn_sched_tx)
6046 if (mtx_trylock(&txr->hn_tx_lock)) {
6049 sched = hn_xmit(txr, txr->hn_direct_tx_size);
6050 mtx_unlock(&txr->hn_tx_lock);
6055 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
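/*
 * Illustrative note: with hn_tx_ring_inuse == 4, an mbuf whose RSS
 * flowid is 0x1234abcd is steered to ring 0x1234abcd % 4 == 1, so all
 * packets of a flow stay ordered on one TX ring and channel.
 */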
6060 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6064 mtx_lock(&txr->hn_tx_lock);
6065 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6067 mtx_unlock(&txr->hn_tx_lock);
6071 hn_xmit_qflush(struct ifnet *ifp)
6073 struct hn_softc *sc = ifp->if_softc;
6074 struct rm_priotracker pt;
6077 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6078 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6081 rm_rlock(&sc->hn_vf_lock, &pt);
6082 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6083 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6084 rm_runlock(&sc->hn_vf_lock, &pt);
6088 hn_xmit_txeof(struct hn_tx_ring *txr)
6091 if (txr->hn_sched_tx)
6094 if (mtx_trylock(&txr->hn_tx_lock)) {
6097 txr->hn_oactive = 0;
6098 sched = hn_xmit(txr, txr->hn_direct_tx_size);
6099 mtx_unlock(&txr->hn_tx_lock);
6101 taskqueue_enqueue(txr->hn_tx_taskq,
6107 		 * Release the oactive earlier, in the hope that
6108 		 * others can catch up.  The task will clear the
6109 * oactive again with the hn_tx_lock to avoid possible
6112 txr->hn_oactive = 0;
6113 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6118 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6120 struct hn_tx_ring *txr = xtxr;
6122 mtx_lock(&txr->hn_tx_lock);
6124 mtx_unlock(&txr->hn_tx_lock);
6128 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6130 struct hn_tx_ring *txr = xtxr;
6132 mtx_lock(&txr->hn_tx_lock);
6133 txr->hn_oactive = 0;
6135 mtx_unlock(&txr->hn_tx_lock);
6139 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6141 struct vmbus_chan_br cbr;
6142 struct hn_rx_ring *rxr;
6143 struct hn_tx_ring *txr = NULL;
6146 idx = vmbus_chan_subidx(chan);
6149 * Link this channel to RX/TX ring.
6151 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6152 ("invalid channel index %d, should > 0 && < %d",
6153 idx, sc->hn_rx_ring_inuse));
6154 rxr = &sc->hn_rx_ring[idx];
6155 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6156 ("RX ring %d already attached", idx));
6157 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6158 rxr->hn_chan = chan;
6161 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6162 idx, vmbus_chan_id(chan));
6165 if (idx < sc->hn_tx_ring_inuse) {
6166 txr = &sc->hn_tx_ring[idx];
6167 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6168 ("TX ring %d already attached", idx));
6169 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6171 txr->hn_chan = chan;
6173 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6174 idx, vmbus_chan_id(chan));
6178 /* Bind this channel to a proper CPU. */
6179 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6184 cbr.cbr = rxr->hn_br;
6185 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6186 cbr.cbr_txsz = HN_TXBR_SIZE;
6187 cbr.cbr_rxsz = HN_RXBR_SIZE;
6188 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6190 if (error == EISCONN) {
6191 if_printf(sc->hn_ifp, "bufring is connected after "
6192 "chan%u open failure\n", vmbus_chan_id(chan));
6193 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6195 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6196 vmbus_chan_id(chan), error);
6203 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6205 struct hn_rx_ring *rxr;
6208 idx = vmbus_chan_subidx(chan);
6211 * Link this channel to RX/TX ring.
6213 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6214 ("invalid channel index %d, should > 0 && < %d",
6215 idx, sc->hn_rx_ring_inuse));
6216 rxr = &sc->hn_rx_ring[idx];
6217 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6218 ("RX ring %d is not attached", idx));
6219 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6221 if (idx < sc->hn_tx_ring_inuse) {
6222 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6224 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6225 ("TX ring %d is not attached attached", idx));
6226 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6230 * Close this channel.
6233 * Channel closing does _not_ destroy the target channel.
6235 error = vmbus_chan_close_direct(chan);
6236 if (error == EISCONN) {
6237 if_printf(sc->hn_ifp, "chan%u bufring is connected "
6238 "after being closed\n", vmbus_chan_id(chan));
6239 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6241 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6242 vmbus_chan_id(chan), error);
6247 hn_attach_subchans(struct hn_softc *sc)
6249 struct vmbus_channel **subchans;
6250 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6253 KASSERT(subchan_cnt > 0, ("no sub-channels"));
6255 /* Attach the sub-channels. */
6256 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6257 for (i = 0; i < subchan_cnt; ++i) {
6260 error1 = hn_chan_attach(sc, subchans[i]);
6263 /* Move on; all channels will be detached later. */
6266 vmbus_subchan_rel(subchans, subchan_cnt);
6269 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6272 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6280 hn_detach_allchans(struct hn_softc *sc)
6282 struct vmbus_channel **subchans;
6283 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6286 if (subchan_cnt == 0)
6289 /* Detach the sub-channels. */
6290 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6291 for (i = 0; i < subchan_cnt; ++i)
6292 hn_chan_detach(sc, subchans[i]);
6293 vmbus_subchan_rel(subchans, subchan_cnt);
6297 * Detach the primary channel, _after_ all sub-channels
6300 hn_chan_detach(sc, sc->hn_prichan);
6302 /* Wait for sub-channels to be destroyed, if any. */
6303 vmbus_subchan_drain(sc->hn_prichan);
6306 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6307 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6308 HN_RX_FLAG_ATTACHED) == 0,
6309 ("%dth RX ring is still attached", i));
6311 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6312 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6313 HN_TX_FLAG_ATTACHED) == 0,
6314 ("%dth TX ring is still attached", i));
6320 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6322 struct vmbus_channel **subchans;
6323 int nchan, rxr_cnt, error;
6325 nchan = *nsubch + 1;
6328 * Multiple RX/TX rings are not requested.
6335 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6338 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6340 /* No RSS; this is benign. */
6345 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6349 if (nchan > rxr_cnt)
6352 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6358 * Allocate sub-channels from NVS.
6360 *nsubch = nchan - 1;
6361 error = hn_nvs_alloc_subchans(sc, nsubch);
6362 if (error || *nsubch == 0) {
6363 /* Failed to allocate sub-channels. */
6369 * Wait for all sub-channels to become ready before moving on.
6371 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6372 vmbus_subchan_rel(subchans, *nsubch);
6377 hn_synth_attachable(const struct hn_softc *sc)
6381 if (sc->hn_flags & HN_FLAG_ERRORS)
6384 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6385 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6387 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6394 * Make sure that the RX filter is zero after the successful
6395 * RNDIS initialization.
6398 * Under certain conditions on certain versions of Hyper-V,
6399 * the RNDIS rxfilter is _not_ zero on the hypervisor side
6400 * after the successful RNDIS initialization, which breaks
6401 * the assumption of any following code (well, it breaks the
6402 * RNDIS API contract actually). Clear the RNDIS rxfilter
6403 * explicitly, drain packets sneaking through, and drain the
6404 * interrupt taskqueues scheduled due to the stealth packets.
6407 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6411 hn_drain_rxtx(sc, nchan);
6415 hn_synth_attach(struct hn_softc *sc, int mtu)
6417 #define ATTACHED_NVS 0x0002
6418 #define ATTACHED_RNDIS 0x0004
6420 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6421 int error, nsubch, nchan = 1, i, rndis_inited;
6422 uint32_t old_caps, attached = 0;
6424 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6425 ("synthetic parts were attached"));
6427 if (!hn_synth_attachable(sc))
6430 /* Save capabilities for later verification. */
6431 old_caps = sc->hn_caps;
6434 /* Clear RSS stuffs. */
6435 sc->hn_rss_ind_size = 0;
6436 sc->hn_rss_hash = 0;
6437 sc->hn_rss_hcap = 0;
6440 * Attach the primary channel _before_ attaching NVS and RNDIS.
6442 error = hn_chan_attach(sc, sc->hn_prichan);
6449 error = hn_nvs_attach(sc, mtu);
6452 attached |= ATTACHED_NVS;
6455 * Attach RNDIS _after_ NVS is attached.
6457 error = hn_rndis_attach(sc, mtu, &rndis_inited);
6459 attached |= ATTACHED_RNDIS;
6464 * Make sure capabilities are not changed.
6466 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6467 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6468 old_caps, sc->hn_caps);
6474 * Allocate sub-channels for multi-TX/RX rings.
6477 * The # of RX rings that can be used is equivalent to the # of
6478 * channels to be requested.
6480 nsubch = sc->hn_rx_ring_cnt - 1;
6481 error = hn_synth_alloc_subchans(sc, &nsubch);
6484 /* NOTE: _Full_ synthetic parts detach is required now. */
6485 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6488 * Set the # of TX/RX rings that could be used according to
6489 * the # of channels that NVS offered.
6492 hn_set_ring_inuse(sc, nchan);
6494 /* Only the primary channel can be used; done */
6499 * Attach the sub-channels.
6501 * NOTE: hn_set_ring_inuse() _must_ have been called.
6503 error = hn_attach_subchans(sc);
6508 * Configure RSS key and indirect table _after_ all sub-channels
6511 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6513 * RSS key is not set yet; set it to the default RSS key.
6516 if_printf(sc->hn_ifp, "setup default RSS key\n");
6517 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6518 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6521 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6523 * RSS indirect table is not set yet; set it up in round-
6527 if_printf(sc->hn_ifp, "setup default RSS indirect "
6530 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
6531 rss->rss_ind[i] = i % nchan;
6532 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6535 		 * # of usable channels may have changed, so we have to
6536 		 * make sure that all entries in the RSS indirect table
6539 * NOTE: hn_set_ring_inuse() _must_ have been called.
6541 hn_rss_ind_fixup(sc);
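/*
 * Illustrative example: with nchan == 3 the default indirect table
 * built above reads 0, 1, 2, 0, 1, 2, ... across all NDIS_HASH_INDCNT
 * entries, spreading RX flows round-robin over the usable channels.
 */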
6544 sc->hn_rss_hash = sc->hn_rss_hcap;
6545 if ((sc->hn_flags & HN_FLAG_RXVF) ||
6546 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6547 		/* NOTE: Don't reconfigure RSS; it is done immediately below. */
6548 hn_vf_rss_fixup(sc, false);
6550 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6555 * Fixup transmission aggregation setup.
6558 hn_rndis_init_fixat(sc, nchan);
6562 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6563 hn_rndis_init_fixat(sc, nchan);
6564 hn_synth_detach(sc);
6566 if (attached & ATTACHED_RNDIS) {
6567 hn_rndis_init_fixat(sc, nchan);
6568 hn_rndis_detach(sc);
6570 if (attached & ATTACHED_NVS)
6572 hn_chan_detach(sc, sc->hn_prichan);
6573 /* Restore old capabilities. */
6574 sc->hn_caps = old_caps;
6578 #undef ATTACHED_RNDIS
6584  * The interface must have been suspended through hn_suspend() before
6585  * this function gets called.
6588 hn_synth_detach(struct hn_softc *sc)
6591 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6592 ("synthetic parts were not attached"));
6594 /* Detach the RNDIS first. */
6595 hn_rndis_detach(sc);
6600 /* Detach all of the channels. */
6601 hn_detach_allchans(sc);
6603 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6607 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6609 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6610 ("invalid ring count %d", ring_cnt));
6612 if (sc->hn_tx_ring_cnt > ring_cnt)
6613 sc->hn_tx_ring_inuse = ring_cnt;
6615 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6616 sc->hn_rx_ring_inuse = ring_cnt;
6619 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6620 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
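/*
 * Illustrative example: with 8 RX rings and 4 TX rings created,
 * hn_set_ring_inuse(sc, 6) puts 6 RX rings and 4 TX rings in use;
 * the TX side never exceeds the number of rings actually created.
 */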
6625 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6630 	 * The TX bufring will not be drained by the hypervisor
6631 	 * if the primary channel is revoked.
6633 while (!vmbus_chan_rx_empty(chan) ||
6634 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6635 !vmbus_chan_tx_empty(chan)))
6637 vmbus_chan_intr_drain(chan);
6641 hn_disable_rx(struct hn_softc *sc)
6645 * Disable RX by clearing RX filter forcefully.
6647 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6648 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6651 * Give RNDIS enough time to flush all pending data packets.
6653 pause("waitrx", (200 * hz) / 1000);
6658 * RX/TX _must_ have been suspended/disabled, before this function
6662 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6664 struct vmbus_channel **subch = NULL;
6668 * Drain RX/TX bufrings and interrupts.
6672 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6674 if (subch != NULL) {
6677 for (i = 0; i < nsubch; ++i)
6678 hn_chan_drain(sc, subch[i]);
6680 hn_chan_drain(sc, sc->hn_prichan);
6683 vmbus_subchan_rel(subch, nsubch);
6687 hn_suspend_data(struct hn_softc *sc)
6689 struct hn_tx_ring *txr;
6697 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6698 txr = &sc->hn_tx_ring[i];
6700 mtx_lock(&txr->hn_tx_lock);
6701 txr->hn_suspended = 1;
6702 mtx_unlock(&txr->hn_tx_lock);
6703 		/* No one is able to send more packets now. */
6706 * Wait for all pending sends to finish.
6709 		 * We will _not_ receive all pending send-done if the
6710 		 * primary channel is revoked.
6712 while (hn_tx_ring_pending(txr) &&
6713 !vmbus_chan_is_revoked(sc->hn_prichan))
6714 pause("hnwtx", 1 /* 1 tick */);
6725 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6728 * Drain any pending TX tasks.
6731 	 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6732 	 * tasks must be drained _after_ it.
6734 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6735 txr = &sc->hn_tx_ring[i];
6737 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6738 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6743 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6746 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6750 hn_suspend_mgmt(struct hn_softc *sc)
6757 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6758 * through hn_mgmt_taskq.
6760 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6761 vmbus_chan_run_task(sc->hn_prichan, &task);
6764 * Make sure that all pending management tasks are completed.
6766 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6767 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6768 taskqueue_drain_all(sc->hn_mgmt_taskq0);
6772 hn_suspend(struct hn_softc *sc)
6775 /* Disable polling. */
6779 * If the non-transparent mode VF is activated, the synthetic
6780 * device is receiving packets, so the data path of the
6781 * synthetic device must be suspended.
6783 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6784 (sc->hn_flags & HN_FLAG_RXVF))
6785 hn_suspend_data(sc);
6786 hn_suspend_mgmt(sc);
6790 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6794 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6795 ("invalid TX ring count %d", tx_ring_cnt));
6797 for (i = 0; i < tx_ring_cnt; ++i) {
6798 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6800 mtx_lock(&txr->hn_tx_lock);
6801 txr->hn_suspended = 0;
6802 mtx_unlock(&txr->hn_tx_lock);
6807 hn_resume_data(struct hn_softc *sc)
6816 hn_rxfilter_config(sc);
6819 * Make sure to clear suspend status on "all" TX rings,
6820 * since hn_tx_ring_inuse can be changed after
6821 * hn_suspend_data().
6823 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6825 #ifdef HN_IFSTART_SUPPORT
6826 if (!hn_use_if_start)
6830 * Flush unused drbrs, since hn_tx_ring_inuse may be
6833 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6834 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6840 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6841 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6844 * Use txeof task, so that any pending oactive can be
6847 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6852 hn_resume_mgmt(struct hn_softc *sc)
6855 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6858 * Kick off network change detection, if it was pending.
6859 * If no network change was pending, start link status
6860 * checks, which is more lightweight than network change
6863 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6864 hn_change_network(sc);
6866 hn_update_link_status(sc);
6870 hn_resume(struct hn_softc *sc)
6874 * If the non-transparent mode VF is activated, the synthetic
6875 	 * device has to receive packets, so the data path of the
6876 * synthetic device must be resumed.
6878 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6879 (sc->hn_flags & HN_FLAG_RXVF))
6883 * Don't resume link status change if VF is attached/activated.
6884 * - In the non-transparent VF mode, the synthetic device marks
6885 * link down until the VF is deactivated; i.e. VF is down.
6886 * - In transparent VF mode, VF's media status is used until
6887 * the VF is detached.
6889 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6890 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6894 * Re-enable polling if this interface is running and
6895 	 * polling is requested.
6897 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6898 hn_polling(sc, sc->hn_pollhz);
6902 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6904 const struct rndis_status_msg *msg;
6907 if (dlen < sizeof(*msg)) {
6908 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6913 switch (msg->rm_status) {
6914 case RNDIS_STATUS_MEDIA_CONNECT:
6915 case RNDIS_STATUS_MEDIA_DISCONNECT:
6916 hn_update_link_status(sc);
6919 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6920 case RNDIS_STATUS_LINK_SPEED_CHANGE:
6921 /* Not really useful; ignore. */
6924 case RNDIS_STATUS_NETWORK_CHANGE:
6925 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6926 if (dlen < ofs + msg->rm_stbuflen ||
6927 msg->rm_stbuflen < sizeof(uint32_t)) {
6928 if_printf(sc->hn_ifp, "network changed\n");
6932 memcpy(&change, ((const uint8_t *)msg) + ofs,
6934 if_printf(sc->hn_ifp, "network changed, change %u\n",
6937 hn_change_network(sc);
6941 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6948 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6950 const struct rndis_pktinfo *pi = info_data;
6953 while (info_dlen != 0) {
6957 if (__predict_false(info_dlen < sizeof(*pi)))
6959 if (__predict_false(info_dlen < pi->rm_size))
6961 info_dlen -= pi->rm_size;
6963 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
6965 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
6967 dlen = pi->rm_size - pi->rm_pktinfooffset;
6970 switch (pi->rm_type) {
6971 case NDIS_PKTINFO_TYPE_VLAN:
6972 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
6974 info->vlan_info = *((const uint32_t *)data);
6975 mask |= HN_RXINFO_VLAN;
6978 case NDIS_PKTINFO_TYPE_CSUM:
6979 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
6981 info->csum_info = *((const uint32_t *)data);
6982 mask |= HN_RXINFO_CSUM;
6985 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
6986 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
6988 info->hash_value = *((const uint32_t *)data);
6989 mask |= HN_RXINFO_HASHVAL;
6992 case HN_NDIS_PKTINFO_TYPE_HASHINF:
6993 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
6995 info->hash_info = *((const uint32_t *)data);
6996 mask |= HN_RXINFO_HASHINF;
7003 if (mask == HN_RXINFO_ALL) {
7004 /* All found; done */
7008 pi = (const struct rndis_pktinfo *)
7009 ((const uint8_t *)pi + pi->rm_size);
7014 * - If there is no hash value, invalidate the hash info.
7016 if ((mask & HN_RXINFO_HASHVAL) == 0)
7017 info->hash_info = HN_NDIS_HASH_INFO_INVALID;
7021 static __inline bool
7022 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7025 if (off < check_off) {
7026 if (__predict_true(off + len <= check_off))
7028 } else if (off > check_off) {
7029 if (__predict_true(check_off + check_len <= off))
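/*
 * Illustrative example (assuming the elided branches return false for
 * disjoint ranges): hn_rndis_check_overlap(10, 4, 14, 2) is false,
 * since [10, 14) and [14, 16) merely touch, whereas
 * hn_rndis_check_overlap(10, 4, 12, 2) is true, since [10, 14)
 * covers [12, 14).
 */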
7036 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7038 const struct rndis_packet_msg *pkt;
7039 struct hn_rxinfo info;
7040 int data_off, pktinfo_off, data_len, pktinfo_len;
7045 if (__predict_false(dlen < sizeof(*pkt))) {
7046 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7051 if (__predict_false(dlen < pkt->rm_len)) {
7052 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7053 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7056 if (__predict_false(pkt->rm_len <
7057 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7058 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7059 "msglen %u, data %u, oob %u, pktinfo %u\n",
7060 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7061 pkt->rm_pktinfolen);
7064 if (__predict_false(pkt->rm_datalen == 0)) {
7065 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7072 #define IS_OFFSET_INVALID(ofs) \
7073 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
7074 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7076 /* XXX Hyper-V does not meet data offset alignment requirement */
7077 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7078 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7079 "data offset %u\n", pkt->rm_dataoffset);
7082 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7083 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7084 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7085 "oob offset %u\n", pkt->rm_oobdataoffset);
7088 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7089 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7090 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7091 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7095 #undef IS_OFFSET_INVALID
7097 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7098 data_len = pkt->rm_datalen;
7099 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7100 pktinfo_len = pkt->rm_pktinfolen;
7103 * Check OOB coverage.
7105 if (__predict_false(pkt->rm_oobdatalen != 0)) {
7106 int oob_off, oob_len;
7108 if_printf(rxr->hn_ifp, "got oobdata\n");
7109 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7110 oob_len = pkt->rm_oobdatalen;
7112 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7113 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7114 "oob overflow, msglen %u, oob abs %d len %d\n",
7115 pkt->rm_len, oob_off, oob_len);
7120 * Check against data.
7122 if (hn_rndis_check_overlap(oob_off, oob_len,
7123 data_off, data_len)) {
7124 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7125 "oob overlaps data, oob abs %d len %d, "
7126 "data abs %d len %d\n",
7127 oob_off, oob_len, data_off, data_len);
7132 * Check against pktinfo.
7134 if (pktinfo_len != 0 &&
7135 hn_rndis_check_overlap(oob_off, oob_len,
7136 pktinfo_off, pktinfo_len)) {
7137 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7138 "oob overlaps pktinfo, oob abs %d len %d, "
7139 "pktinfo abs %d len %d\n",
7140 oob_off, oob_len, pktinfo_off, pktinfo_len);
7146 * Check per-packet-info coverage and find useful per-packet-info.
7148 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7149 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7150 info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7151 if (__predict_true(pktinfo_len != 0)) {
7155 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7156 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7157 "pktinfo overflow, msglen %u, "
7158 "pktinfo abs %d len %d\n",
7159 pkt->rm_len, pktinfo_off, pktinfo_len);
7164 * Check packet info coverage.
7166 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7167 data_off, data_len);
7168 if (__predict_false(overlap)) {
7169 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7170 "pktinfo overlap data, pktinfo abs %d len %d, "
7171 "data abs %d len %d\n",
7172 pktinfo_off, pktinfo_len, data_off, data_len);
7177 * Find useful per-packet-info.
7179 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7180 pktinfo_len, &info);
7181 if (__predict_false(error)) {
7182 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7188 if (__predict_false(data_off + data_len > pkt->rm_len)) {
7189 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7190 "data overflow, msglen %u, data abs %d len %d\n",
7191 pkt->rm_len, data_off, data_len);
7194 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
static __inline void
hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_msghdr *hdr;

	if (__predict_false(dlen < sizeof(*hdr))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
		return;
	}
	hdr = data;

	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
		/* Hot data path. */
		hn_rndis_rx_data(rxr, data, dlen);
		/* Done! */
		return;
	}

	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
	else
		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
}
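
/*
 * Handle an inband NVS notification.  TX table notes are not used by
 * this driver and are silently ignored; anything else is logged.
 */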
static void
hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
{
	const struct hn_nvs_hdr *hdr;

	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
		if_printf(sc->hn_ifp, "invalid nvs notify\n");
		return;
	}
	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);

	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
		/* Useless; ignore */
		return;
	}
	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
}
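
/*
 * Handle a send-completion: the transaction id of the channel packet
 * carries the NVS send context recorded at send time, whose callback
 * completes the transaction.
 */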
static void
hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt)
{
	struct hn_nvs_sendctx *sndc;

	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
	    VMBUS_CHANPKT_DATALEN(pkt));
	/*
	 * NOTE:
	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
	 * its callback.
	 */
}
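
/*
 * Handle a RXBUF channel packet: validate the NVS/RNDIS framing, walk
 * the receive buffer ranges, each holding one RNDIS message with one
 * Ethernet frame, then ack the RXBUF back to the hypervisor.
 */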
static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

	/* Make sure that this is a RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}

	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}

	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}
		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
	}

	/*
	 * Ack the consumed RXBUF associated w/ this channel packet,
	 * so that this RXBUF can be recycled by the hypervisor.
	 */
	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}
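
/*
 * Ack a consumed RXBUF back to the hypervisor.  The ack must be sent
 * eventually, or the RXBUF is leaked; a bounded retry loop covers
 * transient TX bufring exhaustion.
 */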
static void
hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    uint64_t tid)
{
	struct hn_nvs_rndis_ack ack;
	int retries, error;

	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
	ack.nvs_status = HN_NVS_STATUS_OK;

	retries = 0;
again:
	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
	if (__predict_false(error == EAGAIN)) {
		/*
		 * NOTE:
		 * This should _not_ happen in real world, since the
		 * consumption of the TX bufring from the TX path is
		 * controlled.
		 */
		if (rxr->hn_ack_failed == 0)
			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
		rxr->hn_ack_failed++;
		retries++;
		if (retries < 10) {
			DELAY(100);
			goto again;
		}
		/* RXBUF leaks! */
		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
	}
}
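
/*
 * Per-channel receive callback: drain all pending channel packets,
 * growing the packet buffer on ENOBUFS, and dispatch each packet by
 * type (send-completion, RXBUF, or inband notification).
 */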
static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
	struct hn_rx_ring *rxr = xrxr;
	struct hn_softc *sc = rxr->hn_ifp->if_softc;

	for (;;) {
		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
		int error, pktlen;

		pktlen = rxr->hn_pktbuf_len;
		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
		if (__predict_false(error == ENOBUFS)) {
			void *nbuf;
			int nlen;

			/*
			 * Expand channel packet buffer.
			 *
			 * XXX
			 * Use M_WAITOK here, since allocation failure
			 * is fatal.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);

			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;

		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}
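
/*
 * Driver-wide initialization: sanitize loader tunables, initialize the
 * VF map, and create the global TX taskqueues when the global taskqueue
 * mode is selected on a Hyper-V guest.
 */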
static void
hn_sysinit(void *arg __unused)
{
	int i;

	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);

#ifdef HN_IFSTART_SUPPORT
	/*
	 * Don't use ifnet.if_start if transparent VF mode is requested;
	 * mainly due to the IFF_DRV_OACTIVE flag.
	 */
	if (hn_xpnt_vf && hn_use_if_start) {
		hn_use_if_start = 0;
		printf("hn: transparent VF mode, if_transmit will be used, "
		    "instead of if_start\n");
	}
#endif
	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
		printf("hn: invalid transparent VF attach routine "
		    "wait timeout %d, reset to %d\n",
		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
	}

	/*
	 * Initialize VF map.
	 */
	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
	    M_WAITOK | M_ZERO);

	/*
	 * Fix the # of TX taskqueues.
	 */
	if (hn_tx_taskq_cnt <= 0)
		hn_tx_taskq_cnt = 1;
	else if (hn_tx_taskq_cnt > mp_ncpus)
		hn_tx_taskq_cnt = mp_ncpus;

	/*
	 * Fix the TX taskqueue mode.
	 */
	switch (hn_tx_taskq_mode) {
	case HN_TX_TASKQ_M_INDEP:
	case HN_TX_TASKQ_M_GLOBAL:
	case HN_TX_TASKQ_M_EVTTQ:
		break;
	default:
		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
		break;
	}

	if (vm_guest != VM_GUEST_HV)
		return;

	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
		return;

	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
	    M_DEVBUF, M_WAITOK);
	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
		    "hn tx%d", i);
	}
}
SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
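
/*
 * Driver-wide teardown: free the global TX taskqueues, the VF map and
 * its lock, and the UDP checksum fixup counter.
 */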
static void
hn_sysuninit(void *arg __unused)
{

	if (hn_tx_taskque != NULL) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(hn_tx_taskque[i]);
		free(hn_tx_taskque, M_DEVBUF);
	}

	if (hn_vfmap != NULL)
		free(hn_vfmap, M_DEVBUF);
	rm_destroy(&hn_vfmap_lock);

	counter_u64_free(hn_udpcs_fixup);
}
SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);