/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/ethernet.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>
#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"

#define HN_IFSTART_SUPPORT

/* NOTE: M_HASHTYPE_RSS_UDP_IPV4 is not available on stable/10. */
#ifndef M_HASHTYPE_RSS_UDP_IPV4
#define M_HASHTYPE_RSS_UDP_IPV4		M_HASHTYPE_OPAQUE
#endif

#define HN_RING_CNT_DEF_MAX		8

#define HN_VFMAP_SIZE_DEF		8

#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

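/*
 * Per-packet RNDIS headroom reserved below: one RNDIS packet message plus
 * the largest set of per-packet metadata (pktinfo) this driver ever
 * attaches, i.e. hash value, VLAN, LSOv2 and TX checksum information.
 */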
#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

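/*
 * The per-device lock below is an sx(9) lock.  Note that HN_LOCK() polls
 * sx_try_xlock() in a loop (with a short DELAY() between attempts) rather
 * than sleeping in sx_xlock(); HN_LOCK_ASSERT() requires the lock to be
 * held exclusively.
 */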
#define HN_LOCK_INIT(sc)		\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

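/*
 * Example: for a 1500 byte packet and 32 byte alignment, HN_PKTSIZE()
 * evaluates to roundup2(1500 + HN_RNDIS_PKT_LEN, 32), i.e. the packet
 * plus its reserved RNDIS headroom, rounded up to the next 32 byte
 * boundary.
 */
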
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0

static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static void			hn_ifnet_event(void *, struct ifnet *, int);
static void			hn_ifaddr_event(void *, struct ifnet *);
static void			hn_ifnet_attevent(void *, struct ifnet *);
static void			hn_ifnet_detevent(void *, struct ifnet *);
static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);

static bool			hn_ismyvf(const struct hn_softc *,
				    const struct ifnet *);
static void			hn_rxvf_change(struct hn_softc *,
				    struct ifnet *, bool);
static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void			hn_rxvf_set_task(void *, int);
static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
				    struct ifreq *);
static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
static bool			hn_xpnt_vf_isready(struct hn_softc *);
static void			hn_xpnt_vf_setready(struct hn_softc *);
static void			hn_xpnt_vf_init_taskfunc(void *, int);
static void			hn_xpnt_vf_init(struct hn_softc *);
static void			hn_xpnt_vf_setenable(struct hn_softc *);
static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
static void			hn_vf_rss_fixup(struct hn_softc *, bool);
static void			hn_vf_rss_restore(struct hn_softc *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);
static void			hn_rndis_init_fixat(struct hn_softc *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);

static void			hn_stop(struct hn_softc *, bool);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_disable_rx(struct hn_softc *);
static void			hn_drain_rxtx(struct hn_softc *, int);
static void			hn_polling(struct hn_softc *, u_int);
static void			hn_chan_polling(struct vmbus_channel *, u_int);
static void			hn_mtu_change_fixup(struct hn_softc *);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static void			hn_rxpkt_proto(const struct mbuf *, int *, int *);
static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
static int			hn_rxfilter_config(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);
static uint32_t			hn_rss_type_fromndis(uint32_t);
static uint32_t			hn_rss_type_tondis(uint32_t);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_fixup_rx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust tcp segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust udp datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/*
 * Offload UDP/IPv4 checksum.
 */
static int hn_enable_udp4cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
    &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");

/*
 * Offload UDP/IPv6 checksum.
 */
static int hn_enable_udp6cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
    &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");

static counter_u64_t hn_udpcs_fixup;
SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
    &hn_udpcs_fixup, "# of UDP checksum fixups");

/*
 * This value is for Azure.  For Hyper-V, set this above
 * 65536 to disable UDP datagram checksum fixup.
 */
static int hn_udpcs_fixup_mtu = 1420;
SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vflist_sysctl, "A", "VF list");

SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vfmap_sysctl, "A", "VF mapping");

static int hn_xpnt_vf = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;

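/*
 * Default RSS key: the well-known 40-byte Toeplitz key from Microsoft's
 * RSS documentation.
 */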
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

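/*
 * Allocate a chimney (send buffer) slot: scan the allocation bitmap for a
 * clear bit and claim it with an atomic test-and-set, so no lock is
 * needed.  Returns HN_NVS_CHIM_IDX_INVALID when all slots are in use.
 */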
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

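/*
 * Return a chimney slot to the allocation bitmap; the KASSERTs catch
 * out-of-range indices and double frees.
 */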
static void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}

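/*
 * PULLUP_HDR() makes sure the first 'len' bytes of the mbuf chain are
 * contiguous.  On m_pullup() failure the chain has already been freed and
 * the macro returns NULL from the calling function, which is why the
 * header-fixup helpers below note that a failed call frees m_head.
 */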
#if defined(INET6) || defined(INET)

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

/*
 * NOTE: If this function failed, the m_head would be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
}

/*
 * NOTE: If this function failed, the m_head would be freed.
 */
static __inline struct mbuf *
hn_set_hlen(struct mbuf *m_head)
{
	const struct ether_vlan_header *evl;
	int ehlen;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		/*
		 * UDP checksum offload does not work in Azure, if the
		 * following conditions are met:
		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
		 * - IP_DF is not set in the IP hdr.
		 *
		 * Fallback to software checksum for these UDP datagrams.
		 */
		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
		    (ntohs(ip->ip_off) & IP_DF) == 0) {
			uint16_t off = ehlen + iphlen;

			counter_u64_add(hn_udpcs_fixup, 1);
			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
			*(uint16_t *)(m_head->m_data + off +
			    m_head->m_pkthdr.csum_data) = in_cksum_skip(
			    m_head, m_head->m_pkthdr.len, off);
			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
		}
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
	}
#endif
	return (m_head);
}

/*
 * NOTE: If this function failed, the m_head would be freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct tcphdr *th;
	int ehlen, iphlen;

	*tcpsyn = 0;
	ehlen = m_head->m_pkthdr.l2hlen;
	iphlen = m_head->m_pkthdr.l3hlen;

	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
	th = mtodo(m_head, ehlen + iphlen);
	if (th->th_flags & TH_SYN)
		*tcpsyn = 1;
	return (m_head);
}

#undef PULLUP_HDR

#endif	/* INET6 || INET */

static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * promiscuous mode.
	 */
	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}

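/*
 * Compute the TX aggregation limits: take the minimum of the tunables,
 * the limits offered by RNDIS and the chimney buffer size, then publish
 * the result to all TX rings.  Aggregation is disabled if the budget
 * cannot hold at least two minimum-sized packets.
 */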
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}

static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}

static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}

static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}

static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return 0;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

static void
hn_rxvf_set_task(void *xarg, int pending __unused)
{
	struct hn_rxvf_setarg *arg = xarg;

	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
}

static void
hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
{
	struct hn_rx_ring *rxr;
	struct hn_rxvf_setarg arg;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			arg.rxr = rxr;
			arg.vf_ifp = vf_ifp;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_rxvf_ifp = vf_ifp;
		}
	}
}

static bool
hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
{
	const struct ifnet *hn_ifp;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		return (false);

	if (ifp->if_alloctype != IFT_ETHER)
		return (false);

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		return (false);

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		return (false);

	return (true);
}

static void
hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	if (!hn_ismyvf(sc, ifp))
		goto out;
	hn_ifp = sc->hn_ifp;

	if (rxvf) {
		if (sc->hn_flags & HN_FLAG_RXVF)
			goto out;

		sc->hn_flags |= HN_FLAG_RXVF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_RXVF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_RXVF;
		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);

	hn_rxvf_set(sc, rxvf ? ifp : NULL);

	if (rxvf) {
		hn_vf_rss_fixup(sc, true);
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_vf_rss_restore(sc);
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
	    rxvf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "datapath is switched %s %s\n",
		    rxvf ? "to" : "from", ifp->if_xname);
	}
out:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
{

	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;
	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
}

static void
hn_ifaddr_event(void *arg, struct ifnet *ifp)
{

	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
}

static int
hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
{
	struct ifnet *ifp, *vf_ifp;
	uint64_t tmp;
	int error;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Fix up requested capabilities w/ supported capabilities,
	 * since the supported capabilities could have been changed.
	 */
	ifr->ifr_reqcap &= ifp->if_capabilities;
	/* Pass SIOCSIFCAP to VF. */
	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);

	/*
	 * NOTE:
	 * The error will be propagated to the callers, however, it
	 * is _not_ useful here.
	 */

	/*
	 * Merge VF's enabled capabilities.
	 */
	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
	if (ifp->if_capenable & IFCAP_TSO4)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
	if (ifp->if_capenable & IFCAP_TSO6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	return (error);
}

static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	struct ifnet *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
}

static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
}

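/*
 * if_input replacement installed on the transparent VF: tap BPF on the
 * VF itself, then redirect the mbufs into hn(4)'s if_input, so the stack
 * only ever sees the synthetic interface.
 */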
static void
hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	struct ifnet *hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (vf_ifp->if_index < hn_vfmap_size)
		hn_ifp = hn_vfmap[vf_ifp->if_index];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/*
			 * Allow tapping on the VF.
			 */
			ETHER_BPF_MTAP(vf_ifp, mn);

			/*
			 * Update VF stats.
			 */
			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
				    mn->m_pkthdr.len);
			}
			/*
			 * XXX IFCOUNTER_IMCAST
			 * This stat updating is kinda invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check.  As of this writing,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF a try in vain.
			 */

			/*
			 * Fix up rcvif and increase hn(4)'s ipackets.
			 */
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		/*
		 * Go through hn(4)'s if_input.
		 */
		hn_ifp->if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * mbuf chain.
		 */
		while (m != NULL) {
			mn = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
			m = mn;
		}
	}
}

static void
hn_mtu_change_fixup(struct hn_softc *sc)
{
	struct ifnet *ifp;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;

	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
#endif
}

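/*
 * Translate between the NDIS_HASH_* capability bits used by the host and
 * the kernel's RSS_TYPE_* namespace.  The mapping is 1:1, except that the
 * host's UDP/IPv4 hash uses this driver's NDIS_HASH_UDP_IPV4_X bit.
 */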
static uint32_t
hn_rss_type_fromndis(uint32_t rss_hash)
{
	uint32_t types = 0;

	if (rss_hash & NDIS_HASH_IPV4)
		types |= RSS_TYPE_IPV4;
	if (rss_hash & NDIS_HASH_TCP_IPV4)
		types |= RSS_TYPE_TCP_IPV4;
	if (rss_hash & NDIS_HASH_IPV6)
		types |= RSS_TYPE_IPV6;
	if (rss_hash & NDIS_HASH_IPV6_EX)
		types |= RSS_TYPE_IPV6_EX;
	if (rss_hash & NDIS_HASH_TCP_IPV6)
		types |= RSS_TYPE_TCP_IPV6;
	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
		types |= RSS_TYPE_TCP_IPV6_EX;
	if (rss_hash & NDIS_HASH_UDP_IPV4_X)
		types |= RSS_TYPE_UDP_IPV4;
	return (types);
}

static uint32_t
hn_rss_type_tondis(uint32_t types)
{
	uint32_t rss_hash = 0;

	KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
	    ("UDP6 and UDP6EX are not supported"));

	if (types & RSS_TYPE_IPV4)
		rss_hash |= NDIS_HASH_IPV4;
	if (types & RSS_TYPE_TCP_IPV4)
		rss_hash |= NDIS_HASH_TCP_IPV4;
	if (types & RSS_TYPE_IPV6)
		rss_hash |= NDIS_HASH_IPV6;
	if (types & RSS_TYPE_IPV6_EX)
		rss_hash |= NDIS_HASH_IPV6_EX;
	if (types & RSS_TYPE_TCP_IPV6)
		rss_hash |= NDIS_HASH_TCP_IPV6;
	if (types & RSS_TYPE_TCP_IPV6_EX)
		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
	if (types & RSS_TYPE_UDP_IPV4)
		rss_hash |= NDIS_HASH_UDP_IPV4_X;
	return (rss_hash);
}

static void
hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
}

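/*
 * Align the synthetic device's RSS configuration with the VF's: adopt the
 * VF's 40-byte Toeplitz key, intersect the hash types both sides support,
 * stop delivering mbuf hash values whose meaning would differ between the
 * two, and reprogram RSS if requested.
 */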
static void
hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifrsshash ifrh;
	struct ifrsskey ifrk;
	int error;
	uint32_t my_types, diff_types, mbuf_types = 0;

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1) {
		/* No RSS on synthetic parts; done. */
		return;
	}
	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
		/* Synthetic parts do not support Toeplitz; done. */
		return;
	}

	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
	 * supported.
	 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
	if (error) {
		if_printf(ifp, "%s SIOCGRSSKEY failed: %d\n",
		    vf_ifp->if_xname, error);
		return;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrk.ifrk_func);
		return;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    vf_ifp->if_xname, ifrk.ifrk_keylen);
		return;
	}

	/*
	 * Extract VF's RSS hash.  Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
	if (error) {
		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
		    vf_ifp->if_xname, error);
		return;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrh.ifrh_func);
		return;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed.  "
		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
		    ifrh.ifrh_types, my_types);
		return;
	}

	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on RX path.
	 *
	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple
	 * hash is delivered with type of TCP_IPV4.  This means if
	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
	 * least to hn_mbuf_hash.  However, given that _all_ of the
	 * NICs implement TCP_IPV4, this will _not_ impose any issues
	 * here.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
	}

	/*
	 * Indirect table does not matter.
	 */
	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
	    hn_rss_type_tondis(my_types);
	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;

	if (reconf) {
		error = hn_rss_reconfig(sc);
		if (error) {
			/* XXX roll-back? */
			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
			/* XXX keep going. */
		}
	}

	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
}

static void
hn_vf_rss_restore(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1)
		goto done;

	/*
	 * Restore hash types.  Key does _not_ matter.
	 */
	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
		int error;

		sc->hn_rss_hash = sc->hn_rss_hcap;
		error = hn_rss_reconfig(sc);
		if (error) {
			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
			    error);
			/* XXX keep going. */
		}
	}
done:
	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
}

static void
hn_xpnt_vf_setready(struct hn_softc *sc)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Mark the VF ready.
	 */
	sc->hn_vf_rdytick = 0;

	/*
	 * Save information for restoration.
	 */
	sc->hn_saved_caps = ifp->if_capabilities;
	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;

	/*
	 * Intersect supported/enabled capabilities.
	 *
	 * NOTE:
	 * if_hwassist is not changed here.
	 */
	ifp->if_capabilities &= vf_ifp->if_capabilities;
	ifp->if_capenable &= ifp->if_capabilities;

	/*
	 * Fix up TSO settings.
	 */
	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;

	/*
	 * Change VF's enabled capabilities.
	 */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_reqcap = ifp->if_capenable;
	hn_xpnt_vf_iocsetcaps(sc, &ifr);

	if (ifp->if_mtu != ETHERMTU) {
		int error;

		/*
		 * Change VF's MTU.
		 */
		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
		ifr.ifr_mtu = ifp->if_mtu;
		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
		if (error) {
			if_printf(ifp, "%s SIOCSIFMTU %lu failed\n",
			    vf_ifp->if_xname, ifp->if_mtu);
			if (ifp->if_mtu > ETHERMTU) {
				if_printf(ifp, "change MTU to %d\n", ETHERMTU);

				/*
				 * XXX
				 * No need to adjust the synthetic parts' MTU;
				 * failure of the adjustment would cause us
				 * endless headaches.
				 */
				ifp->if_mtu = ETHERMTU;
				hn_mtu_change_fixup(sc);
			}
		}
	}
}

static bool
hn_xpnt_vf_isready(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);

	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
		return (false);

	if (sc->hn_vf_rdytick == 0)
		return (true);

	if (sc->hn_vf_rdytick > ticks)
		return (false);

	/* Mark VF as ready. */
	hn_xpnt_vf_setready(sc);

	return (true);
}

static void
hn_xpnt_vf_setenable(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
}

static void
hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
	if (clear_vf)
		sc->hn_vf_ifp = NULL;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
}

static void
hn_xpnt_vf_init(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));

	if (bootverbose) {
		if_printf(sc->hn_ifp, "try bringing up %s\n",
		    sc->hn_vf_ifp->if_xname);
	}

	/*
	 * Bring the VF up.
	 */
	hn_xpnt_vf_saveifflags(sc);
	sc->hn_vf_ifp->if_flags |= IFF_UP;
	error = hn_xpnt_vf_iocsetflags(sc);
	if (error) {
		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
		    sc->hn_vf_ifp->if_xname, error);
		return;
	}

	/*
	 * NOTE:
	 * Datapath setting must happen _after_ bringing the VF up.
	 */
	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);

	/*
	 * NOTE:
	 * Fixup RSS related bits _after_ the VF is brought up, since
	 * many VFs generate the RSS key during their initialization.
	 */
	hn_vf_rss_fixup(sc, true);

	/* Mark transparent mode VF as enabled. */
	hn_xpnt_vf_setenable(sc);
}

static void
hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		goto done;
	if (sc->hn_vf_ifp == NULL)
		goto done;
	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
		goto done;

	if (sc->hn_vf_rdytick != 0) {
		/* Mark VF as ready. */
		hn_xpnt_vf_setready(sc);
	}

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
		/*
		 * Delayed VF initialization.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "delayed initialize %s\n",
			    sc->hn_vf_ifp->if_xname);
		}
		hn_xpnt_vf_init(sc);
	}
done:
	HN_UNLOCK(sc);
}

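/*
 * Attach-time ifnet event handler: if the newly arrived ifnet is this
 * device's VF, record it in the global hn_vfmap (indexed by if_index,
 * grown in HN_VFMAP_SIZE_DEF increments) and, in transparent VF mode,
 * hook its if_input and schedule the delayed VF initialization.
 */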
static void
hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    sc->hn_vf_ifp->if_xname);
		goto done;
	}

	if (hn_xpnt_vf && ifp->if_start != NULL) {
		/*
		 * ifnet.if_start is _not_ supported by transparent
		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
		 */
		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
		    "in transparent VF mode.\n", ifp->if_xname);
		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	if (ifp->if_index >= hn_vfmap_size) {
		struct ifnet **newmap;
		int newsize;

		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);
		memcpy(newmap, hn_vfmap,
		    sizeof(struct ifnet *) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);

		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}
	KASSERT(hn_vfmap[ifp->if_index] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
	hn_vfmap[ifp->if_index] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
	sc->hn_vf_ifp = ifp;
	rm_wunlock(&sc->hn_vf_lock);

	if (hn_xpnt_vf) {
		int wait_ticks;

		/*
		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
		 * Save vf_ifp's current if_input for later restoration.
		 */
		sc->hn_vf_input = ifp->if_input;
		ifp->if_input = hn_xpnt_vf_input;

		/*
		 * Stop link status management; use the VF's.
		 */
		hn_suspend_mgmt(sc);

		/*
		 * Give the VF some time to complete its attach routine.
		 */
		wait_ticks = hn_xpnt_vf_attwait * hz;
		sc->hn_vf_rdytick = ticks + wait_ticks;

		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
		    wait_ticks);
	}
done:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (sc->hn_vf_ifp == NULL)
		goto done;
	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (hn_xpnt_vf) {
		/*
		 * Make sure that the delayed initialization is not running.
		 *
		 * NOTE:
		 * - This lock _must_ be released, since the hn_vf_init task
		 *   will try holding this lock.
		 * - It is safe to release this lock here, since the
		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
		 *
		 * XXX racy, if hn(4) ever detached.
		 */
		HN_UNLOCK(sc);
		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
		HN_LOCK(sc);

		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
		    sc->hn_ifp->if_xname));
		ifp->if_input = sc->hn_vf_input;
		sc->hn_vf_input = NULL;

		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);

		if (sc->hn_vf_rdytick == 0) {
			/*
			 * The VF was ready; restore some settings.
			 */
			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
			/*
			 * NOTE:
			 * There is _no_ need to fixup if_capenable and
			 * if_hwassist, since the if_capabilities before
			 * restoration was an intersection of the VF's
			 * if_capabilities and the synthetic device's
			 * if_capabilities.
			 */
			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
			sc->hn_ifp->if_hw_tsomaxsegcount =
			    sc->hn_saved_tsosegcnt;
			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
		}

		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			/*
			 * Restore RSS settings.
			 */
			hn_vf_rss_restore(sc);

			/*
			 * Resume link status management, which was suspended
			 * by hn_ifnet_attevent().
			 */
			hn_resume_mgmt(sc);
		}
	}

	/* Mark transparent mode VF as disabled. */
	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);

	rm_wlock(&hn_vfmap_lock);

	KASSERT(ifp->if_index < hn_vfmap_size,
	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
	if (hn_vfmap[ifp->if_index] != NULL) {
		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
		    ("%s: ifindex %d was mapped to %s",
		     ifp->if_xname, ifp->if_index,
		     hn_vfmap[ifp->if_index]->if_xname));
		hn_vfmap[ifp->if_index] = NULL;
	}

	rm_wunlock(&hn_vfmap_lock);
done:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_vf_ifp == ifp)
		if_link_state_change(sc->hn_ifp, link_state);
}

static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}

2067 hn_attach(device_t dev)
2069 struct hn_softc *sc = device_get_softc(dev);
2070 struct sysctl_oid_list *child;
2071 struct sysctl_ctx_list *ctx;
2072 uint8_t eaddr[ETHER_ADDR_LEN];
2073 struct ifnet *ifp = NULL;
2074 int error, ring_cnt, tx_ring_cnt;
2078 sc->hn_prichan = vmbus_get_channel(dev);
2080 rm_init(&sc->hn_vf_lock, "hnvf");
2081 if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2082 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2085 * Initialize these tunables once.
2087 sc->hn_agg_size = hn_tx_agg_size;
2088 sc->hn_agg_pkts = hn_tx_agg_pkts;
2091 * Setup taskqueue for transmission.
2093 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2097 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2098 M_DEVBUF, M_WAITOK);
2099 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2100 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2101 M_WAITOK, taskqueue_thread_enqueue,
2102 &sc->hn_tx_taskqs[i]);
2103 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2104 "%s tx%d", device_get_nameunit(dev), i);
2106 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2107 sc->hn_tx_taskqs = hn_tx_taskque;
2111 * Setup taskqueue for mangement tasks, e.g. link status.
2113 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2114 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2115 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2116 device_get_nameunit(dev));
2117 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2118 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2119 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2120 hn_netchg_status_taskfunc, sc);
2124 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2126 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2127 taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2128 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2129 device_get_nameunit(dev));
2130 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2131 hn_xpnt_vf_init_taskfunc, sc);
2135 * Allocate ifnet and setup its name earlier, so that if_printf
2136 * can be used by functions, which will be called after
2139 ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
2141 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2144 * Initialize ifmedia earlier so that it can be unconditionally
2145 * destroyed, if error happened later on.
2147 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2150 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2151 * to use (tx_ring_cnt).
2154 * The # of RX rings to use is same as the # of channels to use.
2156 ring_cnt = hn_chan_cnt;
2157 if (ring_cnt <= 0) {
2159 ring_cnt = mp_ncpus;
2160 if (ring_cnt > HN_RING_CNT_DEF_MAX)
2161 ring_cnt = HN_RING_CNT_DEF_MAX;
2162 } else if (ring_cnt > mp_ncpus) {
2163 ring_cnt = mp_ncpus;
2166 tx_ring_cnt = hn_tx_ring_cnt;
2167 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2168 tx_ring_cnt = ring_cnt;
2169 #ifdef HN_IFSTART_SUPPORT
2170 if (hn_use_if_start) {
2171 /* ifnet.if_start only needs one TX ring. */
2177 * Set the leader CPU for channels.
2179 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
2182 * Create enough TX/RX rings, even if only limited number of
2183 * channels can be allocated.
2185 error = hn_create_tx_data(sc, tx_ring_cnt);
2188 error = hn_create_rx_data(sc, ring_cnt);
2193 * Create transaction context for NVS and RNDIS transactions.
2195 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2196 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2197 if (sc->hn_xact == NULL) {
2203 * Install orphan handler for the revocation of this device's
2207 * The processing order is critical here:
2208 * Install the orphan handler, _before_ testing whether this
2209 * device's primary channel has been revoked or not.
2211 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2212 if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2218 * Attach the synthetic parts, i.e. NVS and RNDIS.
2220 error = hn_synth_attach(sc, ETHERMTU);
2224 error = hn_rndis_get_eaddr(sc, eaddr);
2228 error = hn_rndis_get_mtu(sc, &mtu);
2231 else if (bootverbose)
2232 device_printf(dev, "RNDIS mtu %u\n", mtu);
2234 #if __FreeBSD_version >= 1100099
2235 if (sc->hn_rx_ring_inuse > 1) {
2237 * Reduce TCP segment aggregation limit for multiple
2238 * RX rings to increase ACK timeliness.
2240 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2245 * Fix up TX/RX settings after the synthetic parts are attached.
2247 hn_fixup_tx_data(sc);
2248 hn_fixup_rx_data(sc);
2250 ctx = device_get_sysctl_ctx(dev);
2251 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2252 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2253 &sc->hn_nvs_ver, 0, "NVS version");
2254 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2255 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2256 hn_ndis_version_sysctl, "A", "NDIS version");
2257 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2258 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2259 hn_caps_sysctl, "A", "capabilities");
2260 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2261 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2262 hn_hwassist_sysctl, "A", "hwassist");
2263 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2264 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2265 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2266 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2267 "max # of TSO segments");
2268 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2269 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2270 "max size of TSO segment");
2271 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2272 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2273 hn_rxfilter_sysctl, "A", "rxfilter");
2274 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2275 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2276 hn_rss_hash_sysctl, "A", "RSS hash");
2277 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2278 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2279 hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2280 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2281 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2282 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2283 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2284 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2285 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2286 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2287 hn_rss_key_sysctl, "IU", "RSS key");
2288 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2289 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2290 hn_rss_ind_sysctl, "IU", "RSS indirect table");
2291 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2292 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2293 "RNDIS offered packet transmission aggregation size limit");
2294 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2295 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2296 "RNDIS offered packet transmission aggregation count limit");
2297 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2298 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2299 "RNDIS packet transmission aggregation alignment");
2300 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2301 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2302 hn_txagg_size_sysctl, "I",
2303 "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2304 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2305 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2306 hn_txagg_pkts_sysctl, "I",
2307 "Packet transmission aggregation packets, "
2308 "0 -- disable, -1 -- auto");
2309 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2310 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2311 hn_polling_sysctl, "I",
2312 "Polling frequency: [100,1000000], 0 disable polling");
2313 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2314 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2315 hn_vf_sysctl, "A", "Virtual Function's name");
2317 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2318 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2319 hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2321 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2322 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2323 hn_xpnt_vf_enabled_sysctl, "I",
2324 "Transparent VF enabled");
2325 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2326 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2327 hn_xpnt_vf_accbpf_sysctl, "I",
2328 "Accurate BPF for transparent VF");
2332 * Setup the ifmedia, which has been initialized earlier.
2334 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2335 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2336 /* XXX ifmedia_set really should do this for us */
2337 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2340 * Setup the ifnet for this interface.
2344 ifp->if_baudrate = IF_Gbps(10);
2346 /* if_baudrate is 32 bits on 32-bit systems. */
2347 ifp->if_baudrate = IF_Gbps(1);
2349 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2350 ifp->if_ioctl = hn_ioctl;
2351 ifp->if_init = hn_init;
2352 #ifdef HN_IFSTART_SUPPORT
2353 if (hn_use_if_start) {
2354 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2356 ifp->if_start = hn_start;
2357 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2358 ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2359 IFQ_SET_READY(&ifp->if_snd);
2363 ifp->if_transmit = hn_transmit;
2364 ifp->if_qflush = hn_xmit_qflush;
2367 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2369 /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2370 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2372 if (sc->hn_caps & HN_CAP_VLAN) {
2373 /* XXX not sure about VLAN_MTU. */
2374 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2377 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2378 if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2379 ifp->if_capabilities |= IFCAP_TXCSUM;
2380 if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2381 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2382 if (sc->hn_caps & HN_CAP_TSO4) {
2383 ifp->if_capabilities |= IFCAP_TSO4;
2384 ifp->if_hwassist |= CSUM_IP_TSO;
2386 if (sc->hn_caps & HN_CAP_TSO6) {
2387 ifp->if_capabilities |= IFCAP_TSO6;
2388 ifp->if_hwassist |= CSUM_IP6_TSO;
2391 /* Enable all available capabilities by default. */
2392 ifp->if_capenable = ifp->if_capabilities;
2395 * Disable IPv6 TSO and TXCSUM by default; they can still
2396 * be enabled through SIOCSIFCAP.
2398 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2399 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2401 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2403 * Lock hn_set_tso_maxsize() to simplify its internal logic.
2407 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2409 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2410 ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2413 ether_ifattach(ifp, eaddr);
2415 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2416 if_printf(ifp, "TSO segcnt %u segsz %u\n",
2417 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2419 if (mtu < ETHERMTU) {
2420 if_printf(ifp, "fixup mtu %lu -> %u\n", ifp->if_mtu, mtu);
2424 /* Inform the upper layer about the long frame support. */
2425 ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2428 * Kick off link status check.
2430 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2431 hn_update_link_status(sc);
2434 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2435 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2436 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2437 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2439 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2440 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2445 * Subscribe to the ether_ifattach event instead of the ifnet_arrival
2446 * event, since the interface's LLADDR is needed; the LLADDR is not
2447 * available when the ifnet_arrival event is triggered.
2449 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2450 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2451 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2452 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2456 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2457 hn_synth_detach(sc);
2463 hn_detach(device_t dev)
2465 struct hn_softc *sc = device_get_softc(dev);
2466 struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2468 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2470 * In case the vmbus missed the orphan handler installation.
2473 vmbus_xact_ctx_orphan(sc->hn_xact);
2476 if (sc->hn_ifaddr_evthand != NULL)
2477 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2478 if (sc->hn_ifnet_evthand != NULL)
2479 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2480 if (sc->hn_ifnet_atthand != NULL) {
2481 EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2482 sc->hn_ifnet_atthand);
2484 if (sc->hn_ifnet_dethand != NULL) {
2485 EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2486 sc->hn_ifnet_dethand);
2488 if (sc->hn_ifnet_lnkhand != NULL)
2489 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2491 vf_ifp = sc->hn_vf_ifp;
2492 __compiler_membar();
2494 hn_ifnet_detevent(sc, vf_ifp);
2496 if (device_is_attached(dev)) {
2498 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2499 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2503 * hn_stop() only suspends data, so management
2504 * tasks have to be suspended manually here.
2506 hn_suspend_mgmt(sc);
2507 hn_synth_detach(sc);
2510 ether_ifdetach(ifp);
2513 ifmedia_removeall(&sc->hn_media);
2514 hn_destroy_rx_data(sc);
2515 hn_destroy_tx_data(sc);
2517 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2520 for (i = 0; i < hn_tx_taskq_cnt; ++i)
2521 taskqueue_free(sc->hn_tx_taskqs[i]);
2522 free(sc->hn_tx_taskqs, M_DEVBUF);
2524 taskqueue_free(sc->hn_mgmt_taskq0);
2525 if (sc->hn_vf_taskq != NULL)
2526 taskqueue_free(sc->hn_vf_taskq);
2528 if (sc->hn_xact != NULL) {
2530 * Uninstall the orphan handler _before_ the xact is destroyed.
2533 vmbus_chan_unset_orphan(sc->hn_prichan);
2534 vmbus_xact_ctx_destroy(sc->hn_xact);
2539 HN_LOCK_DESTROY(sc);
2540 rm_destroy(&sc->hn_vf_lock);
2545 hn_shutdown(device_t dev)
2552 hn_link_status(struct hn_softc *sc)
2554 uint32_t link_status;
2557 error = hn_rndis_get_linkstatus(sc, &link_status);
2559 /* XXX what to do? */
2563 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2564 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2566 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2567 if_link_state_change(sc->hn_ifp,
2568 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2569 LINK_STATE_UP : LINK_STATE_DOWN);
2573 hn_link_taskfunc(void *xsc, int pending __unused)
2575 struct hn_softc *sc = xsc;
2577 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2583 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2585 struct hn_softc *sc = xsc;
2587 /* Prevent any link status checks from running. */
2588 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2591 * Fake up a [link down --> link up] state change; a 5 second
2592 * delay is used, closely simulating the miibus reaction
2593 * to a link down event.
2595 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2596 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2597 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2598 &sc->hn_netchg_status, 5 * hz);
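/*
 * hn_netchg_status_taskfunc runs about 5 seconds later; it clears
 * HN_LINK_FLAG_NETCHG again and reports the then-current link state.
 */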
2602 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2604 struct hn_softc *sc = xsc;
2606 /* Re-allow link status checks. */
2607 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2612 hn_update_link_status(struct hn_softc *sc)
2615 if (sc->hn_mgmt_taskq != NULL)
2616 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2620 hn_change_network(struct hn_softc *sc)
2623 if (sc->hn_mgmt_taskq != NULL)
2624 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2628 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2629 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2631 struct mbuf *m = *m_head;
2634 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2636 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2637 m, segs, nsegs, BUS_DMA_NOWAIT);
2638 if (error == EFBIG) {
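/*
 * The mbuf chain has more segments than the DMA tag allows;
 * collapse it down to HN_TX_DATA_SEGCNT_MAX segments and retry
 * the load once.
 */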
2641 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2645 *m_head = m = m_new;
2646 txr->hn_tx_collapsed++;
2648 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2649 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2652 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2653 BUS_DMASYNC_PREWRITE);
2654 txd->flags |= HN_TXD_FLAG_DMAMAP;
2660 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2663 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2664 ("put an onlist txd %#x", txd->flags));
2665 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2666 ("put an onagg txd %#x", txd->flags));
2668 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2669 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2672 if (!STAILQ_EMPTY(&txd->agg_list)) {
2673 struct hn_txdesc *tmp_txd;
2675 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2678 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2679 ("resursive aggregation on aggregated txdesc"));
2680 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2681 ("not aggregated txdesc"));
2682 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2683 ("aggregated txdesc uses dmamap"));
2684 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2685 ("aggregated txdesc consumes "
2686 "chimney sending buffer"));
2687 KASSERT(tmp_txd->chim_size == 0,
2688 ("aggregated txdesc has non-zero "
2689 "chimney sending size"));
2691 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2692 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2693 freed = hn_txdesc_put(txr, tmp_txd);
2694 KASSERT(freed, ("failed to free aggregated txdesc"));
2698 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2699 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2700 ("chim txd uses dmamap"));
2701 hn_chim_free(txr->hn_sc, txd->chim_index);
2702 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2704 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2705 bus_dmamap_sync(txr->hn_tx_data_dtag,
2706 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2707 bus_dmamap_unload(txr->hn_tx_data_dtag,
2709 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2712 if (txd->m != NULL) {
2717 txd->flags |= HN_TXD_FLAG_ONLIST;
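/*
 * Return the txdesc to the free pool: a spinlock-protected SLIST
 * or a lock-free buf_ring, depending on HN_USE_TXDESC_BUFRING.
 */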
2718 #ifndef HN_USE_TXDESC_BUFRING
2719 mtx_lock_spin(&txr->hn_txlist_spin);
2720 KASSERT(txr->hn_txdesc_avail >= 0 &&
2721 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2722 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2723 txr->hn_txdesc_avail++;
2724 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2725 mtx_unlock_spin(&txr->hn_txlist_spin);
2726 #else /* HN_USE_TXDESC_BUFRING */
2728 atomic_add_int(&txr->hn_txdesc_avail, 1);
2730 buf_ring_enqueue(txr->hn_txdesc_br, txd);
2731 #endif /* !HN_USE_TXDESC_BUFRING */
2736 static __inline struct hn_txdesc *
2737 hn_txdesc_get(struct hn_tx_ring *txr)
2739 struct hn_txdesc *txd;
2741 #ifndef HN_USE_TXDESC_BUFRING
2742 mtx_lock_spin(&txr->hn_txlist_spin);
2743 txd = SLIST_FIRST(&txr->hn_txlist);
2745 KASSERT(txr->hn_txdesc_avail > 0,
2746 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2747 txr->hn_txdesc_avail--;
2748 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2750 mtx_unlock_spin(&txr->hn_txlist_spin);
2752 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2756 #ifdef HN_USE_TXDESC_BUFRING
2758 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2760 #endif /* HN_USE_TXDESC_BUFRING */
2761 KASSERT(txd->m == NULL && txd->refs == 0 &&
2762 STAILQ_EMPTY(&txd->agg_list) &&
2763 txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2764 txd->chim_size == 0 &&
2765 (txd->flags & HN_TXD_FLAG_ONLIST) &&
2766 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2767 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2768 txd->flags &= ~HN_TXD_FLAG_ONLIST;
2774 static __inline void
2775 hn_txdesc_hold(struct hn_txdesc *txd)
2778 /* 0->1 transition will never work */
2779 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2780 atomic_add_int(&txd->refs, 1);
2783 static __inline void
2784 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2787 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2788 ("recursive aggregation on aggregating txdesc"));
2790 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2791 ("already aggregated"));
2792 KASSERT(STAILQ_EMPTY(&txd->agg_list),
2793 ("recursive aggregation on to-be-aggregated txdesc"));
2795 txd->flags |= HN_TXD_FLAG_ONAGG;
2796 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2800 hn_tx_ring_pending(struct hn_tx_ring *txr)
2802 bool pending = false;
2804 #ifndef HN_USE_TXDESC_BUFRING
2805 mtx_lock_spin(&txr->hn_txlist_spin);
2806 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2808 mtx_unlock_spin(&txr->hn_txlist_spin);
2810 if (!buf_ring_full(txr->hn_txdesc_br))
2816 static __inline void
2817 hn_txeof(struct hn_tx_ring *txr)
2819 txr->hn_has_txeof = 0;
2824 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2825 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2827 struct hn_txdesc *txd = sndc->hn_cbarg;
2828 struct hn_tx_ring *txr;
2831 KASSERT(txr->hn_chan == chan,
2832 ("channel mismatch, on chan%u, should be chan%u",
2833 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2835 txr->hn_has_txeof = 1;
2836 hn_txdesc_put(txr, txd);
2838 ++txr->hn_txdone_cnt;
2839 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2840 txr->hn_txdone_cnt = 0;
2841 if (txr->hn_oactive)
2847 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2849 #if defined(INET) || defined(INET6)
2850 struct lro_ctrl *lro = &rxr->hn_lro;
2851 struct lro_entry *queued;
2853 while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
2854 SLIST_REMOVE_HEAD(&lro->lro_active, next);
2855 tcp_lro_flush(lro, queued);
2861 * 'txr' could be NULL if multiple channels and the
2862 * ifnet.if_start method are enabled.
2864 if (txr == NULL || !txr->hn_has_txeof)
2867 txr->hn_txdone_cnt = 0;
2871 static __inline uint32_t
2872 hn_rndis_pktmsg_offset(uint32_t ofs)
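	/*
	 * On-wire RNDIS packet message offsets are relative to the
	 * rm_dataoffset field, not to the beginning of the message;
	 * convert a message-start relative offset accordingly.
	 */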
2875 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2876 ("invalid RNDIS packet msg offset %u", ofs));
2877 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2880 static __inline void *
2881 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2882 size_t pi_dlen, uint32_t pi_type)
2884 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2885 struct rndis_pktinfo *pi;
2887 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2888 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2891 * Per-packet-info does not move; it only grows.
2894 * rm_pktinfooffset in this phase counts from the beginning
2895 * of rndis_packet_msg.
2897 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2898 ("%u pktinfo overflows RNDIS packet msg", pi_type));
2899 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2900 pkt->rm_pktinfolen);
2901 pkt->rm_pktinfolen += pi_size;
2903 pi->rm_size = pi_size;
2904 pi->rm_type = pi_type;
2905 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2907 return (pi->rm_data);
2911 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2913 struct hn_txdesc *txd;
2917 txd = txr->hn_agg_txd;
2918 KASSERT(txd != NULL, ("no aggregate txdesc"));
2921 * Since hn_txpkt() will reset this temporary stat, save
2922 * it now so that oerrors can be updated properly if
2923 * hn_txpkt() ever fails.
2925 pkts = txr->hn_stat_pkts;
2928 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2929 * failure, save it for later freeing, if hn_txpkt() ever fails.
2933 error = hn_txpkt(ifp, txr, txd);
2934 if (__predict_false(error)) {
2935 /* txd is freed, but m is not. */
2938 txr->hn_flush_failed++;
2939 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2942 /* Reset all aggregation states. */
2943 txr->hn_agg_txd = NULL;
2944 txr->hn_agg_szleft = 0;
2945 txr->hn_agg_pktleft = 0;
2946 txr->hn_agg_prevpkt = NULL;
2952 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2957 if (txr->hn_agg_txd != NULL) {
2958 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2959 struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2960 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2964 * Update the previous RNDIS packet's total length;
2965 * it can be increased due to the mandatory alignment
2966 * padding for this RNDIS packet.  Also update the
2967 * aggregating txdesc's chimney sending buffer size accordingly.
2971 * Zero out the padding, as required by the RNDIS spec.
2974 pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2975 agg_txd->chim_size += pkt->rm_len - olen;
2977 /* Link this txdesc to the parent. */
2978 hn_txdesc_agg(agg_txd, txd);
2980 chim = (uint8_t *)pkt + pkt->rm_len;
2981 /* Save the current packet for later fixup. */
2982 txr->hn_agg_prevpkt = chim;
2984 txr->hn_agg_pktleft--;
2985 txr->hn_agg_szleft -= pktsize;
2986 if (txr->hn_agg_szleft <=
2987 HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2989 * Probably can't aggregate more packets;
2990 * flush this aggregating txdesc proactively.
2992 txr->hn_agg_pktleft = 0;
2997 hn_flush_txagg(ifp, txr);
2999 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3001 txr->hn_tx_chimney_tried++;
3002 txd->chim_index = hn_chim_alloc(txr->hn_sc);
3003 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3005 txr->hn_tx_chimney++;
3007 chim = txr->hn_sc->hn_chim +
3008 (txd->chim_index * txr->hn_sc->hn_chim_szmax);
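	/*
	 * If aggregation is enabled and the chimney buffer has room
	 * for at least one more minimally-sized packet, make this
	 * txdesc the aggregation head; follow-up packets will be
	 * appended into its chimney sending buffer.
	 */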
3010 if (txr->hn_agg_pktmax > 1 &&
3011 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3012 txr->hn_agg_txd = txd;
3013 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3014 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3015 txr->hn_agg_prevpkt = chim;
3022 * If this function fails, then both txd and m_head0 will be freed.
3025 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3026 struct mbuf **m_head0)
3028 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3029 int error, nsegs, i;
3030 struct mbuf *m_head = *m_head0;
3031 struct rndis_packet_msg *pkt;
3034 int pkt_hlen, pkt_size;
3036 pkt = txd->rndis_pkt;
3037 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3038 if (pkt_size < txr->hn_chim_size) {
3039 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3043 if (txr->hn_agg_txd != NULL)
3044 hn_flush_txagg(ifp, txr);
3047 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3048 pkt->rm_len = m_head->m_pkthdr.len;
3049 pkt->rm_dataoffset = 0;
3050 pkt->rm_datalen = m_head->m_pkthdr.len;
3051 pkt->rm_oobdataoffset = 0;
3052 pkt->rm_oobdatalen = 0;
3053 pkt->rm_oobdataelements = 0;
3054 pkt->rm_pktinfooffset = sizeof(*pkt);
3055 pkt->rm_pktinfolen = 0;
3056 pkt->rm_vchandle = 0;
3057 pkt->rm_reserved = 0;
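	/*
	 * NOTE: rm_dataoffset and rm_pktinfooffset are message-start
	 * relative at this point; they are converted to the on-wire
	 * form by hn_rndis_pktmsg_offset() after all pktinfo have
	 * been appended.
	 */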
3059 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3061 * Set the hash value for this packet, so that the host could
3062 * dispatch the TX done event for this packet back to this TX ring's channel.
3065 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3066 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3067 *pi_data = txr->hn_tx_idx;
3070 if (m_head->m_flags & M_VLANTAG) {
3071 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3072 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3073 *pi_data = NDIS_VLAN_INFO_MAKE(
3074 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3075 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3076 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3079 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3080 #if defined(INET6) || defined(INET)
3081 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3082 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3084 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3085 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3086 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3087 m_head->m_pkthdr.tso_segsz);
3090 #if defined(INET6) && defined(INET)
3095 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3096 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3097 m_head->m_pkthdr.tso_segsz);
3100 #endif /* INET6 || INET */
3101 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3102 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3103 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3104 if (m_head->m_pkthdr.csum_flags &
3105 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3106 *pi_data = NDIS_TXCSUM_INFO_IPV6;
3108 *pi_data = NDIS_TXCSUM_INFO_IPV4;
3109 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3110 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
3113 if (m_head->m_pkthdr.csum_flags &
3114 (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3115 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3116 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3117 } else if (m_head->m_pkthdr.csum_flags &
3118 (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3119 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3120 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3124 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3125 /* Fix up the RNDIS packet message total length. */
3126 pkt->rm_len += pkt_hlen;
3127 /* Convert RNDIS packet message offsets */
3128 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3129 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3132 * Fast path: Chimney sending.
3135 struct hn_txdesc *tgt_txd = txd;
3137 if (txr->hn_agg_txd != NULL) {
3138 tgt_txd = txr->hn_agg_txd;
3144 KASSERT(pkt == chim,
3145 ("RNDIS pkt not in chimney sending buffer"));
3146 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3147 ("chimney sending buffer is not used"));
3148 tgt_txd->chim_size += pkt->rm_len;
3150 m_copydata(m_head, 0, m_head->m_pkthdr.len,
3151 ((uint8_t *)chim) + pkt_hlen);
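		/*
		 * Chimney sending: the whole frame has been copied into
		 * the pre-established host buffer, so no scatter-gather
		 * pages are needed for this packet.
		 */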
3153 txr->hn_gpa_cnt = 0;
3154 txr->hn_sendpkt = hn_txpkt_chim;
3158 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3159 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3160 ("chimney buffer is used"));
3161 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3163 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3164 if (__predict_false(error)) {
3168 * This mbuf is not linked w/ the txd yet, so free it now.
3173 freed = hn_txdesc_put(txr, txd);
3175 ("fail to free txd upon txdma error"));
3177 txr->hn_txdma_failed++;
3178 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3183 /* +1 RNDIS packet message */
3184 txr->hn_gpa_cnt = nsegs + 1;
3186 /* send packet with page buffer */
3187 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3188 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3189 txr->hn_gpa[0].gpa_len = pkt_hlen;
3192 * Fill the page buffers with mbuf info after the page
3193 * buffer for RNDIS packet message.
3195 for (i = 0; i < nsegs; ++i) {
3196 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3198 gpa->gpa_page = atop(segs[i].ds_addr);
3199 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3200 gpa->gpa_len = segs[i].ds_len;
3203 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3205 txr->hn_sendpkt = hn_txpkt_sglist;
3209 /* Set the completion routine */
3210 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3212 /* Update temporary stats for later use. */
3213 txr->hn_stat_pkts++;
3214 txr->hn_stat_size += m_head->m_pkthdr.len;
3215 if (m_head->m_flags & M_MCAST)
3216 txr->hn_stat_mcasts++;
3223 * If this function fails, then txd will be freed, but the mbuf
3224 * associated w/ the txd will _not_ be freed.
3227 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3229 int error, send_failed = 0, has_bpf;
3232 has_bpf = bpf_peers_present(ifp->if_bpf);
3235 * Make sure that this txd and any aggregated txds are not
3236 * freed before ETHER_BPF_MTAP.
3238 hn_txdesc_hold(txd);
3240 error = txr->hn_sendpkt(txr, txd);
3243 const struct hn_txdesc *tmp_txd;
3245 ETHER_BPF_MTAP(ifp, txd->m);
3246 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3247 ETHER_BPF_MTAP(ifp, tmp_txd->m);
3250 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3251 #ifdef HN_IFSTART_SUPPORT
3252 if (!hn_use_if_start)
3255 if_inc_counter(ifp, IFCOUNTER_OBYTES,
3257 if (txr->hn_stat_mcasts != 0) {
3258 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3259 txr->hn_stat_mcasts);
3262 txr->hn_pkts += txr->hn_stat_pkts;
3266 hn_txdesc_put(txr, txd);
3268 if (__predict_false(error)) {
3272 * This should "really rarely" happen.
3274 * XXX Too many RX to be acked or too many sideband
3275 * commands to run? Ask netvsc_channel_rollup()
3276 * to kick start later.
3278 txr->hn_has_txeof = 1;
3280 txr->hn_send_failed++;
3283 * Try sending again after setting hn_has_txeof,
3284 * in case we missed the last
3285 * netvsc_channel_rollup().
3289 if_printf(ifp, "send failed\n");
3292 * Caller will perform further processing on the
3293 * associated mbuf, so don't free it in hn_txdesc_put();
3294 * only unload it from the DMA map in hn_txdesc_put(), if it was loaded.
3298 freed = hn_txdesc_put(txr, txd);
3300 ("fail to free txd upon send error"));
3302 txr->hn_send_failed++;
3305 /* Reset temporary stats after this send is done. */
3306 txr->hn_stat_size = 0;
3307 txr->hn_stat_pkts = 0;
3308 txr->hn_stat_mcasts = 0;
3314 * Append the specified data to the indicated mbuf chain.
3315 * Extend the mbuf chain if the new data does not fit in the existing space.
3318 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3319 * There should be an equivalent in the kernel mbuf code,
3320 * but there does not appear to be one yet.
3322 * Differs from m_append() in that additional mbufs are
3323 * allocated with cluster size MJUMPAGESIZE, and filled accordingly.
3326 * Return 1 if able to complete the job; otherwise 0.
3329 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3332 int remainder, space;
3334 for (m = m0; m->m_next != NULL; m = m->m_next)
3337 space = M_TRAILINGSPACE(m);
3340 * Copy into available space.
3342 if (space > remainder)
3344 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3349 while (remainder > 0) {
3351 * Allocate a new mbuf; could check space
3352 * and allocate a cluster instead.
3354 n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
3357 n->m_len = min(MJUMPAGESIZE, remainder);
3358 bcopy(cp, mtod(n, caddr_t), n->m_len);
3360 remainder -= n->m_len;
3364 if (m0->m_flags & M_PKTHDR)
3365 m0->m_pkthdr.len += len - remainder;
3367 return (remainder == 0);
3370 #if defined(INET) || defined(INET6)
3372 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
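	/*
	 * Queue the mbuf for deferred LRO processing when the LRO
	 * mbuf queue is enabled; otherwise run LRO synchronously.
	 */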
3374 #if __FreeBSD_version >= 1100095
3375 if (hn_lro_mbufq_depth) {
3376 tcp_lro_queue_mbuf(lc, m);
3380 return tcp_lro_rx(lc, m, 0);
3385 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3386 const struct hn_rxinfo *info)
3388 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3390 int size, do_lro = 0, do_csum = 1, is_vf = 0;
3391 int hash_type = M_HASHTYPE_NONE;
3392 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3395 if (rxr->hn_rxvf_ifp != NULL) {
3397 * Non-transparent mode VF; pretend this packet is from the raw VF.
3400 ifp = rxr->hn_rxvf_ifp;
3402 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3403 /* Transparent mode VF. */
3407 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3410 * See the NOTE of hn_rndis_init_fixat().  This
3411 * function can be reached immediately after the
3412 * RNDIS is initialized but before the ifnet is
3413 * set up on the hn_attach() path; drop the unexpected packets silently.
3419 if (__predict_false(dlen < ETHER_HDR_LEN)) {
3420 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3424 if (dlen <= MHLEN) {
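		/*
		 * The packet fits in the mbuf header's internal data
		 * area; copy it directly and skip cluster allocation.
		 */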
3425 m_new = m_gethdr(M_NOWAIT, MT_DATA);
3426 if (m_new == NULL) {
3427 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3430 memcpy(mtod(m_new, void *), data, dlen);
3431 m_new->m_pkthdr.len = m_new->m_len = dlen;
3432 rxr->hn_small_pkts++;
3435 * Get an mbuf with a cluster. For packets 2K or less,
3436 * get a standard 2K cluster. For anything larger, get a
3437 * 4K cluster. Any buffers larger than 4K can cause problems
3438 * if looped around to the Hyper-V TX channel, so avoid them.
3441 if (dlen > MCLBYTES) {
3443 size = MJUMPAGESIZE;
3446 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3447 if (m_new == NULL) {
3448 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3452 hv_m_append(m_new, dlen, data);
3454 m_new->m_pkthdr.rcvif = ifp;
3456 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3459 /* receive side checksum offload */
3460 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3461 /* IP csum offload */
3462 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3463 m_new->m_pkthdr.csum_flags |=
3464 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3468 /* TCP/UDP csum offload */
3469 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3470 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3471 m_new->m_pkthdr.csum_flags |=
3472 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3473 m_new->m_pkthdr.csum_data = 0xffff;
3474 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3482 * As of this writing (Oct 28th, 2016), the host side will turn
3483 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3484 * the do_lro setting here is actually _not_ accurate. We
3485 * depend on the RSS hash type check to reset do_lro.
3487 if ((info->csum_info &
3488 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3489 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3492 hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3493 if (l3proto == ETHERTYPE_IP) {
3494 if (l4proto == IPPROTO_TCP) {
3496 (rxr->hn_trust_hcsum &
3497 HN_TRUST_HCSUM_TCP)) {
3498 rxr->hn_csum_trusted++;
3499 m_new->m_pkthdr.csum_flags |=
3500 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3501 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3502 m_new->m_pkthdr.csum_data = 0xffff;
3505 } else if (l4proto == IPPROTO_UDP) {
3507 (rxr->hn_trust_hcsum &
3508 HN_TRUST_HCSUM_UDP)) {
3509 rxr->hn_csum_trusted++;
3510 m_new->m_pkthdr.csum_flags |=
3511 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3512 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3513 m_new->m_pkthdr.csum_data = 0xffff;
3515 } else if (l4proto != IPPROTO_DONE && do_csum &&
3516 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3517 rxr->hn_csum_trusted++;
3518 m_new->m_pkthdr.csum_flags |=
3519 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3524 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3525 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3526 NDIS_VLAN_INFO_ID(info->vlan_info),
3527 NDIS_VLAN_INFO_PRI(info->vlan_info),
3528 NDIS_VLAN_INFO_CFI(info->vlan_info));
3529 m_new->m_flags |= M_VLANTAG;
3533 * If VF is activated (transparent/non-transparent mode does not matter here), disable LRO:
3538 * hn(4) will only receive broadcast packets, multicast packets,
3539 * TCP SYN and SYN|ACK (in Azure), so LRO is useless for these packet types.
3542 * For non-transparent, we definitely _cannot_ enable LRO at
3543 * all, since the LRO flush will use hn(4) as the receiving
3544 * interface; i.e. hn_ifp->if_input(hn_ifp, m).
3550 * If VF is activated (transparent/non-transparent mode does not
3551 * matter here), do _not_ mess with unsupported hash types or functions.
3554 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3556 m_new->m_pkthdr.flowid = info->hash_value;
3558 hash_type = M_HASHTYPE_OPAQUE;
3559 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3560 NDIS_HASH_FUNCTION_TOEPLITZ) {
3561 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3566 * do_lro is reset if the hash types are not TCP
3567 * related.  See the comment in the above csum_flags setting.
3571 case NDIS_HASH_IPV4:
3572 hash_type = M_HASHTYPE_RSS_IPV4;
3576 case NDIS_HASH_TCP_IPV4:
3577 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3578 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3579 int def_htype = M_HASHTYPE_OPAQUE;
3582 def_htype = M_HASHTYPE_NONE;
3585 * UDP 4-tuple hash is delivered as the TCP 4-tuple hash.
3588 if (l3proto == ETHERTYPE_MAX) {
3589 hn_rxpkt_proto(m_new,
3590 &l3proto, &l4proto);
3592 if (l3proto == ETHERTYPE_IP) {
3593 if (l4proto == IPPROTO_UDP &&
3594 (rxr->hn_mbuf_hash &
3595 NDIS_HASH_UDP_IPV4_X)) {
3597 M_HASHTYPE_RSS_UDP_IPV4;
3599 } else if (l4proto !=
3601 hash_type = def_htype;
3605 hash_type = def_htype;
3611 case NDIS_HASH_IPV6:
3612 hash_type = M_HASHTYPE_RSS_IPV6;
3616 case NDIS_HASH_IPV6_EX:
3617 hash_type = M_HASHTYPE_RSS_IPV6_EX;
3621 case NDIS_HASH_TCP_IPV6:
3622 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3625 case NDIS_HASH_TCP_IPV6_EX:
3626 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3630 } else if (!is_vf) {
3631 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3633 M_HASHTYPE_SET(m_new, hash_type);
3635 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3636 if (hn_ifp != ifp) {
3637 const struct ether_header *eh;
3640 * Non-transparent mode VF is activated.
3644 * Allow tapping on hn(4).
3646 ETHER_BPF_MTAP(hn_ifp, m_new);
3649 * Update hn(4)'s stats.
3651 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3652 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3653 /* Checked at the beginning of this function. */
3654 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3655 eh = mtod(m_new, struct ether_header *);
3656 if (ETHER_IS_MULTICAST(eh->ether_dhost))
3657 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3661 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3662 #if defined(INET) || defined(INET6)
3663 struct lro_ctrl *lro = &rxr->hn_lro;
3666 rxr->hn_lro_tried++;
3667 if (hn_lro_rx(lro, m_new) == 0) {
3674 ifp->if_input(ifp, m_new);
3680 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3682 struct hn_softc *sc = ifp->if_softc;
3683 struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3684 struct ifnet *vf_ifp;
3685 int mask, error = 0;
3686 struct ifrsskey *ifrk;
3687 struct ifrsshash *ifrh;
3692 if (ifr->ifr_mtu > HN_MTU_MAX) {
3699 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3704 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3705 /* Can't change MTU */
3711 if (ifp->if_mtu == ifr->ifr_mtu) {
3716 if (hn_xpnt_vf_isready(sc)) {
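			/*
			 * Propagate the new MTU to the transparent VF
			 * first; if the VF rejects it, fail the ioctl
			 * without touching the synthetic parts.
			 */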
3717 vf_ifp = sc->hn_vf_ifp;
3719 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3720 sizeof(ifr_vf.ifr_name));
3721 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3725 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3726 vf_ifp->if_xname, ifr->ifr_mtu, error);
3732 * Suspend this interface before the synthetic parts are torn down.
3738 * Detach the synthetic parts, i.e. NVS and RNDIS.
3740 hn_synth_detach(sc);
3743 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3744 * with the new MTU setting.
3746 error = hn_synth_attach(sc, ifr->ifr_mtu);
3752 error = hn_rndis_get_mtu(sc, &mtu);
3755 else if (bootverbose)
3756 if_printf(ifp, "RNDIS mtu %u\n", mtu);
3759 * Commit the requested MTU, after the synthetic parts
3760 * have been successfully attached.
3762 if (mtu >= ifr->ifr_mtu) {
3765 if_printf(ifp, "fixup mtu %d -> %u\n",
3771 * Synthetic parts' reattach may change the chimney
3772 * sending size; update it.
3774 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3775 hn_set_chim_size(sc, sc->hn_chim_szmax);
3778 * Make sure that various parameters based on MTU are
3779 * still valid after the MTU change.
3781 hn_mtu_change_fixup(sc);
3784 * All done! Resume the interface now.
3788 if ((sc->hn_flags & HN_FLAG_RXVF) ||
3789 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3791 * Since we have reattached the NVS part,
3792 * change the datapath to the VF again, in case
3793 * it was lost when the NVS was detached.
3795 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3804 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3809 if (hn_xpnt_vf_isready(sc))
3810 hn_xpnt_vf_saveifflags(sc);
3812 if (ifp->if_flags & IFF_UP) {
3813 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3815 * Caller might hold a mutex, e.g.
3816 * bpf; use busy-wait for the RNDIS filter update.
3820 hn_rxfilter_config(sc);
3823 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3824 error = hn_xpnt_vf_iocsetflags(sc);
3829 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3832 sc->hn_if_flags = ifp->if_flags;
3840 if (hn_xpnt_vf_isready(sc)) {
3842 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3843 sizeof(ifr_vf.ifr_name));
3844 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3850 * Fix up requested capabilities w/ supported capabilities,
3851 * since the supported capabilities could have been changed.
3853 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3856 if (mask & IFCAP_TXCSUM) {
3857 ifp->if_capenable ^= IFCAP_TXCSUM;
3858 if (ifp->if_capenable & IFCAP_TXCSUM)
3859 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3861 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3863 if (mask & IFCAP_TXCSUM_IPV6) {
3864 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3865 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3866 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3868 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3871 /* TODO: flip RNDIS offload parameters for RXCSUM. */
3872 if (mask & IFCAP_RXCSUM)
3873 ifp->if_capenable ^= IFCAP_RXCSUM;
3875 /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
3876 if (mask & IFCAP_RXCSUM_IPV6)
3877 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3880 if (mask & IFCAP_LRO)
3881 ifp->if_capenable ^= IFCAP_LRO;
3883 if (mask & IFCAP_TSO4) {
3884 ifp->if_capenable ^= IFCAP_TSO4;
3885 if (ifp->if_capenable & IFCAP_TSO4)
3886 ifp->if_hwassist |= CSUM_IP_TSO;
3888 ifp->if_hwassist &= ~CSUM_IP_TSO;
3890 if (mask & IFCAP_TSO6) {
3891 ifp->if_capenable ^= IFCAP_TSO6;
3892 if (ifp->if_capenable & IFCAP_TSO6)
3893 ifp->if_hwassist |= CSUM_IP6_TSO;
3895 ifp->if_hwassist &= ~CSUM_IP6_TSO;
3905 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3909 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3911 * Multicast handling holds a mutex; use busy-wait for the RNDIS filter update.
3915 hn_rxfilter_config(sc);
3919 /* XXX vlan(4) style mcast addr maintenance */
3920 if (hn_xpnt_vf_isready(sc)) {
3923 old_if_flags = sc->hn_vf_ifp->if_flags;
3924 hn_xpnt_vf_saveifflags(sc);
3926 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3927 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3929 error = hn_xpnt_vf_iocsetflags(sc);
3938 if (hn_xpnt_vf_isready(sc)) {
3940 * SIOCGIFMEDIA expects ifmediareq, so don't
3941 * create and pass ifr_vf to the VF here; just
3942 * replace the ifr_name.
3944 vf_ifp = sc->hn_vf_ifp;
3945 strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3946 sizeof(ifr->ifr_name));
3947 error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3948 /* Restore the ifr_name. */
3949 strlcpy(ifr->ifr_name, ifp->if_xname,
3950 sizeof(ifr->ifr_name));
3955 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3958 case SIOCGIFRSSHASH:
3959 ifrh = (struct ifrsshash *)data;
3961 if (sc->hn_rx_ring_inuse == 1) {
3963 ifrh->ifrh_func = RSS_FUNC_NONE;
3964 ifrh->ifrh_types = 0;
3968 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3969 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3971 ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3972 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3977 ifrk = (struct ifrsskey *)data;
3979 if (sc->hn_rx_ring_inuse == 1) {
3981 ifrk->ifrk_func = RSS_FUNC_NONE;
3982 ifrk->ifrk_keylen = 0;
3985 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3986 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3988 ifrk->ifrk_func = RSS_FUNC_PRIVATE;
3989 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
3990 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
3991 NDIS_HASH_KEYSIZE_TOEPLITZ);
3996 error = ether_ioctl(ifp, cmd, data);
4003 hn_stop(struct hn_softc *sc, bool detaching)
4005 struct ifnet *ifp = sc->hn_ifp;
4010 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4011 ("synthetic parts were not attached"));
4013 /* Clear RUNNING bit ASAP. */
4014 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4016 /* Disable polling. */
4019 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4020 KASSERT(sc->hn_vf_ifp != NULL,
4021 ("%s: VF is not attached", ifp->if_xname));
4023 /* Mark transparent mode VF as disabled. */
4024 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4028 * Datapath setting must happen _before_ bringing
4031 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4034 * Bring the VF down.
4036 hn_xpnt_vf_saveifflags(sc);
4037 sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4038 hn_xpnt_vf_iocsetflags(sc);
4041 /* Suspend data transfers. */
4042 hn_suspend_data(sc);
4044 /* Clear OACTIVE bit. */
4045 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4046 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4047 sc->hn_tx_ring[i].hn_oactive = 0;
4050 * If the non-transparent mode VF is active, make sure
4051 * that the RX filter still allows packet reception.
4053 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4054 hn_rxfilter_config(sc);
4058 hn_init_locked(struct hn_softc *sc)
4060 struct ifnet *ifp = sc->hn_ifp;
4065 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4068 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4071 /* Configure RX filter */
4072 hn_rxfilter_config(sc);
4074 /* Clear OACTIVE bit. */
4075 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4076 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4077 sc->hn_tx_ring[i].hn_oactive = 0;
4079 /* Clear TX 'suspended' bit. */
4080 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4082 if (hn_xpnt_vf_isready(sc)) {
4083 /* Initialize transparent VF. */
4084 hn_xpnt_vf_init(sc);
4087 /* Everything is ready; unleash! */
4088 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4090 /* Re-enable polling if requested. */
4091 if (sc->hn_pollhz > 0)
4092 hn_polling(sc, sc->hn_pollhz);
4098 struct hn_softc *sc = xsc;
4105 #if __FreeBSD_version >= 1100099
4108 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4110 struct hn_softc *sc = arg1;
4111 unsigned int lenlim;
4114 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4115 error = sysctl_handle_int(oidp, &lenlim, 0, req);
4116 if (error || req->newptr == NULL)
4120 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4121 lenlim > TCP_LRO_LENGTH_MAX) {
4125 hn_set_lro_lenlim(sc, lenlim);
4132 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4134 struct hn_softc *sc = arg1;
4135 int ackcnt, error, i;
4138 * lro_ackcnt_lim is the append count limit;
4139 * add 1 to turn it into the aggregation limit.
4141 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4142 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4143 if (error || req->newptr == NULL)
4146 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4150 * Convert the aggregation limit back to the append count limit.
4155 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4156 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4164 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4166 struct hn_softc *sc = arg1;
4171 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4174 error = sysctl_handle_int(oidp, &on, 0, req);
4175 if (error || req->newptr == NULL)
4179 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4180 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4183 rxr->hn_trust_hcsum |= hcsum;
4185 rxr->hn_trust_hcsum &= ~hcsum;
4192 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4194 struct hn_softc *sc = arg1;
4195 int chim_size, error;
4197 chim_size = sc->hn_tx_ring[0].hn_chim_size;
4198 error = sysctl_handle_int(oidp, &chim_size, 0, req);
4199 if (error || req->newptr == NULL)
4202 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4206 hn_set_chim_size(sc, chim_size);
4211 #if __FreeBSD_version < 1100095
4213 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4215 struct hn_softc *sc = arg1;
4216 int ofs = arg2, i, error;
4217 struct hn_rx_ring *rxr;
4221 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
4222 rxr = &sc->hn_rx_ring[i];
4223 stat += *((int *)((uint8_t *)rxr + ofs));
4226 error = sysctl_handle_64(oidp, &stat, 0, req);
4227 if (error || req->newptr == NULL)
4230 /* Zero out this stat. */
4231 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
4232 rxr = &sc->hn_rx_ring[i];
4233 *((int *)((uint8_t *)rxr + ofs)) = 0;
4239 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4241 struct hn_softc *sc = arg1;
4242 int ofs = arg2, i, error;
4243 struct hn_rx_ring *rxr;
4247 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4248 rxr = &sc->hn_rx_ring[i];
4249 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4252 error = sysctl_handle_64(oidp, &stat, 0, req);
4253 if (error || req->newptr == NULL)
4256 /* Zero out this stat. */
4257 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4258 rxr = &sc->hn_rx_ring[i];
4259 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4267 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4269 struct hn_softc *sc = arg1;
4270 int ofs = arg2, i, error;
4271 struct hn_rx_ring *rxr;
4275 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4276 rxr = &sc->hn_rx_ring[i];
4277 stat += *((u_long *)((uint8_t *)rxr + ofs));
4280 error = sysctl_handle_long(oidp, &stat, 0, req);
4281 if (error || req->newptr == NULL)
4284 /* Zero out this stat. */
4285 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4286 rxr = &sc->hn_rx_ring[i];
4287 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
4293 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4295 struct hn_softc *sc = arg1;
4296 int ofs = arg2, i, error;
4297 struct hn_tx_ring *txr;
4301 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4302 txr = &sc->hn_tx_ring[i];
4303 stat += *((u_long *)((uint8_t *)txr + ofs));
4306 error = sysctl_handle_long(oidp, &stat, 0, req);
4307 if (error || req->newptr == NULL)
4310 /* Zero out this stat. */
4311 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4312 txr = &sc->hn_tx_ring[i];
4313 *((u_long *)((uint8_t *)txr + ofs)) = 0;
4319 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4321 struct hn_softc *sc = arg1;
4322 int ofs = arg2, i, error, conf;
4323 struct hn_tx_ring *txr;
4325 txr = &sc->hn_tx_ring[0];
4326 conf = *((int *)((uint8_t *)txr + ofs));
4328 error = sysctl_handle_int(oidp, &conf, 0, req);
4329 if (error || req->newptr == NULL)
4333 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4334 txr = &sc->hn_tx_ring[i];
4335 *((int *)((uint8_t *)txr + ofs)) = conf;
4343 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4345 struct hn_softc *sc = arg1;
4348 size = sc->hn_agg_size;
4349 error = sysctl_handle_int(oidp, &size, 0, req);
4350 if (error || req->newptr == NULL)
4354 sc->hn_agg_size = size;
4362 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4364 struct hn_softc *sc = arg1;
4367 pkts = sc->hn_agg_pkts;
4368 error = sysctl_handle_int(oidp, &pkts, 0, req);
4369 if (error || req->newptr == NULL)
4373 sc->hn_agg_pkts = pkts;
4381 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4383 struct hn_softc *sc = arg1;
4386 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4387 return (sysctl_handle_int(oidp, &pkts, 0, req));
4391 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4393 struct hn_softc *sc = arg1;
4396 align = sc->hn_tx_ring[0].hn_agg_align;
4397 return (sysctl_handle_int(oidp, &align, 0, req));
4401 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4404 vmbus_chan_poll_disable(chan);
4406 vmbus_chan_poll_enable(chan, pollhz);
4410 hn_polling(struct hn_softc *sc, u_int pollhz)
4412 int nsubch = sc->hn_rx_ring_inuse - 1;
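/*
 * Apply the polling rate to every channel in use: the subchannels
 * first, then the primary channel.  A pollhz of 0 switches the
 * channels back to interrupt mode.
 */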
4417 struct vmbus_channel **subch;
4420 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4421 for (i = 0; i < nsubch; ++i)
4422 hn_chan_polling(subch[i], pollhz);
4423 vmbus_subchan_rel(subch, nsubch);
4425 hn_chan_polling(sc->hn_prichan, pollhz);
4429 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4431 struct hn_softc *sc = arg1;
4434 pollhz = sc->hn_pollhz;
4435 error = sysctl_handle_int(oidp, &pollhz, 0, req);
4436 if (error || req->newptr == NULL)
4440 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4444 if (sc->hn_pollhz != pollhz) {
4445 sc->hn_pollhz = pollhz;
4446 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4447 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4448 hn_polling(sc, sc->hn_pollhz);
4456 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4458 struct hn_softc *sc = arg1;
4461 snprintf(verstr, sizeof(verstr), "%u.%u",
4462 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4463 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4464 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4468 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4470 struct hn_softc *sc = arg1;
4477 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4478 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4482 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4484 struct hn_softc *sc = arg1;
4485 char assist_str[128];
4489 hwassist = sc->hn_ifp->if_hwassist;
4491 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4492 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4496 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4498 struct hn_softc *sc = arg1;
4499 char filter_str[128];
4503 filter = sc->hn_rx_filter;
4505 snprintf(filter_str, sizeof(filter_str), "%b", filter,
4507 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4511 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4513 struct hn_softc *sc = arg1;
4518 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4519 if (error || req->newptr == NULL)
4522 if ((sc->hn_flags & HN_FLAG_RXVF) ||
4523 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4525 * RSS key is synchronized w/ the VF's; don't allow users to change it.
4532 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4535 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4537 if (sc->hn_rx_ring_inuse > 1) {
4538 error = hn_rss_reconfig(sc);
4540 /* Not RSS capable, at least for now; just save the RSS key. */
4549 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4551 struct hn_softc *sc = arg1;
4556 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4557 if (error || req->newptr == NULL)
4561 * Don't allow the RSS indirect table to be changed, if this
4562 * interface is not currently RSS capable.
4564 if (sc->hn_rx_ring_inuse == 1) {
4569 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4572 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4574 hn_rss_ind_fixup(sc);
4575 error = hn_rss_reconfig(sc);
4582 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4584 struct hn_softc *sc = arg1;
4589 hash = sc->hn_rss_hash;
4591 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4592 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4596 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4598 struct hn_softc *sc = arg1;
4603 hash = sc->hn_rss_hcap;
4605 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4606 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4610 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4612 struct hn_softc *sc = arg1;
4617 hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4619 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4620 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4624 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4626 struct hn_softc *sc = arg1;
4627 char vf_name[IFNAMSIZ + 1];
4628 struct ifnet *vf_ifp;
4632 vf_ifp = sc->hn_vf_ifp;
4634 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4636 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4640 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4642 struct hn_softc *sc = arg1;
4643 char vf_name[IFNAMSIZ + 1];
4644 struct ifnet *vf_ifp;
4648 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4650 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4652 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4656 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4658 struct rm_priotracker pt;
4663 error = sysctl_wire_old_buffer(req, 0);
4667 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4671 rm_rlock(&hn_vfmap_lock, &pt);
4674 for (i = 0; i < hn_vfmap_size; ++i) {
4677 if (hn_vfmap[i] == NULL)
4680 ifp = ifnet_byindex(i);
4683 sbuf_printf(sb, "%s", ifp->if_xname);
4685 sbuf_printf(sb, " %s", ifp->if_xname);
4690 rm_runlock(&hn_vfmap_lock, &pt);
4692 error = sbuf_finish(sb);
4698 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4700 struct rm_priotracker pt;
4705 error = sysctl_wire_old_buffer(req, 0);
4709 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4713 rm_rlock(&hn_vfmap_lock, &pt);
4716 for (i = 0; i < hn_vfmap_size; ++i) {
4717 struct ifnet *ifp, *hn_ifp;
4719 hn_ifp = hn_vfmap[i];
4723 ifp = ifnet_byindex(i);
4726 sbuf_printf(sb, "%s:%s", ifp->if_xname,
4729 sbuf_printf(sb, " %s:%s", ifp->if_xname,
4736 rm_runlock(&hn_vfmap_lock, &pt);
4738 error = sbuf_finish(sb);
4744 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4746 struct hn_softc *sc = arg1;
4747 int error, onoff = 0;
4749 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4751 error = sysctl_handle_int(oidp, &onoff, 0, req);
4752 if (error || req->newptr == NULL)
4756 /* NOTE: hn_vf_lock for hn_transmit() */
4757 rm_wlock(&sc->hn_vf_lock);
4759 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4761 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4762 rm_wunlock(&sc->hn_vf_lock);
4769 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4771 struct hn_softc *sc = arg1;
4774 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4776 return (sysctl_handle_int(oidp, &enabled, 0, req));
4780 hn_check_iplen(const struct mbuf *m, int hoff)
4782 const struct ip *ip;
4783 int len, iphlen, iplen;
4784 const struct tcphdr *th;
4785 int thoff; /* TCP data offset */
4787 len = hoff + sizeof(struct ip);
4789 /* The packet must be at least the size of an IP header. */
4790 if (m->m_pkthdr.len < len)
4791 return IPPROTO_DONE;
4793 /* The fixed IP header must reside completely in the first mbuf. */
4795 return IPPROTO_DONE;
4797 ip = mtodo(m, hoff);
4799 /* Bound check the packet's stated IP header length. */
4800 iphlen = ip->ip_hl << 2;
4801 if (iphlen < sizeof(struct ip)) /* minimum header length */
4802 return IPPROTO_DONE;
4804 /* The full IP header must reside completely in one mbuf. */
4805 if (m->m_len < hoff + iphlen)
4806 return IPPROTO_DONE;
4808 iplen = ntohs(ip->ip_len);
4811 * Check that the amount of data in the buffers is
4812 * at least as much as the IP header would have us expect.
4814 if (m->m_pkthdr.len < hoff + iplen)
4815 return IPPROTO_DONE;
4818 * Ignore IP fragments.
4820 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4821 return IPPROTO_DONE;
4824 * The TCP/IP or UDP/IP header must be entirely contained within
4825 * the first fragment of a packet.
4829 if (iplen < iphlen + sizeof(struct tcphdr))
4830 return IPPROTO_DONE;
4831 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4832 return IPPROTO_DONE;
4833 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4834 thoff = th->th_off << 2;
4835 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4836 return IPPROTO_DONE;
4837 if (m->m_len < hoff + iphlen + thoff)
4838 return IPPROTO_DONE;
4841 if (iplen < iphlen + sizeof(struct udphdr))
4842 return IPPROTO_DONE;
4843 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4844 return IPPROTO_DONE;
4848 return IPPROTO_DONE;
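/*
 * Classify a received frame: parse the (possibly VLAN tagged)
 * Ethernet header to find the L3 protocol, then, for IPv4, let
 * hn_check_iplen() validate the headers and extract the L4 protocol.
 */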
4855 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4857 const struct ether_header *eh;
4862 /* Checked at the beginning of this function. */
4863 KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4865 eh = mtod(m_new, const struct ether_header *);
4866 etype = ntohs(eh->ether_type);
4867 if (etype == ETHERTYPE_VLAN) {
4868 const struct ether_vlan_header *evl;
4870 hoff = sizeof(*evl);
4871 if (m_new->m_len < hoff)
4873 evl = mtod(m_new, const struct ether_vlan_header *);
4874 etype = ntohs(evl->evl_proto);
4878 if (etype == ETHERTYPE_IP)
4879 *l4proto = hn_check_iplen(m_new, hoff);
4881 *l4proto = IPPROTO_DONE;
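/*
 * Allocate the device's RX resources: the RXBUF shared by all
 * channels, the RX ring array, the per-ring bufrings and packet
 * buffers, optional LRO state, and the dev.hn.UNIT.rx sysctl tree
 * along with the per-ring and aggregated RX statistics nodes.
 */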
4885 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4887 struct sysctl_oid_list *child;
4888 struct sysctl_ctx_list *ctx;
4889 device_t dev = sc->hn_dev;
4890 #if defined(INET) || defined(INET6)
4891 #if __FreeBSD_version >= 1100095
4898 * Create RXBUF for reception.
4901 * - It is shared by all channels.
4902 	 * - A large enough buffer is allocated; certain versions of NVS
4903 	 *   may further limit the usable space.
4905 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4906 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4907 BUS_DMA_WAITOK | BUS_DMA_ZERO);
4908 if (sc->hn_rxbuf == NULL) {
4909 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4913 sc->hn_rx_ring_cnt = ring_cnt;
4914 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4916 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4917 M_DEVBUF, M_WAITOK | M_ZERO);
4919 #if defined(INET) || defined(INET6)
4920 #if __FreeBSD_version >= 1100095
4921 lroent_cnt = hn_lro_entry_count;
4922 if (lroent_cnt < TCP_LRO_ENTRIES)
4923 lroent_cnt = TCP_LRO_ENTRIES;
4925 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4927 #endif /* INET || INET6 */
4929 ctx = device_get_sysctl_ctx(dev);
4930 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4932 /* Create dev.hn.UNIT.rx sysctl tree */
4933 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4934 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4936 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4937 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4939 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4940 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4941 &rxr->hn_br_dma, BUS_DMA_WAITOK);
4942 if (rxr->hn_br == NULL) {
4943 device_printf(dev, "allocate bufring failed\n");
4947 if (hn_trust_hosttcp)
4948 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4949 if (hn_trust_hostudp)
4950 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4951 if (hn_trust_hostip)
4952 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4953 rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4954 rxr->hn_ifp = sc->hn_ifp;
4955 if (i < sc->hn_tx_ring_cnt)
4956 rxr->hn_txr = &sc->hn_tx_ring[i];
4957 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4958 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4960 rxr->hn_rxbuf = sc->hn_rxbuf;
4965 #if defined(INET) || defined(INET6)
4966 #if __FreeBSD_version >= 1100095
4967 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4968 hn_lro_mbufq_depth);
4970 tcp_lro_init(&rxr->hn_lro);
4971 rxr->hn_lro.ifp = sc->hn_ifp;
4973 #if __FreeBSD_version >= 1100099
4974 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4975 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4977 #endif /* INET || INET6 */
4979 if (sc->hn_rx_sysctl_tree != NULL) {
4983 * Create per RX ring sysctl tree:
4984 * dev.hn.UNIT.rx.RINGID
4986 snprintf(name, sizeof(name), "%d", i);
4987 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
4988 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
4989 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4991 if (rxr->hn_rx_sysctl_tree != NULL) {
4992 SYSCTL_ADD_ULONG(ctx,
4993 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4994 OID_AUTO, "packets", CTLFLAG_RW,
4995 &rxr->hn_pkts, "# of packets received");
4996 SYSCTL_ADD_ULONG(ctx,
4997 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4998 OID_AUTO, "rss_pkts", CTLFLAG_RW,
5000 "# of packets w/ RSS info received");
5002 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5003 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5004 &rxr->hn_pktbuf_len, 0,
5005 "Temporary channel packet buffer length");
5010 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5011 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5012 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5013 #if __FreeBSD_version < 1100095
5014 hn_rx_stat_int_sysctl,
5016 hn_rx_stat_u64_sysctl,
5018 "LU", "LRO queued");
5019 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5020 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5021 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5022 #if __FreeBSD_version < 1100095
5023 hn_rx_stat_int_sysctl,
5025 hn_rx_stat_u64_sysctl,
5027 "LU", "LRO flushed");
5028 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5029 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5030 __offsetof(struct hn_rx_ring, hn_lro_tried),
5031 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5032 #if __FreeBSD_version >= 1100099
5033 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5034 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5035 hn_lro_lenlim_sysctl, "IU",
5036 "Max # of data bytes to be aggregated by LRO");
5037 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5038 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5039 hn_lro_ackcnt_sysctl, "I",
5040 "Max # of ACKs to be aggregated by LRO");
5042 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5043 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5044 hn_trust_hcsum_sysctl, "I",
5045 	    "Trust TCP segment verification on host side, "
5046 "when csum info is missing");
5047 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5048 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5049 hn_trust_hcsum_sysctl, "I",
5050 	    "Trust UDP datagram verification on host side, "
5051 "when csum info is missing");
5052 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5053 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5054 hn_trust_hcsum_sysctl, "I",
5055 	    "Trust IP packet verification on host side, "
5056 "when csum info is missing");
5057 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5058 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5059 __offsetof(struct hn_rx_ring, hn_csum_ip),
5060 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5061 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5062 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5063 __offsetof(struct hn_rx_ring, hn_csum_tcp),
5064 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5065 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5066 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5067 __offsetof(struct hn_rx_ring, hn_csum_udp),
5068 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5069 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5070 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5071 __offsetof(struct hn_rx_ring, hn_csum_trusted),
5072 hn_rx_stat_ulong_sysctl, "LU",
5073 "# of packets that we trust host's csum verification");
5074 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5075 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5076 __offsetof(struct hn_rx_ring, hn_small_pkts),
5077 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5078 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5079 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5080 __offsetof(struct hn_rx_ring, hn_ack_failed),
5081 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5082 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5083 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5084 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5085 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
5091 hn_destroy_rx_data(struct hn_softc *sc)
5095 if (sc->hn_rxbuf != NULL) {
5096 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5097 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5099 device_printf(sc->hn_dev, "RXBUF is referenced\n");
5100 sc->hn_rxbuf = NULL;
5103 if (sc->hn_rx_ring_cnt == 0)
5106 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5107 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5109 if (rxr->hn_br == NULL)
5111 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5112 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5114 device_printf(sc->hn_dev,
5115 			    "%dth channel bufring is referenced\n", i);
5119 #if defined(INET) || defined(INET6)
5120 tcp_lro_free(&rxr->hn_lro);
5122 free(rxr->hn_pktbuf, M_DEVBUF);
5124 free(sc->hn_rx_ring, M_DEVBUF);
5125 sc->hn_rx_ring = NULL;
5127 sc->hn_rx_ring_cnt = 0;
5128 sc->hn_rx_ring_inuse = 0;
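/*
 * Set up one TX ring: its locks, the txdesc array and free list (or
 * bufring), the taskqueue it is bound to, DMA tags for RNDIS packet
 * messages and for packet data, a pre-allocated and pre-loaded RNDIS
 * message per txdesc, and the per-ring sysctl tree.
 */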
5132 hn_tx_ring_create(struct hn_softc *sc, int id)
5134 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5135 device_t dev = sc->hn_dev;
5136 bus_dma_tag_t parent_dtag;
5140 txr->hn_tx_idx = id;
5142 #ifndef HN_USE_TXDESC_BUFRING
5143 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5145 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5147 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5148 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5149 M_DEVBUF, M_WAITOK | M_ZERO);
5150 #ifndef HN_USE_TXDESC_BUFRING
5151 SLIST_INIT(&txr->hn_txlist);
5153 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5154 M_WAITOK, &txr->hn_tx_lock);
5157 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5158 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5159 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5161 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5164 #ifdef HN_IFSTART_SUPPORT
5165 if (hn_use_if_start) {
5166 txr->hn_txeof = hn_start_txeof;
5167 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5168 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5174 txr->hn_txeof = hn_xmit_txeof;
5175 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5176 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5178 br_depth = hn_get_txswq_depth(txr);
5179 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5180 M_WAITOK, &txr->hn_tx_lock);
5183 txr->hn_direct_tx_size = hn_direct_tx_size;
5186 * Always schedule transmission instead of trying to do direct
5187 * transmission. This one gives the best performance so far.
5189 txr->hn_sched_tx = 1;
5191 parent_dtag = bus_get_dma_tag(dev);
5193 /* DMA tag for RNDIS packet messages. */
5194 error = bus_dma_tag_create(parent_dtag, /* parent */
5195 HN_RNDIS_PKT_ALIGN, /* alignment */
5196 HN_RNDIS_PKT_BOUNDARY, /* boundary */
5197 BUS_SPACE_MAXADDR, /* lowaddr */
5198 BUS_SPACE_MAXADDR, /* highaddr */
5199 NULL, NULL, /* filter, filterarg */
5200 HN_RNDIS_PKT_LEN, /* maxsize */
5202 HN_RNDIS_PKT_LEN, /* maxsegsize */
5204 NULL, /* lockfunc */
5205 NULL, /* lockfuncarg */
5206 &txr->hn_tx_rndis_dtag);
5208 device_printf(dev, "failed to create rndis dmatag\n");
5212 /* DMA tag for data. */
5213 error = bus_dma_tag_create(parent_dtag, /* parent */
5215 HN_TX_DATA_BOUNDARY, /* boundary */
5216 BUS_SPACE_MAXADDR, /* lowaddr */
5217 BUS_SPACE_MAXADDR, /* highaddr */
5218 NULL, NULL, /* filter, filterarg */
5219 HN_TX_DATA_MAXSIZE, /* maxsize */
5220 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
5221 HN_TX_DATA_SEGSIZE, /* maxsegsize */
5223 NULL, /* lockfunc */
5224 NULL, /* lockfuncarg */
5225 &txr->hn_tx_data_dtag);
5227 device_printf(dev, "failed to create data dmatag\n");
5231 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5232 struct hn_txdesc *txd = &txr->hn_txdesc[i];
5235 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5236 STAILQ_INIT(&txd->agg_list);
5239 * Allocate and load RNDIS packet message.
5241 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5242 (void **)&txd->rndis_pkt,
5243 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5244 &txd->rndis_pkt_dmap);
5247 "failed to allocate rndis_packet_msg, %d\n", i);
5251 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5252 txd->rndis_pkt_dmap,
5253 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5254 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5258 "failed to load rndis_packet_msg, %d\n", i);
5259 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5260 txd->rndis_pkt, txd->rndis_pkt_dmap);
5264 /* DMA map for TX data. */
5265 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5269 "failed to allocate tx data dmamap\n");
5270 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5271 txd->rndis_pkt_dmap);
5272 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5273 txd->rndis_pkt, txd->rndis_pkt_dmap);
5277 /* All set, put it to list */
5278 txd->flags |= HN_TXD_FLAG_ONLIST;
5279 #ifndef HN_USE_TXDESC_BUFRING
5280 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5282 buf_ring_enqueue(txr->hn_txdesc_br, txd);
5285 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5287 if (sc->hn_tx_sysctl_tree != NULL) {
5288 struct sysctl_oid_list *child;
5289 struct sysctl_ctx_list *ctx;
5293 * Create per TX ring sysctl tree:
5294 * dev.hn.UNIT.tx.RINGID
5296 ctx = device_get_sysctl_ctx(dev);
5297 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5299 snprintf(name, sizeof(name), "%d", id);
5300 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5301 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5303 if (txr->hn_tx_sysctl_tree != NULL) {
5304 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5307 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5308 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5309 "# of available TX descs");
5311 #ifdef HN_IFSTART_SUPPORT
5312 if (!hn_use_if_start)
5315 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5316 CTLFLAG_RD, &txr->hn_oactive, 0,
5319 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5320 CTLFLAG_RW, &txr->hn_pkts,
5321 "# of packets transmitted");
5322 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5323 CTLFLAG_RW, &txr->hn_sends, "# of sends");
5331 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5333 struct hn_tx_ring *txr = txd->txr;
5335 KASSERT(txd->m == NULL, ("still has mbuf installed"));
5336 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5338 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5339 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5340 txd->rndis_pkt_dmap);
5341 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5345 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5348 KASSERT(txd->refs == 0 || txd->refs == 1,
5349 ("invalid txd refs %d", txd->refs));
5351 /* Aggregated txds will be freed by their aggregating txd. */
5352 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5355 freed = hn_txdesc_put(txr, txd);
5356 KASSERT(freed, ("can't free txdesc"));
5361 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5365 if (txr->hn_txdesc == NULL)
5370 * Because the freeing of aggregated txds will be deferred
5371 * to the aggregating txd, two passes are used here:
5372 * - The first pass GCes any pending txds. This GC is necessary,
5373 	 *   since if the channels are revoked, the hypervisor will not
5374 	 *   deliver send-done for all pending txds.
5375 	 * - The second pass frees the busdma resources, i.e. after all txds
5378 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5379 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5380 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5381 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5383 if (txr->hn_tx_data_dtag != NULL)
5384 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5385 if (txr->hn_tx_rndis_dtag != NULL)
5386 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5388 #ifdef HN_USE_TXDESC_BUFRING
5389 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5392 free(txr->hn_txdesc, M_DEVBUF);
5393 txr->hn_txdesc = NULL;
5395 if (txr->hn_mbuf_br != NULL)
5396 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5398 #ifndef HN_USE_TXDESC_BUFRING
5399 mtx_destroy(&txr->hn_txlist_spin);
5401 mtx_destroy(&txr->hn_tx_lock);
5405 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5407 struct sysctl_oid_list *child;
5408 struct sysctl_ctx_list *ctx;
5412 * Create TXBUF for chimney sending.
5414 * NOTE: It is shared by all channels.
5416 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5417 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5418 BUS_DMA_WAITOK | BUS_DMA_ZERO);
5419 if (sc->hn_chim == NULL) {
5420 device_printf(sc->hn_dev, "allocate txbuf failed\n");
5424 sc->hn_tx_ring_cnt = ring_cnt;
5425 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5427 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5428 M_DEVBUF, M_WAITOK | M_ZERO);
5430 ctx = device_get_sysctl_ctx(sc->hn_dev);
5431 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5433 /* Create dev.hn.UNIT.tx sysctl tree */
5434 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5435 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5437 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5440 error = hn_tx_ring_create(sc, i);
5445 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5446 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5447 __offsetof(struct hn_tx_ring, hn_no_txdescs),
5448 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5449 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5450 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5451 __offsetof(struct hn_tx_ring, hn_send_failed),
5452 	    hn_tx_stat_ulong_sysctl, "LU", "# of Hyper-V send failures");
5453 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5454 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5455 __offsetof(struct hn_tx_ring, hn_txdma_failed),
5456 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
5457 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5458 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5459 __offsetof(struct hn_tx_ring, hn_flush_failed),
5460 hn_tx_stat_ulong_sysctl, "LU",
5461 	    "# of packet transmission aggregation flush failures");
5462 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5463 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5464 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5465 	    hn_tx_stat_ulong_sysctl, "LU", "# of TX mbufs collapsed");
5466 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5467 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5468 __offsetof(struct hn_tx_ring, hn_tx_chimney),
5469 	    hn_tx_stat_ulong_sysctl, "LU", "# of chimney sends");
5470 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5471 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5472 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5473 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5474 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5475 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5476 "# of total TX descs");
5477 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5478 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5479 "Chimney send packet size upper boundary");
5480 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5481 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5482 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5483 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5484 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5485 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5486 hn_tx_conf_int_sysctl, "I",
5487 "Size of the packet for direct transmission");
5488 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5489 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5490 __offsetof(struct hn_tx_ring, hn_sched_tx),
5491 hn_tx_conf_int_sysctl, "I",
5492 "Always schedule transmission "
5493 "instead of doing direct transmission");
5494 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5495 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5496 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5497 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5498 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5499 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5500 "Applied packet transmission aggregation size");
5501 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5502 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5503 hn_txagg_pktmax_sysctl, "I",
5504 "Applied packet transmission aggregation packets");
5505 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5506 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5507 hn_txagg_align_sysctl, "I",
5508 "Applied packet transmission aggregation alignment");
5514 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5518 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5519 sc->hn_tx_ring[i].hn_chim_size = chim_size;
5523 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5525 struct ifnet *ifp = sc->hn_ifp;
5531 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5534 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5535 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5536 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5538 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5539 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5540 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5542 if (tso_maxlen < tso_minlen)
5543 tso_maxlen = tso_minlen;
5544 else if (tso_maxlen > IP_MAXPACKET)
5545 tso_maxlen = IP_MAXPACKET;
5546 if (tso_maxlen > sc->hn_ndis_tso_szmax)
5547 tso_maxlen = sc->hn_ndis_tso_szmax;
5548 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5550 if (hn_xpnt_vf_isready(sc)) {
5551 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5552 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5554 ifp->if_hw_tsomax = hw_tsomax;
5556 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5560 hn_fixup_tx_data(struct hn_softc *sc)
5562 uint64_t csum_assist;
5565 hn_set_chim_size(sc, sc->hn_chim_szmax);
5566 if (hn_tx_chimney_size > 0 &&
5567 hn_tx_chimney_size < sc->hn_chim_szmax)
5568 hn_set_chim_size(sc, hn_tx_chimney_size);
5571 if (sc->hn_caps & HN_CAP_IPCS)
5572 csum_assist |= CSUM_IP;
5573 if (sc->hn_caps & HN_CAP_TCP4CS)
5574 csum_assist |= CSUM_IP_TCP;
5575 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5576 csum_assist |= CSUM_IP_UDP;
5577 if (sc->hn_caps & HN_CAP_TCP6CS)
5578 csum_assist |= CSUM_IP6_TCP;
5579 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5580 csum_assist |= CSUM_IP6_UDP;
5581 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5582 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5584 if (sc->hn_caps & HN_CAP_HASHVAL) {
5586 * Support HASHVAL pktinfo on TX path.
5589 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5590 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5591 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5596 hn_fixup_rx_data(struct hn_softc *sc)
5599 if (sc->hn_caps & HN_CAP_UDPHASH) {
5602 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5603 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5608 hn_destroy_tx_data(struct hn_softc *sc)
5612 if (sc->hn_chim != NULL) {
5613 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5614 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5616 device_printf(sc->hn_dev,
5617 "chimney sending buffer is referenced");
5622 if (sc->hn_tx_ring_cnt == 0)
5625 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5626 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5628 free(sc->hn_tx_ring, M_DEVBUF);
5629 sc->hn_tx_ring = NULL;
5631 sc->hn_tx_ring_cnt = 0;
5632 sc->hn_tx_ring_inuse = 0;
5635 #ifdef HN_IFSTART_SUPPORT
5638 hn_start_taskfunc(void *xtxr, int pending __unused)
5640 struct hn_tx_ring *txr = xtxr;
5642 mtx_lock(&txr->hn_tx_lock);
5643 hn_start_locked(txr, 0);
5644 mtx_unlock(&txr->hn_tx_lock);
5648 hn_start_locked(struct hn_tx_ring *txr, int len)
5650 struct hn_softc *sc = txr->hn_sc;
5651 struct ifnet *ifp = sc->hn_ifp;
5654 KASSERT(hn_use_if_start,
5655 	    ("hn_start_locked is called when if_start is disabled"));
5656 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5657 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5658 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5660 if (__predict_false(txr->hn_suspended))
5663 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5667 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5668 struct hn_txdesc *txd;
5669 struct mbuf *m_head;
5672 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5676 if (len > 0 && m_head->m_pkthdr.len > len) {
5678 * This sending could be time consuming; let callers
5679 			 * dispatch this packet sending (and sending of any
5680 			 * follow-up packets) to the TX taskqueue.
5682 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5687 #if defined(INET6) || defined(INET)
5688 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5689 m_head = hn_tso_fixup(m_head);
5690 if (__predict_false(m_head == NULL)) {
5691 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5694 } else if (m_head->m_pkthdr.csum_flags &
5695 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5696 m_head = hn_set_hlen(m_head);
5697 if (__predict_false(m_head == NULL)) {
5698 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5704 txd = hn_txdesc_get(txr);
5706 txr->hn_no_txdescs++;
5707 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5708 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5712 error = hn_encap(ifp, txr, txd, &m_head);
5714 /* Both txd and m_head are freed */
5715 KASSERT(txr->hn_agg_txd == NULL,
5716 ("encap failed w/ pending aggregating txdesc"));
5720 if (txr->hn_agg_pktleft == 0) {
5721 if (txr->hn_agg_txd != NULL) {
5722 KASSERT(m_head == NULL,
5723 ("pending mbuf for aggregating txdesc"));
5724 error = hn_flush_txagg(ifp, txr);
5725 if (__predict_false(error)) {
5726 atomic_set_int(&ifp->if_drv_flags,
5731 KASSERT(m_head != NULL, ("mbuf was freed"));
5732 error = hn_txpkt(ifp, txr, txd);
5733 if (__predict_false(error)) {
5734 /* txd is freed, but m_head is not */
5735 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5736 atomic_set_int(&ifp->if_drv_flags,
5744 KASSERT(txr->hn_agg_txd != NULL,
5745 ("no aggregating txdesc"));
5746 KASSERT(m_head == NULL,
5747 ("pending mbuf for aggregating txdesc"));
5752 	/* Flush pending aggregated transmission. */
5753 if (txr->hn_agg_txd != NULL)
5754 hn_flush_txagg(ifp, txr);
5759 hn_start(struct ifnet *ifp)
5761 struct hn_softc *sc = ifp->if_softc;
5762 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5764 if (txr->hn_sched_tx)
5767 if (mtx_trylock(&txr->hn_tx_lock)) {
5770 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5771 mtx_unlock(&txr->hn_tx_lock);
5776 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5780 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5782 struct hn_tx_ring *txr = xtxr;
5784 mtx_lock(&txr->hn_tx_lock);
5785 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5786 hn_start_locked(txr, 0);
5787 mtx_unlock(&txr->hn_tx_lock);
5791 hn_start_txeof(struct hn_tx_ring *txr)
5793 struct hn_softc *sc = txr->hn_sc;
5794 struct ifnet *ifp = sc->hn_ifp;
5796 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5798 if (txr->hn_sched_tx)
5801 if (mtx_trylock(&txr->hn_tx_lock)) {
5804 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5805 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5806 mtx_unlock(&txr->hn_tx_lock);
5808 taskqueue_enqueue(txr->hn_tx_taskq,
5814 		 * Release the OACTIVE earlier, in the hope that
5815 		 * others can catch up. The task will clear the
5816 * flag again with the hn_tx_lock to avoid possible
5819 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5820 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5824 #endif /* HN_IFSTART_SUPPORT */
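/*
 * Multiqueue (if_transmit) TX path: drain this ring's mbuf bufring,
 * encapsulating and transmitting each packet.  A positive 'len'
 * bounds what is sent inline; a larger packet is put back and a
 * non-zero return value asks the caller to reschedule the rest onto
 * the TX taskqueue.
 */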
5827 hn_xmit(struct hn_tx_ring *txr, int len)
5829 struct hn_softc *sc = txr->hn_sc;
5830 struct ifnet *ifp = sc->hn_ifp;
5831 struct mbuf *m_head;
5834 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5835 #ifdef HN_IFSTART_SUPPORT
5836 KASSERT(hn_use_if_start == 0,
5837 	    ("hn_xmit is called when if_start is enabled"));
5839 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5841 if (__predict_false(txr->hn_suspended))
5844 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5847 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5848 struct hn_txdesc *txd;
5851 if (len > 0 && m_head->m_pkthdr.len > len) {
5853 * This sending could be time consuming; let callers
5854 			 * dispatch this packet sending (and sending of any
5855 			 * follow-up packets) to the TX taskqueue.
5857 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5862 txd = hn_txdesc_get(txr);
5864 txr->hn_no_txdescs++;
5865 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5866 txr->hn_oactive = 1;
5870 error = hn_encap(ifp, txr, txd, &m_head);
5872 /* Both txd and m_head are freed; discard */
5873 KASSERT(txr->hn_agg_txd == NULL,
5874 ("encap failed w/ pending aggregating txdesc"));
5875 drbr_advance(ifp, txr->hn_mbuf_br);
5879 if (txr->hn_agg_pktleft == 0) {
5880 if (txr->hn_agg_txd != NULL) {
5881 KASSERT(m_head == NULL,
5882 ("pending mbuf for aggregating txdesc"));
5883 error = hn_flush_txagg(ifp, txr);
5884 if (__predict_false(error)) {
5885 txr->hn_oactive = 1;
5889 KASSERT(m_head != NULL, ("mbuf was freed"));
5890 error = hn_txpkt(ifp, txr, txd);
5891 if (__predict_false(error)) {
5892 /* txd is freed, but m_head is not */
5893 drbr_putback(ifp, txr->hn_mbuf_br,
5895 txr->hn_oactive = 1;
5902 KASSERT(txr->hn_agg_txd != NULL,
5903 ("no aggregating txdesc"));
5904 KASSERT(m_head == NULL,
5905 ("pending mbuf for aggregating txdesc"));
5910 drbr_advance(ifp, txr->hn_mbuf_br);
5913 	/* Flush pending aggregated transmission. */
5914 if (txr->hn_agg_txd != NULL)
5915 hn_flush_txagg(ifp, txr);
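/*
 * if_transmit method.  When the transparent VF is active, the packet
 * is handed to the VF's if_transmit (optionally tapping BPF on the
 * synthetic interface first).  Otherwise a TX ring is selected from
 * the mbuf's flowid and the packet is enqueued to that ring's
 * bufring, kicking hn_xmit() directly or through the TX taskqueue.
 */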
5920 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5922 struct hn_softc *sc = ifp->if_softc;
5923 struct hn_tx_ring *txr;
5926 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5927 struct rm_priotracker pt;
5929 rm_rlock(&sc->hn_vf_lock, &pt);
5930 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5931 struct mbuf *m_bpf = NULL;
5934 obytes = m->m_pkthdr.len;
5935 if (m->m_flags & M_MCAST)
5938 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5939 if (bpf_peers_present(ifp->if_bpf)) {
5940 m_bpf = m_copypacket(m, M_NOWAIT);
5941 if (m_bpf == NULL) {
5943 * Failed to grab a shallow
5946 ETHER_BPF_MTAP(ifp, m);
5950 ETHER_BPF_MTAP(ifp, m);
5953 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5954 rm_runlock(&sc->hn_vf_lock, &pt);
5956 if (m_bpf != NULL) {
5958 ETHER_BPF_MTAP(ifp, m_bpf);
5962 if (error == ENOBUFS) {
5963 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5965 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5967 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5968 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5970 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5976 rm_runlock(&sc->hn_vf_lock, &pt);
5979 #if defined(INET6) || defined(INET)
5981 * Perform TSO packet header fixup or get l2/l3 header length now,
5982 * since packet headers should be cache-hot.
5984 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5985 m = hn_tso_fixup(m);
5986 if (__predict_false(m == NULL)) {
5987 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5990 } else if (m->m_pkthdr.csum_flags &
5991 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5993 if (__predict_false(m == NULL)) {
5994 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6001 * Select the TX ring based on flowid
6003 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6004 #if defined(INET6) || defined(INET)
6007 if (m->m_pkthdr.len < 128 &&
6008 (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6009 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6010 m = hn_check_tcpsyn(m, &tcpsyn);
6011 if (__predict_false(m == NULL)) {
6013 IFCOUNTER_OERRORS, 1);
6018 const int tcpsyn = 0;
6023 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6025 txr = &sc->hn_tx_ring[idx];
6027 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6029 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6033 if (txr->hn_oactive)
6036 if (txr->hn_sched_tx)
6039 if (mtx_trylock(&txr->hn_tx_lock)) {
6042 sched = hn_xmit(txr, txr->hn_direct_tx_size);
6043 mtx_unlock(&txr->hn_tx_lock);
6048 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6053 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6057 mtx_lock(&txr->hn_tx_lock);
6058 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6060 mtx_unlock(&txr->hn_tx_lock);
6064 hn_xmit_qflush(struct ifnet *ifp)
6066 struct hn_softc *sc = ifp->if_softc;
6067 struct rm_priotracker pt;
6070 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6071 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6074 rm_rlock(&sc->hn_vf_lock, &pt);
6075 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6076 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6077 rm_runlock(&sc->hn_vf_lock, &pt);
6081 hn_xmit_txeof(struct hn_tx_ring *txr)
6084 if (txr->hn_sched_tx)
6087 if (mtx_trylock(&txr->hn_tx_lock)) {
6090 txr->hn_oactive = 0;
6091 sched = hn_xmit(txr, txr->hn_direct_tx_size);
6092 mtx_unlock(&txr->hn_tx_lock);
6094 taskqueue_enqueue(txr->hn_tx_taskq,
6100 		 * Release the oactive earlier, in the hope that
6101 		 * others can catch up. The task will clear the
6102 * oactive again with the hn_tx_lock to avoid possible
6105 txr->hn_oactive = 0;
6106 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6111 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6113 struct hn_tx_ring *txr = xtxr;
6115 mtx_lock(&txr->hn_tx_lock);
6117 mtx_unlock(&txr->hn_tx_lock);
6121 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6123 struct hn_tx_ring *txr = xtxr;
6125 mtx_lock(&txr->hn_tx_lock);
6126 txr->hn_oactive = 0;
6128 mtx_unlock(&txr->hn_tx_lock);
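/*
 * Attach a VMBus channel: link it to its RX ring (and TX ring, if
 * one exists for this sub-channel index), bind it to a CPU, and open
 * it on the ring's bufring.  EISCONN from the open means the bufring
 * is still connected on the host side, so it is flagged as referenced
 * and will not be freed on detach.
 */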
6132 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6134 struct vmbus_chan_br cbr;
6135 struct hn_rx_ring *rxr;
6136 struct hn_tx_ring *txr = NULL;
6139 idx = vmbus_chan_subidx(chan);
6142 * Link this channel to RX/TX ring.
6144 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6145 	    ("invalid channel index %d, should be >= 0 && < %d",
6146 idx, sc->hn_rx_ring_inuse));
6147 rxr = &sc->hn_rx_ring[idx];
6148 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6149 ("RX ring %d already attached", idx));
6150 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6151 rxr->hn_chan = chan;
6154 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6155 idx, vmbus_chan_id(chan));
6158 if (idx < sc->hn_tx_ring_inuse) {
6159 txr = &sc->hn_tx_ring[idx];
6160 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6161 ("TX ring %d already attached", idx));
6162 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6164 txr->hn_chan = chan;
6166 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6167 idx, vmbus_chan_id(chan));
6171 /* Bind this channel to a proper CPU. */
6172 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6177 cbr.cbr = rxr->hn_br;
6178 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6179 cbr.cbr_txsz = HN_TXBR_SIZE;
6180 cbr.cbr_rxsz = HN_RXBR_SIZE;
6181 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6183 if (error == EISCONN) {
6184 if_printf(sc->hn_ifp, "bufring is connected after "
6185 "chan%u open failure\n", vmbus_chan_id(chan));
6186 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6188 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6189 vmbus_chan_id(chan), error);
6196 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6198 struct hn_rx_ring *rxr;
6201 idx = vmbus_chan_subidx(chan);
6204 * Link this channel to RX/TX ring.
6206 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6207 	    ("invalid channel index %d, should be >= 0 && < %d",
6208 idx, sc->hn_rx_ring_inuse));
6209 rxr = &sc->hn_rx_ring[idx];
6210 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6211 ("RX ring %d is not attached", idx));
6212 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6214 if (idx < sc->hn_tx_ring_inuse) {
6215 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6217 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6218 		    ("TX ring %d is not attached", idx));
6219 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6223 * Close this channel.
6226 * Channel closing does _not_ destroy the target channel.
6228 error = vmbus_chan_close_direct(chan);
6229 if (error == EISCONN) {
6230 if_printf(sc->hn_ifp, "chan%u bufring is connected "
6231 "after being closed\n", vmbus_chan_id(chan));
6232 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6234 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6235 vmbus_chan_id(chan), error);
6240 hn_attach_subchans(struct hn_softc *sc)
6242 struct vmbus_channel **subchans;
6243 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6246 KASSERT(subchan_cnt > 0, ("no sub-channels"));
6248 /* Attach the sub-channels. */
6249 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6250 for (i = 0; i < subchan_cnt; ++i) {
6253 error1 = hn_chan_attach(sc, subchans[i]);
6256 /* Move on; all channels will be detached later. */
6259 vmbus_subchan_rel(subchans, subchan_cnt);
6262 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6265 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6273 hn_detach_allchans(struct hn_softc *sc)
6275 struct vmbus_channel **subchans;
6276 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6279 if (subchan_cnt == 0)
6282 /* Detach the sub-channels. */
6283 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6284 for (i = 0; i < subchan_cnt; ++i)
6285 hn_chan_detach(sc, subchans[i]);
6286 vmbus_subchan_rel(subchans, subchan_cnt);
6290 * Detach the primary channel, _after_ all sub-channels
6293 hn_chan_detach(sc, sc->hn_prichan);
6295 /* Wait for sub-channels to be destroyed, if any. */
6296 vmbus_subchan_drain(sc->hn_prichan);
6299 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6300 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6301 HN_RX_FLAG_ATTACHED) == 0,
6302 ("%dth RX ring is still attached", i));
6304 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6305 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6306 HN_TX_FLAG_ATTACHED) == 0,
6307 ("%dth TX ring is still attached", i));
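/*
 * Ask NVS for sub-channels so that multiple RX/TX rings can be used.
 * The request is capped by the RSS capabilities that RNDIS reports;
 * on return *nsubch holds the number of sub-channels actually
 * granted, where 0 means only the primary channel is usable.
 */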
6313 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6315 struct vmbus_channel **subchans;
6316 int nchan, rxr_cnt, error;
6318 nchan = *nsubch + 1;
6321 * Multiple RX/TX rings are not requested.
6328 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6331 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6333 /* No RSS; this is benign. */
6338 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6342 if (nchan > rxr_cnt)
6345 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6351 * Allocate sub-channels from NVS.
6353 *nsubch = nchan - 1;
6354 error = hn_nvs_alloc_subchans(sc, nsubch);
6355 if (error || *nsubch == 0) {
6356 /* Failed to allocate sub-channels. */
6362 * Wait for all sub-channels to become ready before moving on.
6364 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6365 vmbus_subchan_rel(subchans, *nsubch);
6370 hn_synth_attachable(const struct hn_softc *sc)
6374 if (sc->hn_flags & HN_FLAG_ERRORS)
6377 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6378 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6380 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6387 * Make sure that the RX filter is zero after the successful
6388 * RNDIS initialization.
6391 * Under certain conditions on certain versions of Hyper-V,
6392 * the RNDIS rxfilter is _not_ zero on the hypervisor side
6393 * after the successful RNDIS initialization, which breaks
6394 * the assumption of any following code (well, it breaks the
6395 * RNDIS API contract actually). Clear the RNDIS rxfilter
6396 * explicitly, drain packets sneaking through, and drain the
6397 * interrupt taskqueues scheduled due to the stealth packets.
6400 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6404 hn_drain_rxtx(sc, nchan);
6408 hn_synth_attach(struct hn_softc *sc, int mtu)
6410 #define ATTACHED_NVS 0x0002
6411 #define ATTACHED_RNDIS 0x0004
6413 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6414 int error, nsubch, nchan = 1, i, rndis_inited;
6415 uint32_t old_caps, attached = 0;
6417 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6418 ("synthetic parts were attached"));
6420 if (!hn_synth_attachable(sc))
6423 /* Save capabilities for later verification. */
6424 old_caps = sc->hn_caps;
6427 /* Clear RSS stuffs. */
6428 sc->hn_rss_ind_size = 0;
6429 sc->hn_rss_hash = 0;
6430 sc->hn_rss_hcap = 0;
6433 * Attach the primary channel _before_ attaching NVS and RNDIS.
6435 error = hn_chan_attach(sc, sc->hn_prichan);
6442 error = hn_nvs_attach(sc, mtu);
6445 attached |= ATTACHED_NVS;
6448 * Attach RNDIS _after_ NVS is attached.
6450 error = hn_rndis_attach(sc, mtu, &rndis_inited);
6452 attached |= ATTACHED_RNDIS;
6457 * Make sure capabilities are not changed.
6459 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6460 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6461 old_caps, sc->hn_caps);
6467 * Allocate sub-channels for multi-TX/RX rings.
6470 * The # of RX rings that can be used is equivalent to the # of
6471 * channels to be requested.
6473 nsubch = sc->hn_rx_ring_cnt - 1;
6474 error = hn_synth_alloc_subchans(sc, &nsubch);
6477 /* NOTE: _Full_ synthetic parts detach is required now. */
6478 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6481 * Set the # of TX/RX rings that could be used according to
6482 * the # of channels that NVS offered.
6485 hn_set_ring_inuse(sc, nchan);
6487 /* Only the primary channel can be used; done */
6492 * Attach the sub-channels.
6494 * NOTE: hn_set_ring_inuse() _must_ have been called.
6496 error = hn_attach_subchans(sc);
6501 * Configure RSS key and indirect table _after_ all sub-channels
6504 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6506 * RSS key is not set yet; set it to the default RSS key.
6509 			if_printf(sc->hn_ifp, "set up default RSS key\n");
6510 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6511 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6514 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6516 * RSS indirect table is not set yet; set it up in round-
6520 			if_printf(sc->hn_ifp, "set up default RSS indirect "
6523 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
6524 rss->rss_ind[i] = i % nchan;
6525 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6528 * # of usable channels may be changed, so we have to
6529 * make sure that all entries in RSS indirect table
6532 * NOTE: hn_set_ring_inuse() _must_ have been called.
6534 hn_rss_ind_fixup(sc);
6537 sc->hn_rss_hash = sc->hn_rss_hcap;
6538 if ((sc->hn_flags & HN_FLAG_RXVF) ||
6539 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6540 /* NOTE: Don't reconfigure RSS; will do immediately. */
6541 hn_vf_rss_fixup(sc, false);
6543 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6548 * Fixup transmission aggregation setup.
6551 hn_rndis_init_fixat(sc, nchan);
6555 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6556 hn_rndis_init_fixat(sc, nchan);
6557 hn_synth_detach(sc);
6559 if (attached & ATTACHED_RNDIS) {
6560 hn_rndis_init_fixat(sc, nchan);
6561 hn_rndis_detach(sc);
6563 if (attached & ATTACHED_NVS)
6565 hn_chan_detach(sc, sc->hn_prichan);
6566 /* Restore old capabilities. */
6567 sc->hn_caps = old_caps;
6571 #undef ATTACHED_RNDIS
6577  * The interface must have been suspended through hn_suspend(), before
6578  * this function gets called.
6581 hn_synth_detach(struct hn_softc *sc)
6584 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6585 ("synthetic parts were not attached"));
6587 /* Detach the RNDIS first. */
6588 hn_rndis_detach(sc);
6593 /* Detach all of the channels. */
6594 hn_detach_allchans(sc);
6596 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6600 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6602 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6603 ("invalid ring count %d", ring_cnt));
6605 if (sc->hn_tx_ring_cnt > ring_cnt)
6606 sc->hn_tx_ring_inuse = ring_cnt;
6608 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6609 sc->hn_rx_ring_inuse = ring_cnt;
6612 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6613 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6618 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6623 * The TX bufring will not be drained by the hypervisor,
6624 * if the primary channel is revoked.
6626 while (!vmbus_chan_rx_empty(chan) ||
6627 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6628 !vmbus_chan_tx_empty(chan)))
6630 vmbus_chan_intr_drain(chan);
6634 hn_disable_rx(struct hn_softc *sc)
6638 * Disable RX by clearing RX filter forcefully.
6640 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6641 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6644 * Give RNDIS enough time to flush all pending data packets.
6646 pause("waitrx", (200 * hz) / 1000);
6651 * RX/TX _must_ have been suspended/disabled, before this function
6655 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6657 struct vmbus_channel **subch = NULL;
6661 * Drain RX/TX bufrings and interrupts.
6665 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6667 if (subch != NULL) {
6670 for (i = 0; i < nsubch; ++i)
6671 hn_chan_drain(sc, subch[i]);
6673 hn_chan_drain(sc, sc->hn_prichan);
6676 vmbus_subchan_rel(subch, nsubch);
6680 hn_suspend_data(struct hn_softc *sc)
6682 struct hn_tx_ring *txr;
6690 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6691 txr = &sc->hn_tx_ring[i];
6693 mtx_lock(&txr->hn_tx_lock);
6694 txr->hn_suspended = 1;
6695 mtx_unlock(&txr->hn_tx_lock);
6696 /* No one is able send more packets now. */
6699 * Wait for all pending sends to finish.
6702 * We will _not_ receive all pending send-done, if the
6703 * primary channel is revoked.
6705 while (hn_tx_ring_pending(txr) &&
6706 !vmbus_chan_is_revoked(sc->hn_prichan))
6707 pause("hnwtx", 1 /* 1 tick */);
6718 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6721 * Drain any pending TX tasks.
6724 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6725 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6727 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6728 txr = &sc->hn_tx_ring[i];
6730 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6731 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6736 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6739 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6743 hn_suspend_mgmt(struct hn_softc *sc)
6750 	 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6751 * through hn_mgmt_taskq.
6753 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6754 vmbus_chan_run_task(sc->hn_prichan, &task);
6757 * Make sure that all pending management tasks are completed.
6759 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6760 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6761 taskqueue_drain_all(sc->hn_mgmt_taskq0);
6765 hn_suspend(struct hn_softc *sc)
6768 /* Disable polling. */
6772 * If the non-transparent mode VF is activated, the synthetic
6773 * device is receiving packets, so the data path of the
6774 * synthetic device must be suspended.
6776 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6777 (sc->hn_flags & HN_FLAG_RXVF))
6778 hn_suspend_data(sc);
6779 hn_suspend_mgmt(sc);
6783 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6787 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6788 ("invalid TX ring count %d", tx_ring_cnt));
6790 for (i = 0; i < tx_ring_cnt; ++i) {
6791 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6793 mtx_lock(&txr->hn_tx_lock);
6794 txr->hn_suspended = 0;
6795 mtx_unlock(&txr->hn_tx_lock);
6800 hn_resume_data(struct hn_softc *sc)
6809 hn_rxfilter_config(sc);
6812 * Make sure to clear suspend status on "all" TX rings,
6813 * since hn_tx_ring_inuse can be changed after
6814 * hn_suspend_data().
6816 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6818 #ifdef HN_IFSTART_SUPPORT
6819 if (!hn_use_if_start)
6823 * Flush unused drbrs, since hn_tx_ring_inuse may be
6826 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6827 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6833 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6834 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6837 * Use txeof task, so that any pending oactive can be
6840 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6845 hn_resume_mgmt(struct hn_softc *sc)
6848 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6851 * Kick off network change detection, if it was pending.
6852 * If no network change was pending, start link status
6853 * checks, which is more lightweight than network change
6856 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6857 hn_change_network(sc);
6859 hn_update_link_status(sc);
6863 hn_resume(struct hn_softc *sc)
6867 * If the non-transparent mode VF is activated, the synthetic
6868 	 * device has to receive packets, so the data path of the
6869 * synthetic device must be resumed.
6871 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6872 (sc->hn_flags & HN_FLAG_RXVF))
6876 * Don't resume link status change if VF is attached/activated.
6877 * - In the non-transparent VF mode, the synthetic device marks
6878 * link down until the VF is deactivated; i.e. VF is down.
6879 * - In transparent VF mode, VF's media status is used until
6880 * the VF is detached.
6882 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6883 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6887 * Re-enable polling if this interface is running and
6888 * the polling is requested.
6890 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6891 hn_polling(sc, sc->hn_pollhz);
6895 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6897 const struct rndis_status_msg *msg;
6900 if (dlen < sizeof(*msg)) {
6901 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6906 switch (msg->rm_status) {
6907 case RNDIS_STATUS_MEDIA_CONNECT:
6908 case RNDIS_STATUS_MEDIA_DISCONNECT:
6909 hn_update_link_status(sc);
6912 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6913 case RNDIS_STATUS_LINK_SPEED_CHANGE:
6914 /* Not really useful; ignore. */
6917 case RNDIS_STATUS_NETWORK_CHANGE:
6918 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6919 if (dlen < ofs + msg->rm_stbuflen ||
6920 msg->rm_stbuflen < sizeof(uint32_t)) {
6921 if_printf(sc->hn_ifp, "network changed\n");
6925 memcpy(&change, ((const uint8_t *)msg) + ofs,
6927 if_printf(sc->hn_ifp, "network changed, change %u\n",
6930 hn_change_network(sc);
6934 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6941 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6943 const struct rndis_pktinfo *pi = info_data;
6946 while (info_dlen != 0) {
6950 if (__predict_false(info_dlen < sizeof(*pi)))
6952 if (__predict_false(info_dlen < pi->rm_size))
6954 info_dlen -= pi->rm_size;
6956 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
6958 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
6960 dlen = pi->rm_size - pi->rm_pktinfooffset;
6963 switch (pi->rm_type) {
6964 case NDIS_PKTINFO_TYPE_VLAN:
6965 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
6967 info->vlan_info = *((const uint32_t *)data);
6968 mask |= HN_RXINFO_VLAN;
6971 case NDIS_PKTINFO_TYPE_CSUM:
6972 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
6974 info->csum_info = *((const uint32_t *)data);
6975 mask |= HN_RXINFO_CSUM;
6978 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
6979 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
6981 info->hash_value = *((const uint32_t *)data);
6982 mask |= HN_RXINFO_HASHVAL;
6985 case HN_NDIS_PKTINFO_TYPE_HASHINF:
6986 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
6988 info->hash_info = *((const uint32_t *)data);
6989 mask |= HN_RXINFO_HASHINF;
6996 if (mask == HN_RXINFO_ALL) {
6997 /* All found; done */
7001 pi = (const struct rndis_pktinfo *)
7002 ((const uint8_t *)pi + pi->rm_size);
7007 * - If there is no hash value, invalidate the hash info.
7009 if ((mask & HN_RXINFO_HASHVAL) == 0)
7010 info->hash_info = HN_NDIS_HASH_INFO_INVALID;
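/*
 * Return true if [off, off + len) overlaps
 * [check_off, check_off + check_len); used below to make sure the
 * RNDIS data, OOB data and pktinfo regions do not step on each other.
 */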
7014 static __inline bool
7015 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7018 if (off < check_off) {
7019 if (__predict_true(off + len <= check_off))
7021 } else if (off > check_off) {
7022 if (__predict_true(check_off + check_len <= off))
7029 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7031 const struct rndis_packet_msg *pkt;
7032 struct hn_rxinfo info;
7033 int data_off, pktinfo_off, data_len, pktinfo_len;
7038 if (__predict_false(dlen < sizeof(*pkt))) {
7039 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7044 if (__predict_false(dlen < pkt->rm_len)) {
7045 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7046 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7049 if (__predict_false(pkt->rm_len <
7050 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7051 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7052 "msglen %u, data %u, oob %u, pktinfo %u\n",
7053 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7054 pkt->rm_pktinfolen);
7057 if (__predict_false(pkt->rm_datalen == 0)) {
7058 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7065 #define IS_OFFSET_INVALID(ofs) \
7066 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
7067 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7069 /* XXX Hyper-V does not meet data offset alignment requirement */
7070 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7071 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7072 "data offset %u\n", pkt->rm_dataoffset);
7075 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7076 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7077 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7078 "oob offset %u\n", pkt->rm_oobdataoffset);
7081 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7082 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7083 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7084 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7088 #undef IS_OFFSET_INVALID
7090 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7091 data_len = pkt->rm_datalen;
7092 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7093 pktinfo_len = pkt->rm_pktinfolen;
7096 * Check OOB coverage.
7098 if (__predict_false(pkt->rm_oobdatalen != 0)) {
7099 int oob_off, oob_len;
7101 if_printf(rxr->hn_ifp, "got oobdata\n");
7102 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7103 oob_len = pkt->rm_oobdatalen;
7105 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7106 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7107 "oob overflow, msglen %u, oob abs %d len %d\n",
7108 pkt->rm_len, oob_off, oob_len);
7113 * Check against data.
7115 if (hn_rndis_check_overlap(oob_off, oob_len,
7116 data_off, data_len)) {
7117 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7118 "oob overlaps data, oob abs %d len %d, "
7119 "data abs %d len %d\n",
7120 oob_off, oob_len, data_off, data_len);
7125 * Check against pktinfo.
7127 if (pktinfo_len != 0 &&
7128 hn_rndis_check_overlap(oob_off, oob_len,
7129 pktinfo_off, pktinfo_len)) {
7130 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7131 "oob overlaps pktinfo, oob abs %d len %d, "
7132 "pktinfo abs %d len %d\n",
7133 oob_off, oob_len, pktinfo_off, pktinfo_len);
7139 * Check per-packet-info coverage and find useful per-packet-info.
7141 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7142 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7143 info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7144 if (__predict_true(pktinfo_len != 0)) {
7148 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7149 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7150 "pktinfo overflow, msglen %u, "
7151 "pktinfo abs %d len %d\n",
7152 pkt->rm_len, pktinfo_off, pktinfo_len);
7157 * Check packet info coverage.
7159 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7160 data_off, data_len);
7161 if (__predict_false(overlap)) {
7162 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7163 "pktinfo overlap data, pktinfo abs %d len %d, "
7164 "data abs %d len %d\n",
7165 pktinfo_off, pktinfo_len, data_off, data_len);
7170 * Find useful per-packet-info.
7172 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7173 pktinfo_len, &info);
7174 if (__predict_false(error)) {
7175 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7181 if (__predict_false(data_off + data_len > pkt->rm_len)) {
7182 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7183 "data overflow, msglen %u, data abs %d len %d\n",
7184 pkt->rm_len, data_off, data_len);
7187 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
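/*
 * Dispatch one received RNDIS message by type: data packets go to
 * hn_rndis_rx_data() (the hot path), status indications to
 * hn_rndis_rx_status(), and everything else to the control path.
 */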
static __inline void
hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_msghdr *hdr;

	if (__predict_false(dlen < sizeof(*hdr))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
		return;
	}
	hdr = data;

	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
		/* Hot data path. */
		hn_rndis_rx_data(rxr, data, dlen);
		/* Done! */
		return;
	}

	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
	else
		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
}
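
/*
 * Handle an inband NVS notification from the host; TXTBL_NOTE
 * notifications carry nothing useful and are dropped, anything
 * else is merely logged.
 */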
static void
hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
{
	const struct hn_nvs_hdr *hdr;

	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
		if_printf(sc->hn_ifp, "invalid nvs notify\n");
		return;
	}
	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);

	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
		/* Useless; ignore */
		return;
	}
	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
}
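
/*
 * Handle a send completion: the channel packet's transaction id
 * carries the hn_nvs_sendctx that the TX path handed to
 * vmbus_chan_send(), so run its completion callback here.
 */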
static void
hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt)
{
	struct hn_nvs_sendctx *sndc;

	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
	    VMBUS_CHANPKT_DATALEN(pkt));
	/*
	 * NOTE:
	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
	 * its callback.
	 */
}
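
/*
 * Handle an RXBUF channel packet: each entry of its range array
 * points into the shared RX buffer at one complete RNDIS message,
 * which is validated and passed down to hn_rndis_rxpkt().
 */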
static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

	/* Make sure that this is a RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}

	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}

	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}
		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
	}

	/*
	 * Ack the consumed RXBUF associated w/ this channel packet,
	 * so that this RXBUF can be recycled by the hypervisor.
	 */
	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}
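
/*
 * Send a RNDIS ack completion back on the channel, so that the host
 * can recycle the RXBUF region identified by 'tid'.
 */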
static void
hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    uint64_t tid)
{
	struct hn_nvs_rndis_ack ack;
	int retries, error;

	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
	ack.nvs_status = HN_NVS_STATUS_OK;

	retries = 0;
again:
	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
	if (__predict_false(error == EAGAIN)) {
		/*
		 * NOTE:
		 * This should _not_ happen in real world, since the
		 * consumption of the TX bufring from the TX path is
		 * controlled.
		 */
		if (rxr->hn_ack_failed == 0)
			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
		rxr->hn_ack_failed++;
		retries++;
		if (retries < 10) {
			DELAY(100);
			goto again;
		}
		/* RXBUF leaks! */
		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
	}
}
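
/*
 * Per-channel callback: drain all pending channel packets, growing
 * the per-ring packet buffer on demand, and dispatch each packet to
 * the matching NVS handler.
 */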
static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
	struct hn_rx_ring *rxr = xrxr;
	struct hn_softc *sc = rxr->hn_ifp->if_softc;

	for (;;) {
		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
		int error, pktlen;

		pktlen = rxr->hn_pktbuf_len;
		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
		if (__predict_false(error == ENOBUFS)) {
			void *nbuf;
			int nlen;

			/*
			 * Expand channel packet buffer.
			 *
			 * Use M_WAITOK here, since allocation failure
			 * is fatal.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);

			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;
		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;
		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;
		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}
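
/*
 * Driver-wide initialization, run at SI_SUB_DRIVERS: allocate the UDP
 * checksum fixup counter and the VF map, sanitize the tunables, and
 * set up the optional global TX taskqueues.
 */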
static void
hn_sysinit(void *arg __unused)
{
	int i;

	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);

#ifdef HN_IFSTART_SUPPORT
	/*
	 * Don't use ifnet.if_start if transparent VF mode is requested;
	 * mainly due to the IFF_DRV_OACTIVE flag.
	 */
	if (hn_xpnt_vf && hn_use_if_start) {
		hn_use_if_start = 0;
		printf("hn: transparent VF mode, if_transmit will be used, "
		    "instead of if_start\n");
	}
#endif

	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
		printf("hn: invalid transparent VF attach routing "
		    "wait timeout %d, reset to %d\n",
		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
	}

	/*
	 * Initialize VF map.
	 */
	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
	    M_WAITOK | M_ZERO);

	/*
	 * Fix the # of TX taskqueues.
	 */
	if (hn_tx_taskq_cnt <= 0)
		hn_tx_taskq_cnt = 1;
	else if (hn_tx_taskq_cnt > mp_ncpus)
		hn_tx_taskq_cnt = mp_ncpus;

	/*
	 * Fix the TX taskqueue mode.
	 */
	switch (hn_tx_taskq_mode) {
	case HN_TX_TASKQ_M_INDEP:
	case HN_TX_TASKQ_M_GLOBAL:
	case HN_TX_TASKQ_M_EVTTQ:
		break;
	default:
		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
		break;
	}

	if (vm_guest != VM_GUEST_HV)
		return;

	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
		return;

	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
	    M_DEVBUF, M_WAITOK);
	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
		    "hn tx%d", i);
	}
}
SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
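
/*
 * Driver-wide teardown, the mirror of hn_sysinit(): release the TX
 * taskqueues, the VF map and the UDP checksum fixup counter.
 */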
static void
hn_sysuninit(void *arg __unused)
{

	if (hn_tx_taskque != NULL) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(hn_tx_taskque[i]);
		free(hn_tx_taskque, M_DEVBUF);
	}

	if (hn_vfmap != NULL)
		free(hn_vfmap, M_DEVBUF);
	rm_destroy(&hn_vfmap_lock);

	counter_u64_free(hn_udpcs_fixup);
}
SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);