2 * Copyright (c) 2010-2012 Citrix Inc.
3 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
4 * Copyright (c) 2012 NetApp Inc.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice unmodified, this list of conditions, and the following
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * Copyright (c) 2004-2006 Kip Macy
31 * All rights reserved.
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
58 #include "opt_inet6.h"
62 #include <sys/param.h>
63 #include <sys/systm.h>
65 #include <sys/counter.h>
66 #include <sys/kernel.h>
67 #include <sys/limits.h>
68 #include <sys/malloc.h>
70 #include <sys/module.h>
72 #include <sys/queue.h>
74 #include <sys/rmlock.h>
77 #include <sys/socket.h>
78 #include <sys/sockio.h>
80 #include <sys/sysctl.h>
81 #include <sys/taskqueue.h>
82 #include <sys/buf_ring.h>
83 #include <sys/eventhandler.h>
85 #include <machine/atomic.h>
86 #include <machine/in_cksum.h>
89 #include <net/ethernet.h>
91 #include <net/if_arp.h>
92 #include <net/if_dl.h>
93 #include <net/if_media.h>
94 #include <net/if_types.h>
95 #include <net/if_var.h>
96 #include <net/if_vlan_var.h>
97 #include <net/rndis.h>
99 #include <netinet/in_systm.h>
100 #include <netinet/in.h>
101 #include <netinet/ip.h>
102 #include <netinet/ip6.h>
103 #include <netinet/tcp.h>
104 #include <netinet/tcp_lro.h>
105 #include <netinet/udp.h>
107 #include <dev/hyperv/include/hyperv.h>
108 #include <dev/hyperv/include/hyperv_busdma.h>
109 #include <dev/hyperv/include/vmbus.h>
110 #include <dev/hyperv/include/vmbus_xact.h>
112 #include <dev/hyperv/netvsc/ndis.h>
113 #include <dev/hyperv/netvsc/if_hnreg.h>
114 #include <dev/hyperv/netvsc/if_hnvar.h>
115 #include <dev/hyperv/netvsc/hn_nvs.h>
116 #include <dev/hyperv/netvsc/hn_rndis.h>
118 #include "vmbus_if.h"
120 #define HN_IFSTART_SUPPORT
122 /* NOTE: M_HASHTYPE_RSS_UDP_IPV4 is not available on stable/10. */
123 #ifndef M_HASHTYPE_RSS_UDP_IPV4
124 #define M_HASHTYPE_RSS_UDP_IPV4 M_HASHTYPE_OPAQUE
127 #define HN_RING_CNT_DEF_MAX 8
129 #define HN_VFMAP_SIZE_DEF 8
131 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */
133 /* YYY should get it from the underlying channel */
134 #define HN_TX_DESC_CNT 512
136 #define HN_RNDIS_PKT_LEN \
137 (sizeof(struct rndis_packet_msg) + \
138 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \
139 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \
140 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \
141 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
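/*
 * Added note: HN_RNDIS_PKT_LEN reserves worst-case space for one RNDIS
 * packet message: the fixed rndis_packet_msg header plus one
 * per-packet-info record each for the RSS hash value, the VLAN tag,
 * LSO2, and the TX checksum, i.e. every PKTINFO the TX path is expected
 * to attach.
 */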
142 #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE
143 #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE
145 #define HN_TX_DATA_BOUNDARY PAGE_SIZE
146 #define HN_TX_DATA_MAXSIZE IP_MAXPACKET
147 #define HN_TX_DATA_SEGSIZE PAGE_SIZE
148 /* -1 for RNDIS packet message */
149 #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1)
151 #define HN_DIRECT_TX_SIZE_DEF 128
153 #define HN_EARLY_TXEOF_THRESH 8
155 #define HN_PKTBUF_LEN_DEF (16 * 1024)
157 #define HN_LROENT_CNT_DEF 128
159 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU)
160 #define HN_LRO_LENLIM_DEF (25 * ETHERMTU)
161 /* YYY 2*MTU is a bit rough, but should be good enough. */
162 #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu)
164 #define HN_LRO_ACKCNT_DEF 1
166 #define HN_LOCK_INIT(sc) \
167 sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
168 #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock)
169 #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc) \
do { \
	while (sx_try_xlock(&(sc)->hn_lock) == 0) \
		DELAY(1000); \
} while (0)
175 #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock)
177 #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
178 #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP)
179 #define HN_CSUM_IP_HWASSIST(sc) \
180 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
181 #define HN_CSUM_IP6_HWASSIST(sc) \
182 ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
184 #define HN_PKTSIZE_MIN(align) \
185 roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
186 HN_RNDIS_PKT_LEN, (align))
187 #define HN_PKTSIZE(m, align) \
188 roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
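/*
 * Worked example (added; illustrative only): roundup2() rounds up to a
 * power-of-2 boundary, so with align == 64 (a typical CACHE_LINE_SIZE)
 * a 1514-byte frame costs
 * HN_PKTSIZE(m, 64) == roundup2(1514 + HN_RNDIS_PKT_LEN, 64)
 * bytes of chimney/aggregation space: the frame, the worst-case RNDIS
 * packet message, and up to 63 bytes of alignment padding.
 */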
190 #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus)
193 #ifndef HN_USE_TXDESC_BUFRING
194 SLIST_ENTRY(hn_txdesc) link;
196 STAILQ_ENTRY(hn_txdesc) agg_link;
198 /* Aggregated txdescs, in sending order. */
199 STAILQ_HEAD(, hn_txdesc) agg_list;
201 /* The oldest packet, if transmission aggregation happens. */
203 struct hn_tx_ring *txr;
205 uint32_t flags; /* HN_TXD_FLAG_ */
206 struct hn_nvs_sendctx send_ctx;
210 bus_dmamap_t data_dmap;
212 bus_addr_t rndis_pkt_paddr;
213 struct rndis_packet_msg *rndis_pkt;
214 bus_dmamap_t rndis_pkt_dmap;
217 #define HN_TXD_FLAG_ONLIST 0x0001
218 #define HN_TXD_FLAG_DMAMAP 0x0002
219 #define HN_TXD_FLAG_ONAGG 0x0004
228 struct hn_rxvf_setarg {
229 struct hn_rx_ring *rxr;
230 struct ifnet *vf_ifp;
233 #define HN_RXINFO_VLAN 0x0001
234 #define HN_RXINFO_CSUM 0x0002
235 #define HN_RXINFO_HASHINF 0x0004
236 #define HN_RXINFO_HASHVAL 0x0008
#define HN_RXINFO_ALL \
	(HN_RXINFO_VLAN | \
	 HN_RXINFO_CSUM | \
	 HN_RXINFO_HASHINF | \
	 HN_RXINFO_HASHVAL)
243 #define HN_NDIS_VLAN_INFO_INVALID 0xffffffff
244 #define HN_NDIS_RXCSUM_INFO_INVALID 0
245 #define HN_NDIS_HASH_INFO_INVALID 0
247 static int hn_probe(device_t);
248 static int hn_attach(device_t);
249 static int hn_detach(device_t);
250 static int hn_shutdown(device_t);
251 static void hn_chan_callback(struct vmbus_channel *,
254 static void hn_init(void *);
255 static int hn_ioctl(struct ifnet *, u_long, caddr_t);
256 #ifdef HN_IFSTART_SUPPORT
257 static void hn_start(struct ifnet *);
259 static int hn_transmit(struct ifnet *, struct mbuf *);
260 static void hn_xmit_qflush(struct ifnet *);
261 static int hn_ifmedia_upd(struct ifnet *);
262 static void hn_ifmedia_sts(struct ifnet *,
263 struct ifmediareq *);
265 static void hn_ifnet_event(void *, struct ifnet *, int);
266 static void hn_ifaddr_event(void *, struct ifnet *);
267 static void hn_ifnet_attevent(void *, struct ifnet *);
268 static void hn_ifnet_detevent(void *, struct ifnet *);
269 static void hn_ifnet_lnkevent(void *, struct ifnet *, int);
271 static bool hn_ismyvf(const struct hn_softc *,
272 const struct ifnet *);
273 static void hn_rxvf_change(struct hn_softc *,
274 struct ifnet *, bool);
275 static void hn_rxvf_set(struct hn_softc *, struct ifnet *);
276 static void hn_rxvf_set_task(void *, int);
277 static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
278 static int hn_xpnt_vf_iocsetflags(struct hn_softc *);
279 static int hn_xpnt_vf_iocsetcaps(struct hn_softc *,
281 static void hn_xpnt_vf_saveifflags(struct hn_softc *);
282 static bool hn_xpnt_vf_isready(struct hn_softc *);
283 static void hn_xpnt_vf_setready(struct hn_softc *);
284 static void hn_xpnt_vf_init_taskfunc(void *, int);
285 static void hn_xpnt_vf_init(struct hn_softc *);
286 static void hn_xpnt_vf_setenable(struct hn_softc *);
287 static void hn_xpnt_vf_setdisable(struct hn_softc *, bool);
288 static void hn_vf_rss_fixup(struct hn_softc *, bool);
289 static void hn_vf_rss_restore(struct hn_softc *);
291 static int hn_rndis_rxinfo(const void *, int,
293 static void hn_rndis_rx_data(struct hn_rx_ring *,
295 static void hn_rndis_rx_status(struct hn_softc *,
297 static void hn_rndis_init_fixat(struct hn_softc *, int);
299 static void hn_nvs_handle_notify(struct hn_softc *,
300 const struct vmbus_chanpkt_hdr *);
301 static void hn_nvs_handle_comp(struct hn_softc *,
302 struct vmbus_channel *,
303 const struct vmbus_chanpkt_hdr *);
304 static void hn_nvs_handle_rxbuf(struct hn_rx_ring *,
305 struct vmbus_channel *,
306 const struct vmbus_chanpkt_hdr *);
307 static void hn_nvs_ack_rxbuf(struct hn_rx_ring *,
308 struct vmbus_channel *, uint64_t);
310 #if __FreeBSD_version >= 1100099
311 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
312 static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
314 static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
315 static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
316 #if __FreeBSD_version < 1100095
317 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
319 static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
321 static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
322 static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
323 static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
324 static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
325 static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
326 static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
327 static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
328 static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
329 static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
330 static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
331 static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
332 static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
333 static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
334 static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
335 static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
336 static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
337 static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
338 static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
339 static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
340 static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
341 static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
342 static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
343 static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
345 static void hn_stop(struct hn_softc *, bool);
346 static void hn_init_locked(struct hn_softc *);
347 static int hn_chan_attach(struct hn_softc *,
348 struct vmbus_channel *);
349 static void hn_chan_detach(struct hn_softc *,
350 struct vmbus_channel *);
351 static int hn_attach_subchans(struct hn_softc *);
352 static void hn_detach_allchans(struct hn_softc *);
353 static void hn_chan_rollup(struct hn_rx_ring *,
354 struct hn_tx_ring *);
355 static void hn_set_ring_inuse(struct hn_softc *, int);
356 static int hn_synth_attach(struct hn_softc *, int);
357 static void hn_synth_detach(struct hn_softc *);
358 static int hn_synth_alloc_subchans(struct hn_softc *,
360 static bool hn_synth_attachable(const struct hn_softc *);
361 static void hn_suspend(struct hn_softc *);
362 static void hn_suspend_data(struct hn_softc *);
363 static void hn_suspend_mgmt(struct hn_softc *);
364 static void hn_resume(struct hn_softc *);
365 static void hn_resume_data(struct hn_softc *);
366 static void hn_resume_mgmt(struct hn_softc *);
367 static void hn_suspend_mgmt_taskfunc(void *, int);
368 static void hn_chan_drain(struct hn_softc *,
369 struct vmbus_channel *);
370 static void hn_disable_rx(struct hn_softc *);
371 static void hn_drain_rxtx(struct hn_softc *, int);
372 static void hn_polling(struct hn_softc *, u_int);
373 static void hn_chan_polling(struct vmbus_channel *, u_int);
374 static void hn_mtu_change_fixup(struct hn_softc *);
376 static void hn_update_link_status(struct hn_softc *);
377 static void hn_change_network(struct hn_softc *);
378 static void hn_link_taskfunc(void *, int);
379 static void hn_netchg_init_taskfunc(void *, int);
380 static void hn_netchg_status_taskfunc(void *, int);
381 static void hn_link_status(struct hn_softc *);
383 static int hn_create_rx_data(struct hn_softc *, int);
384 static void hn_destroy_rx_data(struct hn_softc *);
385 static int hn_check_iplen(const struct mbuf *, int);
386 static void hn_rxpkt_proto(const struct mbuf *, int *, int *);
387 static int hn_set_rxfilter(struct hn_softc *, uint32_t);
388 static int hn_rxfilter_config(struct hn_softc *);
389 static int hn_rss_reconfig(struct hn_softc *);
390 static void hn_rss_ind_fixup(struct hn_softc *);
391 static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
392 static int hn_rxpkt(struct hn_rx_ring *, const void *,
393 int, const struct hn_rxinfo *);
394 static uint32_t hn_rss_type_fromndis(uint32_t);
395 static uint32_t hn_rss_type_tondis(uint32_t);
397 static int hn_tx_ring_create(struct hn_softc *, int);
398 static void hn_tx_ring_destroy(struct hn_tx_ring *);
399 static int hn_create_tx_data(struct hn_softc *, int);
400 static void hn_fixup_tx_data(struct hn_softc *);
401 static void hn_fixup_rx_data(struct hn_softc *);
402 static void hn_destroy_tx_data(struct hn_softc *);
403 static void hn_txdesc_dmamap_destroy(struct hn_txdesc *);
404 static void hn_txdesc_gc(struct hn_tx_ring *,
406 static int hn_encap(struct ifnet *, struct hn_tx_ring *,
407 struct hn_txdesc *, struct mbuf **);
408 static int hn_txpkt(struct ifnet *, struct hn_tx_ring *,
410 static void hn_set_chim_size(struct hn_softc *, int);
411 static void hn_set_tso_maxsize(struct hn_softc *, int, int);
412 static bool hn_tx_ring_pending(struct hn_tx_ring *);
413 static void hn_tx_ring_qflush(struct hn_tx_ring *);
414 static void hn_resume_tx(struct hn_softc *, int);
415 static void hn_set_txagg(struct hn_softc *);
416 static void *hn_try_txagg(struct ifnet *,
417 struct hn_tx_ring *, struct hn_txdesc *,
419 static int hn_get_txswq_depth(const struct hn_tx_ring *);
420 static void hn_txpkt_done(struct hn_nvs_sendctx *,
421 struct hn_softc *, struct vmbus_channel *,
423 static int hn_txpkt_sglist(struct hn_tx_ring *,
425 static int hn_txpkt_chim(struct hn_tx_ring *,
427 static int hn_xmit(struct hn_tx_ring *, int);
428 static void hn_xmit_taskfunc(void *, int);
429 static void hn_xmit_txeof(struct hn_tx_ring *);
430 static void hn_xmit_txeof_taskfunc(void *, int);
431 #ifdef HN_IFSTART_SUPPORT
432 static int hn_start_locked(struct hn_tx_ring *, int);
433 static void hn_start_taskfunc(void *, int);
434 static void hn_start_txeof(struct hn_tx_ring *);
435 static void hn_start_txeof_taskfunc(void *, int);
438 SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
439 "Hyper-V network interface");
/* Trust TCP segment verification on the host side. */
442 static int hn_trust_hosttcp = 1;
443 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
444 &hn_trust_hosttcp, 0,
445 "Trust tcp segement verification on host side, "
446 "when csum info is missing (global setting)");
/* Trust UDP datagram verification on the host side. */
449 static int hn_trust_hostudp = 1;
450 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
451 &hn_trust_hostudp, 0,
452 "Trust udp datagram verification on host side, "
453 "when csum info is missing (global setting)");
/* Trust IP packet verification on the host side. */
456 static int hn_trust_hostip = 1;
457 SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
459 "Trust ip packet verification on host side, "
460 "when csum info is missing (global setting)");
463 * Offload UDP/IPv4 checksum.
465 static int hn_enable_udp4cs = 1;
466 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
467 &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");
470 * Offload UDP/IPv6 checksum.
472 static int hn_enable_udp6cs = 1;
473 SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
474 &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");
477 static counter_u64_t hn_udpcs_fixup;
478 SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
479 &hn_udpcs_fixup, "# of UDP checksum fixup");
484 * This value is for Azure. For Hyper-V, set this above
485 * 65536 to disable UDP datagram checksum fixup.
487 static int hn_udpcs_fixup_mtu = 1420;
488 SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
489 &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
491 /* Limit TSO burst size */
492 static int hn_tso_maxlen = IP_MAXPACKET;
493 SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
494 &hn_tso_maxlen, 0, "TSO burst limit");
496 /* Limit chimney send size */
497 static int hn_tx_chimney_size = 0;
498 SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
499 &hn_tx_chimney_size, 0, "Chimney send packet size limit");
501 /* Limit the size of packet for direct transmission */
502 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
503 SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
504 &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
506 /* # of LRO entries per RX ring */
507 #if defined(INET) || defined(INET6)
508 #if __FreeBSD_version >= 1100095
509 static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
510 SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
511 &hn_lro_entry_count, 0, "LRO entry count");
515 static int hn_tx_taskq_cnt = 1;
516 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
517 &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
519 #define HN_TX_TASKQ_M_INDEP 0
520 #define HN_TX_TASKQ_M_GLOBAL 1
521 #define HN_TX_TASKQ_M_EVTTQ 2
523 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
524 SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
525 &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
526 "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
528 #ifndef HN_USE_TXDESC_BUFRING
529 static int hn_use_txdesc_bufring = 0;
531 static int hn_use_txdesc_bufring = 1;
533 SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
534 &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");
536 #ifdef HN_IFSTART_SUPPORT
537 /* Use ifnet.if_start instead of ifnet.if_transmit */
538 static int hn_use_if_start = 0;
539 SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
540 &hn_use_if_start, 0, "Use if_start TX method");
543 /* # of channels to use */
544 static int hn_chan_cnt = 0;
545 SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
547 "# of channels to use; each channel has one RX ring and one TX ring");
549 /* # of transmit rings to use */
550 static int hn_tx_ring_cnt = 0;
551 SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
552 &hn_tx_ring_cnt, 0, "# of TX rings to use");
/* Software TX ring depth */
555 static int hn_tx_swq_depth = 0;
556 SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
557 &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
559 /* Enable sorted LRO, and the depth of the per-channel mbuf queue */
560 #if __FreeBSD_version >= 1100095
561 static u_int hn_lro_mbufq_depth = 0;
562 SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
563 &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
566 /* Packet transmission aggregation size limit */
567 static int hn_tx_agg_size = -1;
568 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
569 &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");
571 /* Packet transmission aggregation count limit */
572 static int hn_tx_agg_pkts = -1;
573 SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
574 &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
577 SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
578 0, 0, hn_vflist_sysctl, "A", "VF list");
581 SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
582 0, 0, hn_vfmap_sysctl, "A", "VF mapping");
585 static int hn_xpnt_vf = 1;
586 SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
587 &hn_xpnt_vf, 0, "Transparent VF mod");
589 /* Accurate BPF support for Transparent VF */
590 static int hn_xpnt_vf_accbpf = 0;
591 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
592 &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
/* Extra wait for transparent VF attach routine; unit: seconds. */
595 static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
596 SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
597 &hn_xpnt_vf_attwait, 0,
598 "Extra wait for transparent VF attach routing; unit: seconds");
600 static u_int hn_cpu_index; /* next CPU for channel */
static struct taskqueue **hn_tx_taskque; /* shared TX taskqueues */
603 static struct rmlock hn_vfmap_lock;
604 static int hn_vfmap_size;
605 static struct ifnet **hn_vfmap;
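/*
 * Added note: this appears to be the well-known 40-byte Toeplitz sample
 * key from Microsoft's RSS documentation, which many RSS-capable
 * drivers ship as their default key.
 */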
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
609 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
610 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
611 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
612 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
613 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
616 static const struct hyperv_guid hn_guid = {
618 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
619 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
622 static device_method_t hn_methods[] = {
623 /* Device interface */
624 DEVMETHOD(device_probe, hn_probe),
625 DEVMETHOD(device_attach, hn_attach),
626 DEVMETHOD(device_detach, hn_detach),
627 DEVMETHOD(device_shutdown, hn_shutdown),
631 static driver_t hn_driver = {
634 sizeof(struct hn_softc)
637 static devclass_t hn_devclass;
639 DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
640 MODULE_VERSION(hn, 1);
641 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
643 #if __FreeBSD_version >= 1100099
645 hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
649 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
650 sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
655 hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
658 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
659 txd->chim_size == 0, ("invalid rndis sglist txd"));
660 return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
661 &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
665 hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
667 struct hn_nvs_rndis rndis;
669 KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
670 txd->chim_size > 0, ("invalid rndis chim txd"));
672 rndis.nvs_type = HN_NVS_TYPE_RNDIS;
673 rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
674 rndis.nvs_chim_idx = txd->chim_index;
675 rndis.nvs_chim_sz = txd->chim_size;
677 return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
678 &rndis, sizeof(rndis), &txd->send_ctx));
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;
714 idx = chim_idx / LONG_BIT;
715 KASSERT(idx < sc->hn_chim_bmap_cnt,
716 ("invalid chimney index 0x%x", chim_idx));
718 mask = 1UL << (chim_idx % LONG_BIT);
719 KASSERT(sc->hn_chim_bmap[idx] & mask,
720 ("index bitmap 0x%lx, chimney index %u, "
721 "bitmap idx %d, bitmask 0x%lx",
722 sc->hn_chim_bmap[idx], chim_idx, idx, mask));
	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
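/*
 * Added sketch of the chimney bitmap layout: chimney index idx lives in
 * bit (idx % LONG_BIT) of bitmap word (idx / LONG_BIT), so on LP64
 * (LONG_BIT == 64) index 70 is word 1, bit 6.  hn_chim_alloc() finds a
 * clear bit with ffsl(~bmap[i]) and claims it with
 * atomic_testandset_long(); the atomic ops let alloc/free run without
 * additional locking.
 */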
727 #if defined(INET6) || defined(INET)
#define PULLUP_HDR(m, len) \
do { \
	if (__predict_false((m)->m_len < (len))) { \
		(m) = m_pullup((m), (len)); \
		if ((m) == NULL) \
			return (NULL); \
	} \
} while (0)
/*
 * NOTE: If this function fails, m_head will be freed.
 */
741 static __inline struct mbuf *
742 hn_tso_fixup(struct mbuf *m_head)
744 struct ether_vlan_header *evl;
748 KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));
750 PULLUP_HDR(m_head, sizeof(*evl));
751 evl = mtod(m_head, struct ether_vlan_header *);
752 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
753 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
755 ehlen = ETHER_HDR_LEN;
756 m_head->m_pkthdr.l2hlen = ehlen;
759 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
763 PULLUP_HDR(m_head, ehlen + sizeof(*ip));
764 ip = mtodo(m_head, ehlen);
765 iphlen = ip->ip_hl << 2;
766 m_head->m_pkthdr.l3hlen = iphlen;
768 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
769 th = mtodo(m_head, ehlen + iphlen);
773 th->th_sum = in_pseudo(ip->ip_src.s_addr,
774 ip->ip_dst.s_addr, htons(IPPROTO_TCP));
777 #if defined(INET6) && defined(INET)
784 PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
785 ip6 = mtodo(m_head, ehlen);
786 if (ip6->ip6_nxt != IPPROTO_TCP) {
790 m_head->m_pkthdr.l3hlen = sizeof(*ip6);
792 PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
793 th = mtodo(m_head, ehlen + sizeof(*ip6));
796 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
/*
 * NOTE: If this function fails, m_head will be freed.
 */
805 static __inline struct mbuf *
806 hn_set_hlen(struct mbuf *m_head)
808 const struct ether_vlan_header *evl;
811 PULLUP_HDR(m_head, sizeof(*evl));
812 evl = mtod(m_head, const struct ether_vlan_header *);
813 if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
814 ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
816 ehlen = ETHER_HDR_LEN;
817 m_head->m_pkthdr.l2hlen = ehlen;
820 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
824 PULLUP_HDR(m_head, ehlen + sizeof(*ip));
825 ip = mtodo(m_head, ehlen);
826 iphlen = ip->ip_hl << 2;
827 m_head->m_pkthdr.l3hlen = iphlen;
/*
 * UDP checksum offload does not work in Azure if the
 * following conditions are met:
 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
 * - IP_DF is not set in the IP hdr.
 *
 * Fall back to software checksum for these UDP datagrams.
 */
837 if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
838 m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
839 (ntohs(ip->ip_off) & IP_DF) == 0) {
840 uint16_t off = ehlen + iphlen;
842 counter_u64_add(hn_udpcs_fixup, 1);
843 PULLUP_HDR(m_head, off + sizeof(struct udphdr));
844 *(uint16_t *)(m_head->m_data + off +
845 m_head->m_pkthdr.csum_data) = in_cksum_skip(
846 m_head, m_head->m_pkthdr.len, off);
847 m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
851 #if defined(INET6) && defined(INET)
856 const struct ip6_hdr *ip6;
858 PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
859 ip6 = mtodo(m_head, ehlen);
860 if (ip6->ip6_nxt != IPPROTO_TCP &&
861 ip6->ip6_nxt != IPPROTO_UDP) {
865 m_head->m_pkthdr.l3hlen = sizeof(*ip6);
/*
 * NOTE: If this function fails, m_head will be freed.
 */
874 static __inline struct mbuf *
875 hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
877 const struct tcphdr *th;
881 ehlen = m_head->m_pkthdr.l2hlen;
882 iphlen = m_head->m_pkthdr.l3hlen;
884 PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
885 th = mtodo(m_head, ehlen + iphlen);
886 if (th->th_flags & TH_SYN)
893 #endif /* INET6 || INET */
896 hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
902 if (sc->hn_rx_filter != filter) {
903 error = hn_rndis_set_rxfilter(sc, filter);
905 sc->hn_rx_filter = filter;
911 hn_rxfilter_config(struct hn_softc *sc)
913 struct ifnet *ifp = sc->hn_ifp;
919 * If the non-transparent mode VF is activated, we don't know how
920 * its RX filter is configured, so stick the synthetic device in
 * the promiscuous mode.
 */
923 if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
924 filter = NDIS_PACKET_TYPE_PROMISCUOUS;
926 filter = NDIS_PACKET_TYPE_DIRECTED;
927 if (ifp->if_flags & IFF_BROADCAST)
928 filter |= NDIS_PACKET_TYPE_BROADCAST;
929 /* TODO: support multicast list */
930 if ((ifp->if_flags & IFF_ALLMULTI) ||
931 !TAILQ_EMPTY(&ifp->if_multiaddrs))
932 filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	return (hn_set_rxfilter(sc, filter));
}
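/*
 * Added summary: hn_set_txagg() below computes the effective TX
 * aggregation limits as the minimum of the hw.hn.tx_agg_* tunables
 * (negative values appear to mean "no limit"), the limits offered by
 * RNDIS, and the chimney sending buffer size, then propagates the
 * results to every TX ring under its TX lock.
 */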
938 hn_set_txagg(struct hn_softc *sc)
944 * Setup aggregation size.
946 if (sc->hn_agg_size < 0)
949 size = sc->hn_agg_size;
951 if (sc->hn_rndis_agg_size < size)
952 size = sc->hn_rndis_agg_size;
954 /* NOTE: We only aggregate packets using chimney sending buffers. */
955 if (size > (uint32_t)sc->hn_chim_szmax)
956 size = sc->hn_chim_szmax;
958 if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
965 /* NOTE: Type of the per TX ring setting is 'int'. */
970 * Setup aggregation packet count.
972 if (sc->hn_agg_pkts < 0)
975 pkts = sc->hn_agg_pkts;
977 if (sc->hn_rndis_agg_pkts < pkts)
978 pkts = sc->hn_rndis_agg_pkts;
987 /* NOTE: Type of the per TX ring setting is 'short'. */
992 /* NOTE: Type of the per TX ring setting is 'short'. */
993 if (sc->hn_rndis_agg_align > SHRT_MAX) {
1000 if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
1001 size, pkts, sc->hn_rndis_agg_align);
1004 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
1005 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
1007 mtx_lock(&txr->hn_tx_lock);
1008 txr->hn_agg_szmax = size;
1009 txr->hn_agg_pktmax = pkts;
1010 txr->hn_agg_align = sc->hn_rndis_agg_align;
1011 mtx_unlock(&txr->hn_tx_lock);
1016 hn_get_txswq_depth(const struct hn_tx_ring *txr)
1019 KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
1020 if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
1021 return txr->hn_txdesc_cnt;
1022 return hn_tx_swq_depth;
1026 hn_rss_reconfig(struct hn_softc *sc)
1032 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
1043 if_printf(sc->hn_ifp, "disable RSS\n");
1044 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
1046 if_printf(sc->hn_ifp, "RSS disable failed\n");
	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
1055 if_printf(sc->hn_ifp, "reconfig RSS\n");
1056 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
1058 if_printf(sc->hn_ifp, "RSS reconfig failed\n");
1065 hn_rss_ind_fixup(struct hn_softc *sc)
1067 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
1070 nchan = sc->hn_rx_ring_inuse;
1071 KASSERT(nchan > 1, ("invalid # of channels %d", nchan));
	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
1077 for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
1078 if (rss->rss_ind[i] >= nchan) {
1079 if_printf(sc->hn_ifp,
1080 "RSS indirect table %d fixup: %u -> %d\n",
1081 i, rss->rss_ind[i], nchan - 1);
1082 rss->rss_ind[i] = nchan - 1;
1088 hn_ifmedia_upd(struct ifnet *ifp __unused)
1095 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
1097 struct hn_softc *sc = ifp->if_softc;
1099 ifmr->ifm_status = IFM_AVALID;
1100 ifmr->ifm_active = IFM_ETHER;
1102 if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
1103 ifmr->ifm_active |= IFM_NONE;
1106 ifmr->ifm_status |= IFM_ACTIVE;
1107 ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
1111 hn_rxvf_set_task(void *xarg, int pending __unused)
1113 struct hn_rxvf_setarg *arg = xarg;
1115 arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
1119 hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
1121 struct hn_rx_ring *rxr;
1122 struct hn_rxvf_setarg arg;
1128 TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
1130 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
1131 rxr = &sc->hn_rx_ring[i];
1133 if (i < sc->hn_rx_ring_inuse) {
1135 arg.vf_ifp = vf_ifp;
1136 vmbus_chan_run_task(rxr->hn_chan, &task);
1138 rxr->hn_rxvf_ifp = vf_ifp;
1144 hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
1146 const struct ifnet *hn_ifp;
1148 hn_ifp = sc->hn_ifp;
1153 if (ifp->if_alloctype != IFT_ETHER)
1156 /* Ignore lagg/vlan interfaces */
1157 if (strcmp(ifp->if_dname, "lagg") == 0 ||
1158 strcmp(ifp->if_dname, "vlan") == 0)
1162 * During detach events ifp->if_addr might be NULL.
1163 * Make sure the bcmp() below doesn't panic on that:
1165 if (ifp->if_addr == NULL || hn_ifp->if_addr == NULL)
1168 if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
1175 hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
1177 struct ifnet *hn_ifp;
1181 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1184 if (!hn_ismyvf(sc, ifp))
1186 hn_ifp = sc->hn_ifp;
1189 if (sc->hn_flags & HN_FLAG_RXVF)
1192 sc->hn_flags |= HN_FLAG_RXVF;
1193 hn_rxfilter_config(sc);
1195 if (!(sc->hn_flags & HN_FLAG_RXVF))
1198 sc->hn_flags &= ~HN_FLAG_RXVF;
1199 if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
1200 hn_rxfilter_config(sc);
1202 hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
1205 hn_nvs_set_datapath(sc,
1206 rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
1208 hn_rxvf_set(sc, rxvf ? ifp : NULL);
1211 hn_vf_rss_fixup(sc, true);
1212 hn_suspend_mgmt(sc);
1213 sc->hn_link_flags &=
1214 ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
1215 if_link_state_change(hn_ifp, LINK_STATE_DOWN);
1217 hn_vf_rss_restore(sc);
1221 devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
1222 rxvf ? "VF_UP" : "VF_DOWN", NULL);
1225 if_printf(hn_ifp, "datapath is switched %s %s\n",
1226 rxvf ? "to" : "from", ifp->if_xname);
1233 hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
1236 if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
1238 hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
1242 hn_ifaddr_event(void *arg, struct ifnet *ifp)
1245 hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
1249 hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
1251 struct ifnet *ifp, *vf_ifp;
1257 vf_ifp = sc->hn_vf_ifp;
1260 * Fix up requested capabilities w/ supported capabilities,
1261 * since the supported capabilities could have been changed.
1263 ifr->ifr_reqcap &= ifp->if_capabilities;
1264 /* Pass SIOCSIFCAP to VF. */
1265 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
	/*
	 * NOTE:
	 * The error will be propagated to the callers; however, it
	 * is _not_ useful here.
	 */
	/*
	 * Merge VF's enabled capabilities.
	 */
1276 ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
1278 tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
1279 if (ifp->if_capenable & IFCAP_TXCSUM)
1280 ifp->if_hwassist |= tmp;
1282 ifp->if_hwassist &= ~tmp;
1284 tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
1285 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
1286 ifp->if_hwassist |= tmp;
1288 ifp->if_hwassist &= ~tmp;
1290 tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
1291 if (ifp->if_capenable & IFCAP_TSO4)
1292 ifp->if_hwassist |= tmp;
1294 ifp->if_hwassist &= ~tmp;
1296 tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
1297 if (ifp->if_capenable & IFCAP_TSO6)
1298 ifp->if_hwassist |= tmp;
1300 ifp->if_hwassist &= ~tmp;
1306 hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
1308 struct ifnet *vf_ifp;
1312 vf_ifp = sc->hn_vf_ifp;
1314 memset(&ifr, 0, sizeof(ifr));
1315 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1316 ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
1317 ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
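	/*
	 * Added note: SIOCSIFFLAGS carries the 32-bit if_flags split into
	 * two 16-bit ifreq fields, hence the mask and shift above.
	 */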
1318 return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
1322 hn_xpnt_vf_saveifflags(struct hn_softc *sc)
1324 struct ifnet *ifp = sc->hn_ifp;
1329 /* XXX vlan(4) style mcast addr maintenance */
1330 if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
1331 allmulti = IFF_ALLMULTI;
1333 /* Always set the VF's if_flags */
1334 sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
1338 hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
1340 struct rm_priotracker pt;
1341 struct ifnet *hn_ifp = NULL;
1345 * XXX racy, if hn(4) ever detached.
1347 rm_rlock(&hn_vfmap_lock, &pt);
1348 if (vf_ifp->if_index < hn_vfmap_size)
1349 hn_ifp = hn_vfmap[vf_ifp->if_index];
1350 rm_runlock(&hn_vfmap_lock, &pt);
1352 if (hn_ifp != NULL) {
1353 for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
1355 * Allow tapping on the VF.
1357 ETHER_BPF_MTAP(vf_ifp, mn);
1362 if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
1363 if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
1367 * XXX IFCOUNTER_IMCAST
1368 * This stat updating is kinda invasive, since it
1369 * requires two checks on the mbuf: the length check
 * and the Ethernet header check.  As of this writing,
 * all multicast packets go directly to hn(4), which
 * makes imcast stat updating in the VF a try in vain.
1376 * Fix up rcvif and increase hn(4)'s ipackets.
1378 mn->m_pkthdr.rcvif = hn_ifp;
1379 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
1382 * Go through hn(4)'s if_input.
1384 hn_ifp->if_input(hn_ifp, m);
1387 * In the middle of the transition; free this
1392 m->m_nextpkt = NULL;
1400 hn_mtu_change_fixup(struct hn_softc *sc)
1407 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
1408 #if __FreeBSD_version >= 1100099
1409 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
1410 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
1415 hn_rss_type_fromndis(uint32_t rss_hash)
1419 if (rss_hash & NDIS_HASH_IPV4)
1420 types |= RSS_TYPE_IPV4;
1421 if (rss_hash & NDIS_HASH_TCP_IPV4)
1422 types |= RSS_TYPE_TCP_IPV4;
1423 if (rss_hash & NDIS_HASH_IPV6)
1424 types |= RSS_TYPE_IPV6;
1425 if (rss_hash & NDIS_HASH_IPV6_EX)
1426 types |= RSS_TYPE_IPV6_EX;
1427 if (rss_hash & NDIS_HASH_TCP_IPV6)
1428 types |= RSS_TYPE_TCP_IPV6;
1429 if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
1430 types |= RSS_TYPE_TCP_IPV6_EX;
1431 if (rss_hash & NDIS_HASH_UDP_IPV4_X)
1432 types |= RSS_TYPE_UDP_IPV4;
1437 hn_rss_type_tondis(uint32_t types)
1439 uint32_t rss_hash = 0;
1441 KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
1442 ("UDP6 and UDP6EX are not supported"));
1444 if (types & RSS_TYPE_IPV4)
1445 rss_hash |= NDIS_HASH_IPV4;
1446 if (types & RSS_TYPE_TCP_IPV4)
1447 rss_hash |= NDIS_HASH_TCP_IPV4;
1448 if (types & RSS_TYPE_IPV6)
1449 rss_hash |= NDIS_HASH_IPV6;
1450 if (types & RSS_TYPE_IPV6_EX)
1451 rss_hash |= NDIS_HASH_IPV6_EX;
1452 if (types & RSS_TYPE_TCP_IPV6)
1453 rss_hash |= NDIS_HASH_TCP_IPV6;
1454 if (types & RSS_TYPE_TCP_IPV6_EX)
1455 rss_hash |= NDIS_HASH_TCP_IPV6_EX;
1456 if (types & RSS_TYPE_UDP_IPV4)
1457 rss_hash |= NDIS_HASH_UDP_IPV4_X;
1462 hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
1468 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1469 sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
1473 hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
1475 struct ifnet *ifp, *vf_ifp;
1476 struct ifrsshash ifrh;
1477 struct ifrsskey ifrk;
1479 uint32_t my_types, diff_types, mbuf_types = 0;
1482 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1483 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1485 if (sc->hn_rx_ring_inuse == 1) {
1486 /* No RSS on synthetic parts; done. */
1489 if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
1490 /* Synthetic parts do not support Toeplitz; done. */
1495 vf_ifp = sc->hn_vf_ifp;
	/*
	 * Extract VF's RSS key.  Only a 40-byte Toeplitz key is
	 * supported.
	 */
1501 memset(&ifrk, 0, sizeof(ifrk));
1502 strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
1503 error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
1505 if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n",
1506 vf_ifp->if_xname, error);
1509 if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
1510 if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1511 vf_ifp->if_xname, ifrk.ifrk_func);
1514 if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
1515 if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
1516 vf_ifp->if_xname, ifrk.ifrk_keylen);
	/*
	 * Extract VF's RSS hash.  Only Toeplitz is supported.
	 */
1523 memset(&ifrh, 0, sizeof(ifrh));
1524 strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
1525 error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
1527 if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
1528 vf_ifp->if_xname, error);
1531 if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
1532 if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
1533 vf_ifp->if_xname, ifrh.ifrh_func);
1537 my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
1538 if ((ifrh.ifrh_types & my_types) == 0) {
1539 /* This disables RSS; ignore it then */
1540 if_printf(ifp, "%s intersection of RSS types failed. "
1541 "VF %#x, mine %#x\n", vf_ifp->if_xname,
1542 ifrh.ifrh_types, my_types);
1546 diff_types = my_types ^ ifrh.ifrh_types;
1547 my_types &= ifrh.ifrh_types;
1548 mbuf_types = my_types;
	/*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on the RX path.
	 *
	 * XXX If HN_CAP_UDPHASH is set in hn_caps, then the UDP 4-tuple
	 * hash is delivered with the type of TCP_IPV4.  This means if
	 * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at
	 * least to hn_mbuf_hash.  However, given that _all_ of the
	 * NICs implement TCP_IPV4, this will _not_ impose any issues
	 * here.
	 */
1564 if ((my_types & RSS_TYPE_IPV4) &&
1565 (diff_types & ifrh.ifrh_types &
1566 (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
1567 /* Conflict; disable IPV4 hash type/value delivery. */
1568 if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
1569 mbuf_types &= ~RSS_TYPE_IPV4;
1571 if ((my_types & RSS_TYPE_IPV6) &&
1572 (diff_types & ifrh.ifrh_types &
1573 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1574 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1575 RSS_TYPE_IPV6_EX))) {
1576 /* Conflict; disable IPV6 hash type/value delivery. */
1577 if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
1578 mbuf_types &= ~RSS_TYPE_IPV6;
1580 if ((my_types & RSS_TYPE_IPV6_EX) &&
1581 (diff_types & ifrh.ifrh_types &
1582 (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
1583 RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
1585 /* Conflict; disable IPV6_EX hash type/value delivery. */
1586 if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
1587 mbuf_types &= ~RSS_TYPE_IPV6_EX;
1589 if ((my_types & RSS_TYPE_TCP_IPV6) &&
1590 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
1591 /* Conflict; disable TCP_IPV6 hash type/value delivery. */
1592 if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
1593 mbuf_types &= ~RSS_TYPE_TCP_IPV6;
1595 if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
1596 (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
1597 /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
1598 if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
1599 mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
1601 if ((my_types & RSS_TYPE_UDP_IPV6) &&
1602 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
1603 /* Conflict; disable UDP_IPV6 hash type/value delivery. */
1604 if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
1605 mbuf_types &= ~RSS_TYPE_UDP_IPV6;
1607 if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
1608 (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
1609 /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
1610 if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
1611 mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
1615 * Indirect table does not matter.
1618 sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
1619 hn_rss_type_tondis(my_types);
1620 memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
1621 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
1624 error = hn_rss_reconfig(sc);
1626 /* XXX roll-back? */
1627 if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
1628 /* XXX keep going. */
1632 /* Hash deliverability for mbufs. */
1633 hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
1637 hn_vf_rss_restore(struct hn_softc *sc)
1641 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
1642 ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));
1644 if (sc->hn_rx_ring_inuse == 1)
1648 * Restore hash types. Key does _not_ matter.
1650 if (sc->hn_rss_hash != sc->hn_rss_hcap) {
1653 sc->hn_rss_hash = sc->hn_rss_hcap;
1654 error = hn_rss_reconfig(sc);
1656 if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
1658 /* XXX keep going. */
1662 /* Hash deliverability for mbufs. */
1663 hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
1667 hn_xpnt_vf_setready(struct hn_softc *sc)
1669 struct ifnet *ifp, *vf_ifp;
1674 vf_ifp = sc->hn_vf_ifp;
1677 * Mark the VF ready.
1679 sc->hn_vf_rdytick = 0;
1682 * Save information for restoration.
1684 sc->hn_saved_caps = ifp->if_capabilities;
1685 sc->hn_saved_tsomax = ifp->if_hw_tsomax;
1686 sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
1687 sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
1690 * Intersect supported/enabled capabilities.
1693 * if_hwassist is not changed here.
1695 ifp->if_capabilities &= vf_ifp->if_capabilities;
1696 ifp->if_capenable &= ifp->if_capabilities;
1701 if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
1702 ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
1703 if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
1704 ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
1705 if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
1706 ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
1709 * Change VF's enabled capabilities.
1711 memset(&ifr, 0, sizeof(ifr));
1712 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1713 ifr.ifr_reqcap = ifp->if_capenable;
1714 hn_xpnt_vf_iocsetcaps(sc, &ifr);
1716 if (ifp->if_mtu != ETHERMTU) {
1722 memset(&ifr, 0, sizeof(ifr));
1723 strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
1724 ifr.ifr_mtu = ifp->if_mtu;
1725 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
1727 if_printf(ifp, "%s SIOCSIFMTU %lu failed\n",
1728 vf_ifp->if_xname, ifp->if_mtu);
1729 if (ifp->if_mtu > ETHERMTU) {
1730 if_printf(ifp, "change MTU to %d\n", ETHERMTU);
1734 * No need to adjust the synthetic parts' MTU;
1735 * failure of the adjustment will cause us
1736 * infinite headache.
1738 ifp->if_mtu = ETHERMTU;
1739 hn_mtu_change_fixup(sc);
1746 hn_xpnt_vf_isready(struct hn_softc *sc)
1751 if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
1754 if (sc->hn_vf_rdytick == 0)
1757 if (sc->hn_vf_rdytick > ticks)
1760 /* Mark VF as ready. */
1761 hn_xpnt_vf_setready(sc);
1766 hn_xpnt_vf_setenable(struct hn_softc *sc)
1772 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1773 rm_wlock(&sc->hn_vf_lock);
1774 sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
1775 rm_wunlock(&sc->hn_vf_lock);
1777 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1778 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
1782 hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
1788 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1789 rm_wlock(&sc->hn_vf_lock);
1790 sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
1792 sc->hn_vf_ifp = NULL;
1793 rm_wunlock(&sc->hn_vf_lock);
1795 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
1796 sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
1800 hn_xpnt_vf_init(struct hn_softc *sc)
1806 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1807 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1810 if_printf(sc->hn_ifp, "try bringing up %s\n",
1811 sc->hn_vf_ifp->if_xname);
1817 hn_xpnt_vf_saveifflags(sc);
1818 sc->hn_vf_ifp->if_flags |= IFF_UP;
1819 error = hn_xpnt_vf_iocsetflags(sc);
1821 if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
1822 sc->hn_vf_ifp->if_xname, error);
1828 * Datapath setting must happen _after_ bringing the VF up.
1830 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
	/*
	 * Fix up RSS-related bits _after_ the VF is brought up, since
	 * many VFs generate their RSS key during initialization.
	 */
1837 hn_vf_rss_fixup(sc, true);
1839 /* Mark transparent mode VF as enabled. */
1840 hn_xpnt_vf_setenable(sc);
1844 hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
1846 struct hn_softc *sc = xsc;
1850 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
1852 if (sc->hn_vf_ifp == NULL)
1854 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
1857 if (sc->hn_vf_rdytick != 0) {
1858 /* Mark VF as ready. */
1859 hn_xpnt_vf_setready(sc);
1862 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
1864 * Delayed VF initialization.
1867 if_printf(sc->hn_ifp, "delayed initialize %s\n",
1868 sc->hn_vf_ifp->if_xname);
1870 hn_xpnt_vf_init(sc);
1877 hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
1879 struct hn_softc *sc = xsc;
1883 if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
1886 if (!hn_ismyvf(sc, ifp))
1889 if (sc->hn_vf_ifp != NULL) {
1890 if_printf(sc->hn_ifp, "%s was attached as VF\n",
1891 sc->hn_vf_ifp->if_xname);
1895 if (hn_xpnt_vf && ifp->if_start != NULL) {
1897 * ifnet.if_start is _not_ supported by transparent
1898 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
1900 if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
1901 "in transparent VF mode.\n", ifp->if_xname);
1905 rm_wlock(&hn_vfmap_lock);
1907 if (ifp->if_index >= hn_vfmap_size) {
1908 struct ifnet **newmap;
1911 newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
1912 newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
1915 memcpy(newmap, hn_vfmap,
1916 sizeof(struct ifnet *) * hn_vfmap_size);
1917 free(hn_vfmap, M_DEVBUF);
1919 hn_vfmap_size = newsize;
1921 KASSERT(hn_vfmap[ifp->if_index] == NULL,
1922 ("%s: ifindex %d was mapped to %s",
1923 ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
1924 hn_vfmap[ifp->if_index] = sc->hn_ifp;
1926 rm_wunlock(&hn_vfmap_lock);
1928 /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
1929 rm_wlock(&sc->hn_vf_lock);
1930 KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
1931 ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
1932 sc->hn_vf_ifp = ifp;
1933 rm_wunlock(&sc->hn_vf_lock);
1939 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
1940 * Save vf_ifp's current if_input for later restoration.
1942 sc->hn_vf_input = ifp->if_input;
1943 ifp->if_input = hn_xpnt_vf_input;
1946 * Stop link status management; use the VF's.
1948 hn_suspend_mgmt(sc);
	/*
	 * Give the VF some time to complete its attach routine.
	 */
1953 wait_ticks = hn_xpnt_vf_attwait * hz;
1954 sc->hn_vf_rdytick = ticks + wait_ticks;
1956 taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
1964 hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
1966 struct hn_softc *sc = xsc;
1970 if (sc->hn_vf_ifp == NULL)
1973 if (!hn_ismyvf(sc, ifp))
1978 * Make sure that the delayed initialization is not running.
1981 * - This lock _must_ be released, since the hn_vf_init task
1982 * will try holding this lock.
1983 * - It is safe to release this lock here, since the
1984 * hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
1986 * XXX racy, if hn(4) ever detached.
1989 taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
1992 KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
1993 sc->hn_ifp->if_xname));
1994 ifp->if_input = sc->hn_vf_input;
1995 sc->hn_vf_input = NULL;
1997 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
1998 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
1999 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
2001 if (sc->hn_vf_rdytick == 0) {
2003 * The VF was ready; restore some settings.
2005 sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
		/*
		 * There is _no_ need to fix up if_capenable and
		 * if_hwassist, since the if_capabilities before
		 * restoration was an intersection of the VF's
		 * if_capabilities and the synthetic device's
		 * if_capabilities.
		 */
2014 sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
2015 sc->hn_ifp->if_hw_tsomaxsegcount =
2016 sc->hn_saved_tsosegcnt;
2017 sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
2020 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2022 * Restore RSS settings.
2024 hn_vf_rss_restore(sc);
2027 * Resume link status management, which was suspended
2028 * by hn_ifnet_attevent().
2034 /* Mark transparent mode VF as disabled. */
2035 hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
2037 rm_wlock(&hn_vfmap_lock);
2039 KASSERT(ifp->if_index < hn_vfmap_size,
2040 ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
2041 if (hn_vfmap[ifp->if_index] != NULL) {
2042 KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
2043 ("%s: ifindex %d was mapped to %s",
2044 ifp->if_xname, ifp->if_index,
2045 hn_vfmap[ifp->if_index]->if_xname));
2046 hn_vfmap[ifp->if_index] = NULL;
2049 rm_wunlock(&hn_vfmap_lock);
2055 hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
2057 struct hn_softc *sc = xsc;
2059 if (sc->hn_vf_ifp == ifp)
2060 if_link_state_change(sc->hn_ifp, link_state);
2064 hn_probe(device_t dev)
2067 if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
2068 device_set_desc(dev, "Hyper-V Network Interface");
2069 return BUS_PROBE_DEFAULT;
2075 hn_attach(device_t dev)
2077 struct hn_softc *sc = device_get_softc(dev);
2078 struct sysctl_oid_list *child;
2079 struct sysctl_ctx_list *ctx;
2080 uint8_t eaddr[ETHER_ADDR_LEN];
2081 struct ifnet *ifp = NULL;
2082 int error, ring_cnt, tx_ring_cnt;
2086 sc->hn_prichan = vmbus_get_channel(dev);
2088 rm_init(&sc->hn_vf_lock, "hnvf");
2089 if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2090 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2093 * Initialize these tunables once.
2095 sc->hn_agg_size = hn_tx_agg_size;
2096 sc->hn_agg_pkts = hn_tx_agg_pkts;
2099 * Setup taskqueue for transmission.
2101 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2105 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2106 M_DEVBUF, M_WAITOK);
2107 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2108 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2109 M_WAITOK, taskqueue_thread_enqueue,
2110 &sc->hn_tx_taskqs[i]);
2111 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2112 "%s tx%d", device_get_nameunit(dev), i);
2114 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2115 sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
2121 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2122 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2123 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2124 device_get_nameunit(dev));
2125 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2126 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2127 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2128 hn_netchg_status_taskfunc, sc);
	/*
	 * Setup taskqueue for VF tasks, e.g. delayed VF bring-up.
	 */
2134 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2135 taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2136 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2137 device_get_nameunit(dev));
2138 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2139 hn_xpnt_vf_init_taskfunc, sc);
	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions that will be called after
	 * hn_synth_attach().
	 */
2147 ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
2149 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed if an error happens later on.
	 */
2155 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2158 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2159 * to use (tx_ring_cnt).
	 *
	 * The # of RX rings to use is the same as the # of channels to use.
	 */
2164 ring_cnt = hn_chan_cnt;
2165 if (ring_cnt <= 0) {
2167 ring_cnt = mp_ncpus;
2168 if (ring_cnt > HN_RING_CNT_DEF_MAX)
2169 ring_cnt = HN_RING_CNT_DEF_MAX;
2170 } else if (ring_cnt > mp_ncpus) {
2171 ring_cnt = mp_ncpus;
2174 tx_ring_cnt = hn_tx_ring_cnt;
2175 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2176 tx_ring_cnt = ring_cnt;
2177 #ifdef HN_IFSTART_SUPPORT
2178 if (hn_use_if_start) {
2179 /* ifnet.if_start only needs one TX ring. */
2185 * Set the leader CPU for channels.
2187 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
	/*
	 * Create enough TX/RX rings, even if only a limited number of
	 * channels can be allocated.
	 */
2193 error = hn_create_tx_data(sc, tx_ring_cnt);
2196 error = hn_create_rx_data(sc, ring_cnt);
2201 * Create transaction context for NVS and RNDIS transactions.
2203 sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
2204 HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
2205 if (sc->hn_xact == NULL) {
	/*
	 * Install the orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
2219 vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
2220 if (vmbus_chan_is_revoked(sc->hn_prichan)) {
2226 * Attach the synthetic parts, i.e. NVS and RNDIS.
2228 error = hn_synth_attach(sc, ETHERMTU);
2232 error = hn_rndis_get_eaddr(sc, eaddr);
2236 error = hn_rndis_get_mtu(sc, &mtu);
2239 else if (bootverbose)
2240 device_printf(dev, "RNDIS mtu %u\n", mtu);
2242 #if __FreeBSD_version >= 1100099
2243 if (sc->hn_rx_ring_inuse > 1) {
2245 * Reduce TCP segment aggregation limit for multiple
2246 * RX rings to increase ACK timeliness.
2248 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
2253 * Fix up TX/RX settings after the synthetic parts are attached.
2255 hn_fixup_tx_data(sc);
2256 hn_fixup_rx_data(sc);
2258 ctx = device_get_sysctl_ctx(dev);
2259 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2260 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2261 &sc->hn_nvs_ver, 0, "NVS version");
2262 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2263 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2264 hn_ndis_version_sysctl, "A", "NDIS version");
2265 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2266 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2267 hn_caps_sysctl, "A", "capabilities");
2268 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2269 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2270 hn_hwassist_sysctl, "A", "hwassist");
2271 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2272 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2273 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2274 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2275 "max # of TSO segments");
2276 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2277 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2278 "max size of TSO segment");
2279 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2280 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2281 hn_rxfilter_sysctl, "A", "rxfilter");
2282 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2283 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2284 hn_rss_hash_sysctl, "A", "RSS hash");
2285 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2286 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2287 hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2288 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2289 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2290 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2291 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2292 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2293 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2294 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2295 hn_rss_key_sysctl, "IU", "RSS key");
2296 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2297 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2298 hn_rss_ind_sysctl, "IU", "RSS indirect table");
2299 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2300 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2301 "RNDIS offered packet transmission aggregation size limit");
2302 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2303 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2304 "RNDIS offered packet transmission aggregation count limit");
2305 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2306 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2307 "RNDIS packet transmission aggregation alignment");
2308 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2309 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2310 hn_txagg_size_sysctl, "I",
2311 "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2312 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2313 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2314 hn_txagg_pkts_sysctl, "I",
2315 "Packet transmission aggregation packets, "
2316 "0 -- disable, -1 -- auto");
2317 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2318 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2319 hn_polling_sysctl, "I",
2320 "Polling frequency: [100,1000000], 0 to disable polling");
2321 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2322 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2323 hn_vf_sysctl, "A", "Virtual Function's name");
2325 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2326 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2327 hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2329 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2330 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2331 hn_xpnt_vf_enabled_sysctl, "I",
2332 "Transparent VF enabled");
2333 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2334 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2335 hn_xpnt_vf_accbpf_sysctl, "I",
2336 "Accurate BPF for transparent VF");
2340 * Set up the ifmedia, which was initialized earlier.
2342 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2343 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2344 /* XXX ifmedia_set really should do this for us */
2345 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2348 * Set up the ifnet for this interface.
2352 ifp->if_baudrate = IF_Gbps(10);
2354 /* if_baudrate is 32 bits on 32-bit systems. */
2355 ifp->if_baudrate = IF_Gbps(1);
2357 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2358 ifp->if_ioctl = hn_ioctl;
2359 ifp->if_init = hn_init;
2360 #ifdef HN_IFSTART_SUPPORT
2361 if (hn_use_if_start) {
2362 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2364 ifp->if_start = hn_start;
2365 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2366 ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2367 IFQ_SET_READY(&ifp->if_snd);
2371 ifp->if_transmit = hn_transmit;
2372 ifp->if_qflush = hn_xmit_qflush;
2375 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2377 /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2378 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2380 if (sc->hn_caps & HN_CAP_VLAN) {
2381 /* XXX not sure about VLAN_MTU. */
2382 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2385 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2386 if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2387 ifp->if_capabilities |= IFCAP_TXCSUM;
2388 if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2389 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2390 if (sc->hn_caps & HN_CAP_TSO4) {
2391 ifp->if_capabilities |= IFCAP_TSO4;
2392 ifp->if_hwassist |= CSUM_IP_TSO;
2394 if (sc->hn_caps & HN_CAP_TSO6) {
2395 ifp->if_capabilities |= IFCAP_TSO6;
2396 ifp->if_hwassist |= CSUM_IP6_TSO;
2399 /* Enable all available capabilities by default. */
2400 ifp->if_capenable = ifp->if_capabilities;
2403 * Disable IPv6 TSO and TXCSUM by default; they can still
2404 * be enabled through SIOCSIFCAP.
2406 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2407 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2409 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2411 * Lock hn_set_tso_maxsize() to simplify its internal logic.
2415 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2417 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2418 ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2421 ether_ifattach(ifp, eaddr);
2423 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2424 if_printf(ifp, "TSO segcnt %u segsz %u\n",
2425 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2427 if (mtu < ETHERMTU) {
2428 if_printf(ifp, "fixup mtu %lu -> %u\n", ifp->if_mtu, mtu);
2432 /* Inform the upper layer about the long frame support. */
2433 ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2436 * Kick off link status check.
2438 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2439 hn_update_link_status(sc);
2442 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2443 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2444 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2445 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2447 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2448 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2453 * Subscribe to the ether_ifattach event instead of the ifnet_arrival
2454 * event, since the interface's LLADDR is needed; the LLADDR is not
2455 * available when the ifnet_arrival event is triggered.
2457 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2458 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2459 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2460 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2464 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2465 hn_synth_detach(sc);
2471 hn_detach(device_t dev)
2473 struct hn_softc *sc = device_get_softc(dev);
2474 struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2476 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2478 * In case the vmbus missed the orphan handler installation.
2481 vmbus_xact_ctx_orphan(sc->hn_xact);
2484 if (sc->hn_ifaddr_evthand != NULL)
2485 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2486 if (sc->hn_ifnet_evthand != NULL)
2487 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2488 if (sc->hn_ifnet_atthand != NULL) {
2489 EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2490 sc->hn_ifnet_atthand);
2492 if (sc->hn_ifnet_dethand != NULL) {
2493 EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2494 sc->hn_ifnet_dethand);
2496 if (sc->hn_ifnet_lnkhand != NULL)
2497 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
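/*
 * Detach the VF, if it is attached.  Snapshot hn_vf_ifp first; the
 * compiler barrier keeps the load from being reordered or repeated
 * around the detach event handler, which may clear hn_vf_ifp.
 */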
2499 vf_ifp = sc->hn_vf_ifp;
2500 __compiler_membar();
2502 hn_ifnet_detevent(sc, vf_ifp);
2504 if (device_is_attached(dev)) {
2506 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2507 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2511 * hn_stop() only suspends data, so management
2512 * tasks have to be suspended manually here.
2514 hn_suspend_mgmt(sc);
2515 hn_synth_detach(sc);
2518 ether_ifdetach(ifp);
2521 ifmedia_removeall(&sc->hn_media);
2522 hn_destroy_rx_data(sc);
2523 hn_destroy_tx_data(sc);
2525 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2528 for (i = 0; i < hn_tx_taskq_cnt; ++i)
2529 taskqueue_free(sc->hn_tx_taskqs[i]);
2530 free(sc->hn_tx_taskqs, M_DEVBUF);
2532 taskqueue_free(sc->hn_mgmt_taskq0);
2533 if (sc->hn_vf_taskq != NULL)
2534 taskqueue_free(sc->hn_vf_taskq);
2536 if (sc->hn_xact != NULL) {
2538 * Uninstall the orphan handler _before_ the xact is destroyed.
2541 vmbus_chan_unset_orphan(sc->hn_prichan);
2542 vmbus_xact_ctx_destroy(sc->hn_xact);
2547 HN_LOCK_DESTROY(sc);
2548 rm_destroy(&sc->hn_vf_lock);
2553 hn_shutdown(device_t dev)
2560 hn_link_status(struct hn_softc *sc)
2562 uint32_t link_status;
2565 error = hn_rndis_get_linkstatus(sc, &link_status);
2567 /* XXX what to do? */
2571 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2572 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2574 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2575 if_link_state_change(sc->hn_ifp,
2576 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2577 LINK_STATE_UP : LINK_STATE_DOWN);
2581 hn_link_taskfunc(void *xsc, int pending __unused)
2583 struct hn_softc *sc = xsc;
2585 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2591 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2593 struct hn_softc *sc = xsc;
2595 /* Prevent any link status checks from running. */
2596 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2599 * Fake up a [link down --> link up] state change; a 5 second
2600 * delay is used, which closely simulates the miibus reaction
2601 * to a link down event.
2603 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2604 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2605 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2606 &sc->hn_netchg_status, 5 * hz);
2610 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2612 struct hn_softc *sc = xsc;
2614 /* Re-allow link status checks. */
2615 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2620 hn_update_link_status(struct hn_softc *sc)
2623 if (sc->hn_mgmt_taskq != NULL)
2624 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2628 hn_change_network(struct hn_softc *sc)
2631 if (sc->hn_mgmt_taskq != NULL)
2632 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2636 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2637 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2639 struct mbuf *m = *m_head;
2642 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2644 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2645 m, segs, nsegs, BUS_DMA_NOWAIT);
2646 if (error == EFBIG) {
2649 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2653 *m_head = m = m_new;
2654 txr->hn_tx_collapsed++;
2656 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2657 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2660 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2661 BUS_DMASYNC_PREWRITE);
2662 txd->flags |= HN_TXD_FLAG_DMAMAP;
2668 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2671 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2672 ("put an onlist txd %#x", txd->flags));
2673 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2674 ("put an onagg txd %#x", txd->flags));
2676 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
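/* Drop one reference; only the last reference actually frees the txd. */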
2677 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2680 if (!STAILQ_EMPTY(&txd->agg_list)) {
2681 struct hn_txdesc *tmp_txd;
2683 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2686 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2687 ("recursive aggregation on aggregated txdesc"));
2688 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2689 ("not aggregated txdesc"));
2690 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2691 ("aggregated txdesc uses dmamap"));
2692 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2693 ("aggregated txdesc consumes "
2694 "chimney sending buffer"));
2695 KASSERT(tmp_txd->chim_size == 0,
2696 ("aggregated txdesc has non-zero "
2697 "chimney sending size"));
2699 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2700 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2701 freed = hn_txdesc_put(txr, tmp_txd);
2702 KASSERT(freed, ("failed to free aggregated txdesc"));
2706 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2707 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2708 ("chim txd uses dmamap"));
2709 hn_chim_free(txr->hn_sc, txd->chim_index);
2710 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2712 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2713 bus_dmamap_sync(txr->hn_tx_data_dtag,
2714 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2715 bus_dmamap_unload(txr->hn_tx_data_dtag,
2717 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2720 if (txd->m != NULL) {
2725 txd->flags |= HN_TXD_FLAG_ONLIST;
2726 #ifndef HN_USE_TXDESC_BUFRING
2727 mtx_lock_spin(&txr->hn_txlist_spin);
2728 KASSERT(txr->hn_txdesc_avail >= 0 &&
2729 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2730 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2731 txr->hn_txdesc_avail++;
2732 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2733 mtx_unlock_spin(&txr->hn_txlist_spin);
2734 #else /* HN_USE_TXDESC_BUFRING */
2736 atomic_add_int(&txr->hn_txdesc_avail, 1);
2738 buf_ring_enqueue(txr->hn_txdesc_br, txd);
2739 #endif /* !HN_USE_TXDESC_BUFRING */
2744 static __inline struct hn_txdesc *
2745 hn_txdesc_get(struct hn_tx_ring *txr)
2747 struct hn_txdesc *txd;
2749 #ifndef HN_USE_TXDESC_BUFRING
2750 mtx_lock_spin(&txr->hn_txlist_spin);
2751 txd = SLIST_FIRST(&txr->hn_txlist);
2753 KASSERT(txr->hn_txdesc_avail > 0,
2754 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2755 txr->hn_txdesc_avail--;
2756 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2758 mtx_unlock_spin(&txr->hn_txlist_spin);
2760 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2764 #ifdef HN_USE_TXDESC_BUFRING
2766 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2768 #endif /* HN_USE_TXDESC_BUFRING */
2769 KASSERT(txd->m == NULL && txd->refs == 0 &&
2770 STAILQ_EMPTY(&txd->agg_list) &&
2771 txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2772 txd->chim_size == 0 &&
2773 (txd->flags & HN_TXD_FLAG_ONLIST) &&
2774 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2775 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2776 txd->flags &= ~HN_TXD_FLAG_ONLIST;
2782 static __inline void
2783 hn_txdesc_hold(struct hn_txdesc *txd)
2786 /* A 0->1 refcount transition is invalid here; the caller must already hold a reference. */
2787 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2788 atomic_add_int(&txd->refs, 1);
2791 static __inline void
2792 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2795 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2796 ("recursive aggregation on aggregating txdesc"));
2798 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2799 ("already aggregated"));
2800 KASSERT(STAILQ_EMPTY(&txd->agg_list),
2801 ("recursive aggregation on to-be-aggregated txdesc"));
2803 txd->flags |= HN_TXD_FLAG_ONAGG;
2804 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
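/*
 * Returns true if any TX descriptor of this ring is still outstanding,
 * i.e. not all descriptors have been returned to the free list/ring.
 */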
2808 hn_tx_ring_pending(struct hn_tx_ring *txr)
2810 bool pending = false;
2812 #ifndef HN_USE_TXDESC_BUFRING
2813 mtx_lock_spin(&txr->hn_txlist_spin);
2814 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2816 mtx_unlock_spin(&txr->hn_txlist_spin);
2818 if (!buf_ring_full(txr->hn_txdesc_br))
2824 static __inline void
2825 hn_txeof(struct hn_tx_ring *txr)
2827 txr->hn_has_txeof = 0;
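/*
 * Transmission completion callback: recycle the txd and count the
 * completion; after HN_EARLY_TXEOF_THRESH completions on an
 * out-of-active ring, TX reclamation is kicked early.
 */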
2832 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2833 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2835 struct hn_txdesc *txd = sndc->hn_cbarg;
2836 struct hn_tx_ring *txr;
2839 KASSERT(txr->hn_chan == chan,
2840 ("channel mismatch, on chan%u, should be chan%u",
2841 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2843 txr->hn_has_txeof = 1;
2844 hn_txdesc_put(txr, txd);
2846 ++txr->hn_txdone_cnt;
2847 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2848 txr->hn_txdone_cnt = 0;
2849 if (txr->hn_oactive)
2855 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2857 #if defined(INET) || defined(INET6)
2858 struct lro_ctrl *lro = &rxr->hn_lro;
2859 struct lro_entry *queued;
2861 while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
2862 SLIST_REMOVE_HEAD(&lro->lro_active, next);
2863 tcp_lro_flush(lro, queued);
2869 * 'txr' could be NULL, if multiple channels and the
2870 * ifnet.if_start method are used.
2872 if (txr == NULL || !txr->hn_has_txeof)
2875 txr->hn_txdone_cnt = 0;
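/*
 * RNDIS packet message offsets count from the rm_dataoffset field
 * rather than from the beginning of the message; convert a
 * message-relative offset accordingly.
 */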
2879 static __inline uint32_t
2880 hn_rndis_pktmsg_offset(uint32_t ofs)
2883 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2884 ("invalid RNDIS packet msg offset %u", ofs));
2885 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2888 static __inline void *
2889 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2890 size_t pi_dlen, uint32_t pi_type)
2892 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2893 struct rndis_pktinfo *pi;
2895 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2896 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2899 * Per-packet-info does not move; it only grows.
2902 * rm_pktinfooffset in this phase counts from the beginning
2903 * of rndis_packet_msg.
2905 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2906 ("%u pktinfo overflows RNDIS packet msg", pi_type));
2907 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2908 pkt->rm_pktinfolen);
2909 pkt->rm_pktinfolen += pi_size;
2911 pi->rm_size = pi_size;
2912 pi->rm_type = pi_type;
2913 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2915 return (pi->rm_data);
2919 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2921 struct hn_txdesc *txd;
2925 txd = txr->hn_agg_txd;
2926 KASSERT(txd != NULL, ("no aggregate txdesc"));
2929 * Since hn_txpkt() will reset this temporary stat, save
2930 * it now, so that oerrors can be updated properly if
2931 * hn_txpkt() ever fails.
2933 pkts = txr->hn_stat_pkts;
2936 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2937 * failure, save it for later freeing, if hn_txpkt() ever fails.
2941 error = hn_txpkt(ifp, txr, txd);
2942 if (__predict_false(error)) {
2943 /* txd is freed, but m is not. */
2946 txr->hn_flush_failed++;
2947 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2950 /* Reset all aggregation states. */
2951 txr->hn_agg_txd = NULL;
2952 txr->hn_agg_szleft = 0;
2953 txr->hn_agg_pktleft = 0;
2954 txr->hn_agg_prevpkt = NULL;
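/*
 * Try to reserve chimney sending buffer space for this packet,
 * aggregating it into a pending txdesc when possible.  Returns a
 * pointer into the chimney buffer where the RNDIS packet message
 * should be built, or NULL if the chimney buffer cannot be used.
 */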
2960 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2965 if (txr->hn_agg_txd != NULL) {
2966 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2967 struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2968 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2972 * Update the previous RNDIS packet's total length;
2973 * it can be increased due to the mandatory alignment
2974 * padding for this RNDIS packet.  And update the
2975 * aggregating txdesc's chimney sending buffer size accordingly.
2979 * Zero-out the padding, as required by the RNDIS spec.
2982 pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2983 agg_txd->chim_size += pkt->rm_len - olen;
2985 /* Link this txdesc to the parent. */
2986 hn_txdesc_agg(agg_txd, txd);
2988 chim = (uint8_t *)pkt + pkt->rm_len;
2989 /* Save the current packet for later fixup. */
2990 txr->hn_agg_prevpkt = chim;
2992 txr->hn_agg_pktleft--;
2993 txr->hn_agg_szleft -= pktsize;
2994 if (txr->hn_agg_szleft <=
2995 HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2997 * Probably can't aggregate more packets;
2998 * flush this aggregating txdesc proactively.
3000 txr->hn_agg_pktleft = 0;
3005 hn_flush_txagg(ifp, txr);
3007 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3009 txr->hn_tx_chimney_tried++;
3010 txd->chim_index = hn_chim_alloc(txr->hn_sc);
3011 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
3013 txr->hn_tx_chimney++;
3015 chim = txr->hn_sc->hn_chim +
3016 (txd->chim_index * txr->hn_sc->hn_chim_szmax);
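/*
 * Start a new aggregation on this txdesc, if the per-txdesc packet
 * count and size budgets leave room for more packets after this one.
 */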
3018 if (txr->hn_agg_pktmax > 1 &&
3019 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
3020 txr->hn_agg_txd = txd;
3021 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
3022 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
3023 txr->hn_agg_prevpkt = chim;
3030 * If this function fails, then both txd and m_head0 will be freed.
3033 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3034 struct mbuf **m_head0)
3036 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3037 int error, nsegs, i;
3038 struct mbuf *m_head = *m_head0;
3039 struct rndis_packet_msg *pkt;
3042 int pkt_hlen, pkt_size;
3044 pkt = txd->rndis_pkt;
3045 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3046 if (pkt_size < txr->hn_chim_size) {
3047 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3051 if (txr->hn_agg_txd != NULL)
3052 hn_flush_txagg(ifp, txr);
3055 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3056 pkt->rm_len = m_head->m_pkthdr.len;
3057 pkt->rm_dataoffset = 0;
3058 pkt->rm_datalen = m_head->m_pkthdr.len;
3059 pkt->rm_oobdataoffset = 0;
3060 pkt->rm_oobdatalen = 0;
3061 pkt->rm_oobdataelements = 0;
3062 pkt->rm_pktinfooffset = sizeof(*pkt);
3063 pkt->rm_pktinfolen = 0;
3064 pkt->rm_vchandle = 0;
3065 pkt->rm_reserved = 0;
3067 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3069 * Set the hash value for this packet, so that the host could
3070 * dispatch the TX done event for this packet back to this TX ring's channel.
3073 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3074 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3075 *pi_data = txr->hn_tx_idx;
3078 if (m_head->m_flags & M_VLANTAG) {
3079 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3080 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3081 *pi_data = NDIS_VLAN_INFO_MAKE(
3082 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3083 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3084 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3087 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3088 #if defined(INET6) || defined(INET)
3089 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3090 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3092 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3093 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3094 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3095 m_head->m_pkthdr.tso_segsz);
3098 #if defined(INET6) && defined(INET)
3103 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3104 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3105 m_head->m_pkthdr.tso_segsz);
3108 #endif /* INET6 || INET */
3109 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3110 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3111 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3112 if (m_head->m_pkthdr.csum_flags &
3113 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3114 *pi_data = NDIS_TXCSUM_INFO_IPV6;
3116 *pi_data = NDIS_TXCSUM_INFO_IPV4;
3117 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3118 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
3121 if (m_head->m_pkthdr.csum_flags &
3122 (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3123 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3124 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3125 } else if (m_head->m_pkthdr.csum_flags &
3126 (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3127 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3128 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3132 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3133 /* Fixup RNDIS packet message total length */
3134 pkt->rm_len += pkt_hlen;
3135 /* Convert RNDIS packet message offsets */
3136 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3137 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3140 * Fast path: Chimney sending.
3143 struct hn_txdesc *tgt_txd = txd;
3145 if (txr->hn_agg_txd != NULL) {
3146 tgt_txd = txr->hn_agg_txd;
3152 KASSERT(pkt == chim,
3153 ("RNDIS pkt not in chimney sending buffer"));
3154 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3155 ("chimney sending buffer is not used"));
3156 tgt_txd->chim_size += pkt->rm_len;
3158 m_copydata(m_head, 0, m_head->m_pkthdr.len,
3159 ((uint8_t *)chim) + pkt_hlen);
3161 txr->hn_gpa_cnt = 0;
3162 txr->hn_sendpkt = hn_txpkt_chim;
3166 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3167 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3168 ("chimney buffer is used"));
3169 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3171 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3172 if (__predict_false(error)) {
3176 * This mbuf is not linked w/ the txd yet, so free it now.
3181 freed = hn_txdesc_put(txr, txd);
3183 ("fail to free txd upon txdma error"));
3185 txr->hn_txdma_failed++;
3186 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3191 /* +1 RNDIS packet message */
3192 txr->hn_gpa_cnt = nsegs + 1;
3194 /* send packet with page buffer */
3195 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3196 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3197 txr->hn_gpa[0].gpa_len = pkt_hlen;
3200 * Fill the page buffers with mbuf info after the page
3201 * buffer for RNDIS packet message.
3203 for (i = 0; i < nsegs; ++i) {
3204 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3206 gpa->gpa_page = atop(segs[i].ds_addr);
3207 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3208 gpa->gpa_len = segs[i].ds_len;
3211 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3213 txr->hn_sendpkt = hn_txpkt_sglist;
3217 /* Set the completion routine */
3218 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3220 /* Update temporary stats for later use. */
3221 txr->hn_stat_pkts++;
3222 txr->hn_stat_size += m_head->m_pkthdr.len;
3223 if (m_head->m_flags & M_MCAST)
3224 txr->hn_stat_mcasts++;
3231 * If this function fails, then txd will be freed, but the mbuf
3232 * associated w/ the txd will _not_ be freed.
3235 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3237 int error, send_failed = 0, has_bpf;
3240 has_bpf = bpf_peers_present(ifp->if_bpf);
3243 * Make sure that this txd and any aggregated txds are not
3244 * freed before ETHER_BPF_MTAP.
3246 hn_txdesc_hold(txd);
3248 error = txr->hn_sendpkt(txr, txd);
3251 const struct hn_txdesc *tmp_txd;
3253 ETHER_BPF_MTAP(ifp, txd->m);
3254 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3255 ETHER_BPF_MTAP(ifp, tmp_txd->m);
3258 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3259 #ifdef HN_IFSTART_SUPPORT
3260 if (!hn_use_if_start)
3263 if_inc_counter(ifp, IFCOUNTER_OBYTES,
3265 if (txr->hn_stat_mcasts != 0) {
3266 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3267 txr->hn_stat_mcasts);
3270 txr->hn_pkts += txr->hn_stat_pkts;
3274 hn_txdesc_put(txr, txd);
3276 if (__predict_false(error)) {
3280 * This should "really rarely" happen.
3282 * XXX Too many RX to be acked or too many sideband
3283 * commands to run? Ask netvsc_channel_rollup()
3284 * to kick start later.
3286 txr->hn_has_txeof = 1;
3288 txr->hn_send_failed++;
3291 * Try sending again after setting hn_has_txeof,
3292 * in case we missed the last
3293 * netvsc_channel_rollup().
3297 if_printf(ifp, "send failed\n");
3300 * Caller will perform further processing on the
3301 * associated mbuf, so don't free it in hn_txdesc_put();
3302 * only unload it from the DMA map in hn_txdesc_put(), if it was loaded.
3306 freed = hn_txdesc_put(txr, txd);
3308 ("fail to free txd upon send error"));
3310 txr->hn_send_failed++;
3313 /* Reset temporary stats, after this sending is done. */
3314 txr->hn_stat_size = 0;
3315 txr->hn_stat_pkts = 0;
3316 txr->hn_stat_mcasts = 0;
3322 * Append the specified data to the indicated mbuf chain;
3323 * extend the mbuf chain if the new data does not fit in the existing space.
3326 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3327 * There should be an equivalent in the kernel mbuf code,
3328 * but there does not appear to be one yet.
3330 * Differs from m_append() in that additional mbufs are
3331 * allocated with cluster size MJUMPAGESIZE, and filled accordingly.
3334 * Return 1 if able to complete the job; otherwise 0.
3337 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3340 int remainder, space;
3342 for (m = m0; m->m_next != NULL; m = m->m_next)
3345 space = M_TRAILINGSPACE(m);
3348 * Copy into available space.
3350 if (space > remainder)
3352 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3357 while (remainder > 0) {
3359 * Allocate a new mbuf; could check space
3360 * and allocate a cluster instead.
3362 n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
3365 n->m_len = min(MJUMPAGESIZE, remainder);
3366 bcopy(cp, mtod(n, caddr_t), n->m_len);
3368 remainder -= n->m_len;
3372 if (m0->m_flags & M_PKTHDR)
3373 m0->m_pkthdr.len += len - remainder;
3375 return (remainder == 0);
3378 #if defined(INET) || defined(INET6)
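/*
 * Feed an mbuf into LRO: queue it for deferred processing when the
 * LRO mbuf queue is enabled, otherwise run tcp_lro_rx() inline.
 */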
3380 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3382 #if __FreeBSD_version >= 1100095
3383 if (hn_lro_mbufq_depth) {
3384 tcp_lro_queue_mbuf(lc, m);
3388 return tcp_lro_rx(lc, m, 0);
3393 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3394 const struct hn_rxinfo *info)
3396 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3398 int size, do_lro = 0, do_csum = 1, is_vf = 0;
3399 int hash_type = M_HASHTYPE_NONE;
3400 int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE;
3403 if (rxr->hn_rxvf_ifp != NULL) {
3405 * Non-transparent mode VF; pretend this packet is from the VF.
3408 ifp = rxr->hn_rxvf_ifp;
3410 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3411 /* Transparent mode VF. */
3415 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3418 * See the NOTE of hn_rndis_init_fixat().  This
3419 * function can be reached immediately after the
3420 * RNDIS is initialized but before the ifnet is
3421 * set up on the hn_attach() path; drop the unexpected packets.
3427 if (__predict_false(dlen < ETHER_HDR_LEN)) {
3428 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3432 if (dlen <= MHLEN) {
3433 m_new = m_gethdr(M_NOWAIT, MT_DATA);
3434 if (m_new == NULL) {
3435 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3438 memcpy(mtod(m_new, void *), data, dlen);
3439 m_new->m_pkthdr.len = m_new->m_len = dlen;
3440 rxr->hn_small_pkts++;
3443 * Get an mbuf with a cluster. For packets 2K or less,
3444 * get a standard 2K cluster. For anything larger, get a
3445 * 4K cluster. Any buffers larger than 4K can cause problems
3446 * if looped around to the Hyper-V TX channel, so avoid them.
3449 if (dlen > MCLBYTES) {
3451 size = MJUMPAGESIZE;
3454 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3455 if (m_new == NULL) {
3456 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3460 hv_m_append(m_new, dlen, data);
3462 m_new->m_pkthdr.rcvif = ifp;
3464 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3465 do_csum = 0;
3467 /* receive side checksum offload */
3468 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3469 /* IP csum offload */
3470 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3471 m_new->m_pkthdr.csum_flags |=
3472 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3476 /* TCP/UDP csum offload */
3477 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3478 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3479 m_new->m_pkthdr.csum_flags |=
3480 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3481 m_new->m_pkthdr.csum_data = 0xffff;
3482 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3490 * As of this writing (Oct 28th, 2016), the host side will turn
3491 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3492 * the do_lro setting here is actually _not_ accurate. We
3493 * depend on the RSS hash type check to reset do_lro.
3495 if ((info->csum_info &
3496 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3497 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3500 hn_rxpkt_proto(m_new, &l3proto, &l4proto);
3501 if (l3proto == ETHERTYPE_IP) {
3502 if (l4proto == IPPROTO_TCP) {
3504 (rxr->hn_trust_hcsum &
3505 HN_TRUST_HCSUM_TCP)) {
3506 rxr->hn_csum_trusted++;
3507 m_new->m_pkthdr.csum_flags |=
3508 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3509 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3510 m_new->m_pkthdr.csum_data = 0xffff;
3513 } else if (l4proto == IPPROTO_UDP) {
3515 (rxr->hn_trust_hcsum &
3516 HN_TRUST_HCSUM_UDP)) {
3517 rxr->hn_csum_trusted++;
3518 m_new->m_pkthdr.csum_flags |=
3519 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3520 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3521 m_new->m_pkthdr.csum_data = 0xffff;
3523 } else if (l4proto != IPPROTO_DONE && do_csum &&
3524 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3525 rxr->hn_csum_trusted++;
3526 m_new->m_pkthdr.csum_flags |=
3527 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3532 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3533 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3534 NDIS_VLAN_INFO_ID(info->vlan_info),
3535 NDIS_VLAN_INFO_PRI(info->vlan_info),
3536 NDIS_VLAN_INFO_CFI(info->vlan_info));
3537 m_new->m_flags |= M_VLANTAG;
3541 * If VF is activated (transparent/non-transparent mode does not
3546 * hn(4) will only receive broadcast packets, multicast packets,
3547 * TCP SYN and SYN|ACK (in Azure); LRO is useless for these packet types.
3550 * For non-transparent, we definitely _cannot_ enable LRO at
3551 * all, since the LRO flush will use hn(4) as the receiving
3552 * interface; i.e. hn_ifp->if_input(hn_ifp, m).
3558 * If VF is activated (transparent/non-transparent mode does not
3559 * matter here), do _not_ mess with unsupported hash types or functions.
3562 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3564 m_new->m_pkthdr.flowid = info->hash_value;
3566 hash_type = M_HASHTYPE_OPAQUE;
3567 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3568 NDIS_HASH_FUNCTION_TOEPLITZ) {
3569 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3574 * do_lro is reset if the hash types are not TCP
3575 * related.  See the comment in the above csum_flags setup section.
3579 case NDIS_HASH_IPV4:
3580 hash_type = M_HASHTYPE_RSS_IPV4;
3584 case NDIS_HASH_TCP_IPV4:
3585 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3586 if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) {
3587 int def_htype = M_HASHTYPE_OPAQUE;
3590 def_htype = M_HASHTYPE_NONE;
3593 * UDP 4-tuple hash is delivered as
3596 if (l3proto == ETHERTYPE_MAX) {
3597 hn_rxpkt_proto(m_new,
3598 &l3proto, &l4proto);
3600 if (l3proto == ETHERTYPE_IP) {
3601 if (l4proto == IPPROTO_UDP &&
3602 (rxr->hn_mbuf_hash &
3603 NDIS_HASH_UDP_IPV4_X)) {
3605 M_HASHTYPE_RSS_UDP_IPV4;
3607 } else if (l4proto !=
3609 hash_type = def_htype;
3613 hash_type = def_htype;
3619 case NDIS_HASH_IPV6:
3620 hash_type = M_HASHTYPE_RSS_IPV6;
3624 case NDIS_HASH_IPV6_EX:
3625 hash_type = M_HASHTYPE_RSS_IPV6_EX;
3629 case NDIS_HASH_TCP_IPV6:
3630 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3633 case NDIS_HASH_TCP_IPV6_EX:
3634 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3638 } else if (!is_vf) {
3639 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3641 M_HASHTYPE_SET(m_new, hash_type);
3643 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3644 if (hn_ifp != ifp) {
3645 const struct ether_header *eh;
3648 * Non-transparent mode VF is activated.
3652 * Allow tapping on hn(4).
3654 ETHER_BPF_MTAP(hn_ifp, m_new);
3657 * Update hn(4)'s stats.
3659 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3660 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3661 /* Checked at the beginning of this function. */
3662 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3663 eh = mtod(m_new, struct ether_header *);
3664 if (ETHER_IS_MULTICAST(eh->ether_dhost))
3665 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3669 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3670 #if defined(INET) || defined(INET6)
3671 struct lro_ctrl *lro = &rxr->hn_lro;
3674 rxr->hn_lro_tried++;
3675 if (hn_lro_rx(lro, m_new) == 0) {
3682 ifp->if_input(ifp, m_new);
3688 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3690 struct hn_softc *sc = ifp->if_softc;
3691 struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3692 struct ifnet *vf_ifp;
3693 int mask, error = 0;
3694 struct ifrsskey *ifrk;
3695 struct ifrsshash *ifrh;
3700 if (ifr->ifr_mtu > HN_MTU_MAX) {
3707 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3712 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3713 /* Can't change MTU */
3719 if (ifp->if_mtu == ifr->ifr_mtu) {
3724 if (hn_xpnt_vf_isready(sc)) {
3725 vf_ifp = sc->hn_vf_ifp;
3727 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3728 sizeof(ifr_vf.ifr_name));
3729 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3733 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3734 vf_ifp->if_xname, ifr->ifr_mtu, error);
3740 * Suspend this interface before the synthetic parts are detached.
3746 * Detach the synthetic parts, i.e. NVS and RNDIS.
3748 hn_synth_detach(sc);
3751 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3752 * with the new MTU setting.
3754 error = hn_synth_attach(sc, ifr->ifr_mtu);
3760 error = hn_rndis_get_mtu(sc, &mtu);
3763 else if (bootverbose)
3764 if_printf(ifp, "RNDIS mtu %u\n", mtu);
3767 * Commit the requested MTU after the synthetic parts
3768 * have been successfully attached.
3770 if (mtu >= ifr->ifr_mtu) {
3773 if_printf(ifp, "fixup mtu %d -> %u\n",
3779 * Synthetic parts' reattach may change the chimney
3780 * sending size; update it.
3782 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3783 hn_set_chim_size(sc, sc->hn_chim_szmax);
3786 * Make sure that various parameters based on MTU are
3787 * still valid after the MTU change.
3789 hn_mtu_change_fixup(sc);
3792 * All done! Resume the interface now.
3796 if ((sc->hn_flags & HN_FLAG_RXVF) ||
3797 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3799 * Since we have reattached the NVS part,
3800 * change the datapath to VF again, in case
3801 * it was lost after the NVS was detached.
3803 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3812 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3817 if (hn_xpnt_vf_isready(sc))
3818 hn_xpnt_vf_saveifflags(sc);
3820 if (ifp->if_flags & IFF_UP) {
3821 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3823 * Caller might hold a mutex, e.g.
3824 * bpf; use busy-wait for the RNDIS reply.
3828 hn_rxfilter_config(sc);
3831 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3832 error = hn_xpnt_vf_iocsetflags(sc);
3837 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3840 sc->hn_if_flags = ifp->if_flags;
3848 if (hn_xpnt_vf_isready(sc)) {
3850 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3851 sizeof(ifr_vf.ifr_name));
3852 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3858 * Fix up requested capabilities w/ supported capabilities,
3859 * since the supported capabilities could have been changed.
3861 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3864 if (mask & IFCAP_TXCSUM) {
3865 ifp->if_capenable ^= IFCAP_TXCSUM;
3866 if (ifp->if_capenable & IFCAP_TXCSUM)
3867 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3869 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3871 if (mask & IFCAP_TXCSUM_IPV6) {
3872 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3873 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3874 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3876 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3879 /* TODO: flip RNDIS offload parameters for RXCSUM. */
3880 if (mask & IFCAP_RXCSUM)
3881 ifp->if_capenable ^= IFCAP_RXCSUM;
3883 /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
3884 if (mask & IFCAP_RXCSUM_IPV6)
3885 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3888 if (mask & IFCAP_LRO)
3889 ifp->if_capenable ^= IFCAP_LRO;
3891 if (mask & IFCAP_TSO4) {
3892 ifp->if_capenable ^= IFCAP_TSO4;
3893 if (ifp->if_capenable & IFCAP_TSO4)
3894 ifp->if_hwassist |= CSUM_IP_TSO;
3896 ifp->if_hwassist &= ~CSUM_IP_TSO;
3898 if (mask & IFCAP_TSO6) {
3899 ifp->if_capenable ^= IFCAP_TSO6;
3900 if (ifp->if_capenable & IFCAP_TSO6)
3901 ifp->if_hwassist |= CSUM_IP6_TSO;
3903 ifp->if_hwassist &= ~CSUM_IP6_TSO;
3913 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3917 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3919 * Multicast uses a mutex; use busy-wait for the RNDIS reply.
3923 hn_rxfilter_config(sc);
3927 /* XXX vlan(4) style mcast addr maintenance */
3928 if (hn_xpnt_vf_isready(sc)) {
3931 old_if_flags = sc->hn_vf_ifp->if_flags;
3932 hn_xpnt_vf_saveifflags(sc);
3934 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3935 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3937 error = hn_xpnt_vf_iocsetflags(sc);
3946 if (hn_xpnt_vf_isready(sc)) {
3948 * SIOCGIFMEDIA expects ifmediareq, so don't
3949 * create and pass ifr_vf to the VF here; just
3950 * replace the ifr_name.
3952 vf_ifp = sc->hn_vf_ifp;
3953 strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3954 sizeof(ifr->ifr_name));
3955 error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3956 /* Restore the ifr_name. */
3957 strlcpy(ifr->ifr_name, ifp->if_xname,
3958 sizeof(ifr->ifr_name));
3963 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3966 case SIOCGIFRSSHASH:
3967 ifrh = (struct ifrsshash *)data;
3969 if (sc->hn_rx_ring_inuse == 1) {
3971 ifrh->ifrh_func = RSS_FUNC_NONE;
3972 ifrh->ifrh_types = 0;
3976 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3977 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3979 ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3980 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3985 ifrk = (struct ifrsskey *)data;
3987 if (sc->hn_rx_ring_inuse == 1) {
3989 ifrk->ifrk_func = RSS_FUNC_NONE;
3990 ifrk->ifrk_keylen = 0;
3993 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3994 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3996 ifrk->ifrk_func = RSS_FUNC_PRIVATE;
3997 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
3998 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
3999 NDIS_HASH_KEYSIZE_TOEPLITZ);
4004 error = ether_ioctl(ifp, cmd, data);
4011 hn_stop(struct hn_softc *sc, bool detaching)
4013 struct ifnet *ifp = sc->hn_ifp;
4018 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4019 ("synthetic parts were not attached"));
4021 /* Clear RUNNING bit ASAP. */
4022 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4024 /* Disable polling. */
4027 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
4028 KASSERT(sc->hn_vf_ifp != NULL,
4029 ("%s: VF is not attached", ifp->if_xname));
4031 /* Mark transparent mode VF as disabled. */
4032 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4036 * Datapath setting must happen _before_ bringing the VF down.
4039 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4042 * Bring the VF down.
4044 hn_xpnt_vf_saveifflags(sc);
4045 sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4046 hn_xpnt_vf_iocsetflags(sc);
4049 /* Suspend data transfers. */
4050 hn_suspend_data(sc);
4052 /* Clear OACTIVE bit. */
4053 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4054 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4055 sc->hn_tx_ring[i].hn_oactive = 0;
4058 * If the non-transparent mode VF is active, make sure
4059 * that the RX filter still allows packet reception.
4061 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4062 hn_rxfilter_config(sc);
4066 hn_init_locked(struct hn_softc *sc)
4068 struct ifnet *ifp = sc->hn_ifp;
4073 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4076 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4079 /* Configure RX filter */
4080 hn_rxfilter_config(sc);
4082 /* Clear OACTIVE bit. */
4083 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4084 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4085 sc->hn_tx_ring[i].hn_oactive = 0;
4087 /* Clear TX 'suspended' bit. */
4088 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4090 if (hn_xpnt_vf_isready(sc)) {
4091 /* Initialize transparent VF. */
4092 hn_xpnt_vf_init(sc);
4095 /* Everything is ready; unleash! */
4096 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4098 /* Re-enable polling if requested. */
4099 if (sc->hn_pollhz > 0)
4100 hn_polling(sc, sc->hn_pollhz);
4106 struct hn_softc *sc = xsc;
4113 #if __FreeBSD_version >= 1100099
4116 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4118 struct hn_softc *sc = arg1;
4119 unsigned int lenlim;
4122 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4123 error = sysctl_handle_int(oidp, &lenlim, 0, req);
4124 if (error || req->newptr == NULL)
4128 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4129 lenlim > TCP_LRO_LENGTH_MAX) {
4133 hn_set_lro_lenlim(sc, lenlim);
4140 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4142 struct hn_softc *sc = arg1;
4143 int ackcnt, error, i;
4146 * lro_ackcnt_lim is the append count limit;
4147 * +1 turns it into an aggregation limit.
4149 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4150 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4151 if (error || req->newptr == NULL)
4154 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4158 * Convert the aggregation limit back to an append count limit.
4163 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4164 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4172 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4174 struct hn_softc *sc = arg1;
4179 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4182 error = sysctl_handle_int(oidp, &on, 0, req);
4183 if (error || req->newptr == NULL)
4187 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4188 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4191 rxr->hn_trust_hcsum |= hcsum;
4193 rxr->hn_trust_hcsum &= ~hcsum;
4200 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4202 struct hn_softc *sc = arg1;
4203 int chim_size, error;
4205 chim_size = sc->hn_tx_ring[0].hn_chim_size;
4206 error = sysctl_handle_int(oidp, &chim_size, 0, req);
4207 if (error || req->newptr == NULL)
4210 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4214 hn_set_chim_size(sc, chim_size);
4219 #if __FreeBSD_version < 1100095
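/*
 * The stat sysctl handlers below sum a per-ring statistic located at
 * byte offset 'arg2' within each RX/TX ring, and zero that statistic
 * on all rings when a new value is written.
 */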
4221 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4223 struct hn_softc *sc = arg1;
4224 int ofs = arg2, i, error;
4225 struct hn_rx_ring *rxr;
4229 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
4230 rxr = &sc->hn_rx_ring[i];
4231 stat += *((int *)((uint8_t *)rxr + ofs));
4234 error = sysctl_handle_64(oidp, &stat, 0, req);
4235 if (error || req->newptr == NULL)
4238 /* Zero out this stat. */
4239 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
4240 rxr = &sc->hn_rx_ring[i];
4241 *((int *)((uint8_t *)rxr + ofs)) = 0;
4247 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4249 struct hn_softc *sc = arg1;
4250 int ofs = arg2, i, error;
4251 struct hn_rx_ring *rxr;
4255 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4256 rxr = &sc->hn_rx_ring[i];
4257 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4260 error = sysctl_handle_64(oidp, &stat, 0, req);
4261 if (error || req->newptr == NULL)
4264 /* Zero out this stat. */
4265 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4266 rxr = &sc->hn_rx_ring[i];
4267 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4275 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4277 struct hn_softc *sc = arg1;
4278 int ofs = arg2, i, error;
4279 struct hn_rx_ring *rxr;
4283 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4284 rxr = &sc->hn_rx_ring[i];
4285 stat += *((u_long *)((uint8_t *)rxr + ofs));
4288 error = sysctl_handle_long(oidp, &stat, 0, req);
4289 if (error || req->newptr == NULL)
4292 /* Zero out this stat. */
4293 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4294 rxr = &sc->hn_rx_ring[i];
4295 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
4301 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4303 struct hn_softc *sc = arg1;
4304 int ofs = arg2, i, error;
4305 struct hn_tx_ring *txr;
4309 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4310 txr = &sc->hn_tx_ring[i];
4311 stat += *((u_long *)((uint8_t *)txr + ofs));
4314 error = sysctl_handle_long(oidp, &stat, 0, req);
4315 if (error || req->newptr == NULL)
4318 /* Zero out this stat. */
4319 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4320 txr = &sc->hn_tx_ring[i];
4321 *((u_long *)((uint8_t *)txr + ofs)) = 0;
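/*
 * Report a TX ring configuration value from ring 0; on write,
 * propagate the new value to all TX rings.
 */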
4327 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4329 struct hn_softc *sc = arg1;
4330 int ofs = arg2, i, error, conf;
4331 struct hn_tx_ring *txr;
4333 txr = &sc->hn_tx_ring[0];
4334 conf = *((int *)((uint8_t *)txr + ofs));
4336 error = sysctl_handle_int(oidp, &conf, 0, req);
4337 if (error || req->newptr == NULL)
4341 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4342 txr = &sc->hn_tx_ring[i];
4343 *((int *)((uint8_t *)txr + ofs)) = conf;
4351 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4353 struct hn_softc *sc = arg1;
4356 size = sc->hn_agg_size;
4357 error = sysctl_handle_int(oidp, &size, 0, req);
4358 if (error || req->newptr == NULL)
4362 sc->hn_agg_size = size;
4370 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4372 struct hn_softc *sc = arg1;
4375 pkts = sc->hn_agg_pkts;
4376 error = sysctl_handle_int(oidp, &pkts, 0, req);
4377 if (error || req->newptr == NULL)
4381 sc->hn_agg_pkts = pkts;
4389 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4391 struct hn_softc *sc = arg1;
4394 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4395 return (sysctl_handle_int(oidp, &pkts, 0, req));
4399 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4401 struct hn_softc *sc = arg1;
4404 align = sc->hn_tx_ring[0].hn_agg_align;
4405 return (sysctl_handle_int(oidp, &align, 0, req));
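/* Enable channel polling at the given frequency; pollhz 0 disables it. */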
4409 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4412 vmbus_chan_poll_disable(chan);
4414 vmbus_chan_poll_enable(chan, pollhz);
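/*
 * Apply the polling setting to the primary channel and to all open
 * sub-channels of this interface.
 */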
4418 hn_polling(struct hn_softc *sc, u_int pollhz)
4420 int nsubch = sc->hn_rx_ring_inuse - 1;
4425 struct vmbus_channel **subch;
4428 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4429 for (i = 0; i < nsubch; ++i)
4430 hn_chan_polling(subch[i], pollhz);
4431 vmbus_subchan_rel(subch, nsubch);
4433 hn_chan_polling(sc->hn_prichan, pollhz);
4437 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4439 struct hn_softc *sc = arg1;
4442 pollhz = sc->hn_pollhz;
4443 error = sysctl_handle_int(oidp, &pollhz, 0, req);
4444 if (error || req->newptr == NULL)
4448 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4452 if (sc->hn_pollhz != pollhz) {
4453 sc->hn_pollhz = pollhz;
4454 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4455 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4456 hn_polling(sc, sc->hn_pollhz);
4464 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4466 struct hn_softc *sc = arg1;
4469 snprintf(verstr, sizeof(verstr), "%u.%u",
4470 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4471 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4472 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4476 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4478 struct hn_softc *sc = arg1;
4485 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4486 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4490 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4492 struct hn_softc *sc = arg1;
4493 char assist_str[128];
4497 hwassist = sc->hn_ifp->if_hwassist;
4499 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4500 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4504 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4506 struct hn_softc *sc = arg1;
4507 char filter_str[128];
4511 filter = sc->hn_rx_filter;
4513 snprintf(filter_str, sizeof(filter_str), "%b", filter,
4515 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4519 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4521 struct hn_softc *sc = arg1;
4526 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4527 if (error || req->newptr == NULL)
4530 if ((sc->hn_flags & HN_FLAG_RXVF) ||
4531 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4533 * RSS key is synchronized w/ the VF's; don't allow users to change it.
4540 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4543 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4545 if (sc->hn_rx_ring_inuse > 1) {
4546 error = hn_rss_reconfig(sc);
4548 /* Not RSS capable, at least for now; just save the RSS key. */
4557 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4559 struct hn_softc *sc = arg1;
4564 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4565 if (error || req->newptr == NULL)
4569 * Don't allow the RSS indirect table to be changed, if this
4570 * interface is currently not RSS capable.
4572 if (sc->hn_rx_ring_inuse == 1) {
4577 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4580 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4582 hn_rss_ind_fixup(sc);
4583 error = hn_rss_reconfig(sc);
4590 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4592 struct hn_softc *sc = arg1;
4597 hash = sc->hn_rss_hash;
4599 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4600 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4604 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4606 struct hn_softc *sc = arg1;
4611 hash = sc->hn_rss_hcap;
4613 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4614 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4618 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4620 struct hn_softc *sc = arg1;
4625 hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4627 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4628 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4632 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4634 struct hn_softc *sc = arg1;
4635 char vf_name[IFNAMSIZ + 1];
4636 struct ifnet *vf_ifp;
4640 vf_ifp = sc->hn_vf_ifp;
4642 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4644 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4648 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4650 struct hn_softc *sc = arg1;
4651 char vf_name[IFNAMSIZ + 1];
4652 struct ifnet *vf_ifp;
4656 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4658 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4660 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4664 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4666 struct rm_priotracker pt;
4671 error = sysctl_wire_old_buffer(req, 0);
4675 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4679 rm_rlock(&hn_vfmap_lock, &pt);
4682 for (i = 0; i < hn_vfmap_size; ++i) {
4685 if (hn_vfmap[i] == NULL)
4688 ifp = ifnet_byindex(i);
4691 sbuf_printf(sb, "%s", ifp->if_xname);
4693 sbuf_printf(sb, " %s", ifp->if_xname);
4698 rm_runlock(&hn_vfmap_lock, &pt);
4700 error = sbuf_finish(sb);
4706 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4708 struct rm_priotracker pt;
4713 error = sysctl_wire_old_buffer(req, 0);
4717 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4721 rm_rlock(&hn_vfmap_lock, &pt);
4724 for (i = 0; i < hn_vfmap_size; ++i) {
4725 struct ifnet *ifp, *hn_ifp;
4727 hn_ifp = hn_vfmap[i];
4731 ifp = ifnet_byindex(i);
4734 sbuf_printf(sb, "%s:%s", ifp->if_xname,
4737 sbuf_printf(sb, " %s:%s", ifp->if_xname,
4744 rm_runlock(&hn_vfmap_lock, &pt);
4746 error = sbuf_finish(sb);
4752 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4754 struct hn_softc *sc = arg1;
4755 int error, onoff = 0;
4757 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4759 error = sysctl_handle_int(oidp, &onoff, 0, req);
4760 if (error || req->newptr == NULL)
4764 /* NOTE: hn_vf_lock for hn_transmit() */
4765 rm_wlock(&sc->hn_vf_lock);
4767 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4769 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4770 rm_wunlock(&sc->hn_vf_lock);
4777 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4779 struct hn_softc *sc = arg1;
4782 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4784 return (sysctl_handle_int(oidp, &enabled, 0, req));
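/*
 * Sanity check an IPv4 packet starting at 'hoff' within the mbuf:
 * verify that the IP header, and the full TCP/UDP header for TCP/UDP
 * packets, are sane and entirely contained in the first mbuf.  Returns
 * the IP protocol on success, or IPPROTO_DONE if the packet should not
 * be further inspected (e.g. fragments or truncated headers).
 */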
4788 hn_check_iplen(const struct mbuf *m, int hoff)
4790 const struct ip *ip;
4791 int len, iphlen, iplen;
4792 const struct tcphdr *th;
4793 int thoff; /* TCP data offset */
4795 len = hoff + sizeof(struct ip);
4797 /* The packet must be at least the size of an IP header. */
4798 if (m->m_pkthdr.len < len)
4799 return IPPROTO_DONE;
4801 /* The fixed IP header must reside completely in the first mbuf. */
4803 return IPPROTO_DONE;
4805 ip = mtodo(m, hoff);
4807 /* Bound check the packet's stated IP header length. */
4808 iphlen = ip->ip_hl << 2;
4809 if (iphlen < sizeof(struct ip)) /* minimum header length */
4810 return IPPROTO_DONE;
4812 * The full IP header must reside completely in the first mbuf.
4813 if (m->m_len < hoff + iphlen)
4814 return IPPROTO_DONE;
4816 iplen = ntohs(ip->ip_len);
4819 * Check that the amount of data in the buffers is
4820 * at least as much as the IP header would have us expect.
4822 if (m->m_pkthdr.len < hoff + iplen)
4823 return IPPROTO_DONE;
4826 * Ignore IP fragments.
4828 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4829 return IPPROTO_DONE;
4832 * The TCP/IP or UDP/IP header must be entirely contained within
4833 * the first mbuf of the packet.
4837 if (iplen < iphlen + sizeof(struct tcphdr))
4838 return IPPROTO_DONE;
4839 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4840 return IPPROTO_DONE;
4841 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4842 thoff = th->th_off << 2;
4843 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4844 return IPPROTO_DONE;
4845 if (m->m_len < hoff + iphlen + thoff)
4846 return IPPROTO_DONE;
4849 if (iplen < iphlen + sizeof(struct udphdr))
4850 return IPPROTO_DONE;
4851 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4852 return IPPROTO_DONE;
4856 return IPPROTO_DONE;
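/*
 * Determine the L3/L4 protocols of a received Ethernet frame,
 * skipping an optional VLAN tag; *l4proto is set to IPPROTO_DONE
 * when the payload cannot be further classified.
 */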
4863 hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto)
4865 const struct ether_header *eh;
4870 /* Checked at the beginning of this function. */
4871 KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
4873 eh = mtod(m_new, const struct ether_header *);
4874 etype = ntohs(eh->ether_type);
4875 if (etype == ETHERTYPE_VLAN) {
4876 const struct ether_vlan_header *evl;
4878 hoff = sizeof(*evl);
4879 if (m_new->m_len < hoff)
4881 evl = mtod(m_new, const struct ether_vlan_header *);
4882 etype = ntohs(evl->evl_proto);
4886 if (etype == ETHERTYPE_IP)
4887 *l4proto = hn_check_iplen(m_new, hoff);
4889 *l4proto = IPPROTO_DONE;
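/*
 * Allocate the RXBUF shared by all channels, set up the per-channel
 * RX rings (bufring, LRO state, host-checksum trust flags) and
 * publish the dev.hn.UNIT.rx sysctl tree.
 */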
4893 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4895 struct sysctl_oid_list *child;
4896 struct sysctl_ctx_list *ctx;
4897 device_t dev = sc->hn_dev;
4898 #if defined(INET) || defined(INET6)
4899 #if __FreeBSD_version >= 1100095
4906 * Create RXBUF for reception.
4909 * - It is shared by all channels.
4910 * - A large enough buffer is allocated; certain versions of NVS
4911 * may further limit the usable space.
4913 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4914 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4915 BUS_DMA_WAITOK | BUS_DMA_ZERO);
4916 if (sc->hn_rxbuf == NULL) {
4917 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4921 sc->hn_rx_ring_cnt = ring_cnt;
4922 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4924 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4925 M_DEVBUF, M_WAITOK | M_ZERO);
4927 #if defined(INET) || defined(INET6)
4928 #if __FreeBSD_version >= 1100095
4929 lroent_cnt = hn_lro_entry_count;
4930 if (lroent_cnt < TCP_LRO_ENTRIES)
4931 lroent_cnt = TCP_LRO_ENTRIES;
4933 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4935 #endif /* INET || INET6 */
4937 ctx = device_get_sysctl_ctx(dev);
4938 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4940 /* Create dev.hn.UNIT.rx sysctl tree */
4941 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4942 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4944 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4945 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4947 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4948 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4949 &rxr->hn_br_dma, BUS_DMA_WAITOK);
4950 if (rxr->hn_br == NULL) {
4951 device_printf(dev, "allocate bufring failed\n");
4955 if (hn_trust_hosttcp)
4956 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4957 if (hn_trust_hostudp)
4958 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4959 if (hn_trust_hostip)
4960 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4961 rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4962 rxr->hn_ifp = sc->hn_ifp;
4963 if (i < sc->hn_tx_ring_cnt)
4964 rxr->hn_txr = &sc->hn_tx_ring[i];
4965 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4966 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4968 rxr->hn_rxbuf = sc->hn_rxbuf;
4973 #if defined(INET) || defined(INET6)
4974 #if __FreeBSD_version >= 1100095
4975 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4976 hn_lro_mbufq_depth);
4978 tcp_lro_init(&rxr->hn_lro);
4979 rxr->hn_lro.ifp = sc->hn_ifp;
4981 #if __FreeBSD_version >= 1100099
4982 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4983 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4985 #endif /* INET || INET6 */
4987 if (sc->hn_rx_sysctl_tree != NULL) {
4991 * Create per RX ring sysctl tree:
4992 * dev.hn.UNIT.rx.RINGID
4994 snprintf(name, sizeof(name), "%d", i);
4995 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
4996 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
4997 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4999 if (rxr->hn_rx_sysctl_tree != NULL) {
5000 SYSCTL_ADD_ULONG(ctx,
5001 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5002 OID_AUTO, "packets", CTLFLAG_RW,
5003 &rxr->hn_pkts, "# of packets received");
5004 SYSCTL_ADD_ULONG(ctx,
5005 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5006 OID_AUTO, "rss_pkts", CTLFLAG_RW,
5008 "# of packets w/ RSS info received");
5010 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
5011 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
5012 &rxr->hn_pktbuf_len, 0,
5013 "Temporary channel packet buffer length");
5018 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
5019 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5020 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
5021 #if __FreeBSD_version < 1100095
5022 hn_rx_stat_int_sysctl,
5024 hn_rx_stat_u64_sysctl,
5026 "LU", "LRO queued");
5027 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
5028 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5029 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
5030 #if __FreeBSD_version < 1100095
5031 hn_rx_stat_int_sysctl,
5033 hn_rx_stat_u64_sysctl,
5035 "LU", "LRO flushed");
5036 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
5037 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5038 __offsetof(struct hn_rx_ring, hn_lro_tried),
5039 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
5040 #if __FreeBSD_version >= 1100099
5041 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
5042 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5043 hn_lro_lenlim_sysctl, "IU",
5044 "Max # of data bytes to be aggregated by LRO");
5045 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
5046 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5047 hn_lro_ackcnt_sysctl, "I",
5048 "Max # of ACKs to be aggregated by LRO");
5050 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
5051 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
5052 hn_trust_hcsum_sysctl, "I",
5053 "Trust tcp segement verification on host side, "
5054 "when csum info is missing");
5055 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
5056 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
5057 hn_trust_hcsum_sysctl, "I",
5058 "Trust udp datagram verification on host side, "
5059 "when csum info is missing");
5060 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
5061 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
5062 hn_trust_hcsum_sysctl, "I",
5063 "Trust ip packet verification on host side, "
5064 "when csum info is missing");
5065 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5066 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5067 __offsetof(struct hn_rx_ring, hn_csum_ip),
5068 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5069 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5070 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5071 __offsetof(struct hn_rx_ring, hn_csum_tcp),
5072 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5073 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5074 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5075 __offsetof(struct hn_rx_ring, hn_csum_udp),
5076 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5077 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5078 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5079 __offsetof(struct hn_rx_ring, hn_csum_trusted),
5080 hn_rx_stat_ulong_sysctl, "LU",
5081 "# of packets that we trust host's csum verification");
5082 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5083 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5084 __offsetof(struct hn_rx_ring, hn_small_pkts),
5085 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5086 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5087 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5088 __offsetof(struct hn_rx_ring, hn_ack_failed),
5089 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5090 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5091 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5092 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5093 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
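/*
 * Free the RX resources created by hn_create_rx_data().  Buffers
 * that the hypervisor still references are deliberately left in
 * place instead of being freed.
 */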
5099 hn_destroy_rx_data(struct hn_softc *sc)
5103 if (sc->hn_rxbuf != NULL) {
5104 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5105 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5107 device_printf(sc->hn_dev, "RXBUF is referenced\n");
5108 sc->hn_rxbuf = NULL;
5111 if (sc->hn_rx_ring_cnt == 0)
5114 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5115 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5117 if (rxr->hn_br == NULL)
5119 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5120 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5122 device_printf(sc->hn_dev,
5123 "%dth channel bufring is referenced", i);
5127 #if defined(INET) || defined(INET6)
5128 tcp_lro_free(&rxr->hn_lro);
5130 free(rxr->hn_pktbuf, M_DEVBUF);
5132 free(sc->hn_rx_ring, M_DEVBUF);
5133 sc->hn_rx_ring = NULL;
5135 sc->hn_rx_ring_cnt = 0;
5136 sc->hn_rx_ring_inuse = 0;
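/*
 * Set up one TX ring: its txdesc pool, taskqueue bindings, the DMA
 * tags/maps for RNDIS packet messages and packet data, and the
 * per-ring sysctl nodes.
 */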
5140 hn_tx_ring_create(struct hn_softc *sc, int id)
5142 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5143 device_t dev = sc->hn_dev;
5144 bus_dma_tag_t parent_dtag;
5148 txr->hn_tx_idx = id;
5150 #ifndef HN_USE_TXDESC_BUFRING
5151 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5153 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5155 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5156 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5157 M_DEVBUF, M_WAITOK | M_ZERO);
5158 #ifndef HN_USE_TXDESC_BUFRING
5159 SLIST_INIT(&txr->hn_txlist);
5161 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5162 M_WAITOK, &txr->hn_tx_lock);
5165 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5166 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5167 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5169 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5172 #ifdef HN_IFSTART_SUPPORT
5173 if (hn_use_if_start) {
5174 txr->hn_txeof = hn_start_txeof;
5175 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5176 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5182 txr->hn_txeof = hn_xmit_txeof;
5183 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5184 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5186 br_depth = hn_get_txswq_depth(txr);
5187 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5188 M_WAITOK, &txr->hn_tx_lock);
5191 txr->hn_direct_tx_size = hn_direct_tx_size;
5194 * Always schedule transmission instead of trying to do direct
5195 * transmission, which gives the best performance so far.
5197 txr->hn_sched_tx = 1;
5199 parent_dtag = bus_get_dma_tag(dev);
5201 /* DMA tag for RNDIS packet messages. */
5202 error = bus_dma_tag_create(parent_dtag, /* parent */
5203 HN_RNDIS_PKT_ALIGN, /* alignment */
5204 HN_RNDIS_PKT_BOUNDARY, /* boundary */
5205 BUS_SPACE_MAXADDR, /* lowaddr */
5206 BUS_SPACE_MAXADDR, /* highaddr */
5207 NULL, NULL, /* filter, filterarg */
5208 HN_RNDIS_PKT_LEN, /* maxsize */
5210 HN_RNDIS_PKT_LEN, /* maxsegsize */
5212 NULL, /* lockfunc */
5213 NULL, /* lockfuncarg */
5214 &txr->hn_tx_rndis_dtag);
5216 device_printf(dev, "failed to create rndis dmatag\n");
5220 /* DMA tag for data. */
5221 error = bus_dma_tag_create(parent_dtag, /* parent */
5223 HN_TX_DATA_BOUNDARY, /* boundary */
5224 BUS_SPACE_MAXADDR, /* lowaddr */
5225 BUS_SPACE_MAXADDR, /* highaddr */
5226 NULL, NULL, /* filter, filterarg */
5227 HN_TX_DATA_MAXSIZE, /* maxsize */
5228 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
5229 HN_TX_DATA_SEGSIZE, /* maxsegsize */
5231 NULL, /* lockfunc */
5232 NULL, /* lockfuncarg */
5233 &txr->hn_tx_data_dtag);
5235 device_printf(dev, "failed to create data dmatag\n");
5239 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5240 struct hn_txdesc *txd = &txr->hn_txdesc[i];
5243 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5244 STAILQ_INIT(&txd->agg_list);
5247 * Allocate and load RNDIS packet message.
5249 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5250 (void **)&txd->rndis_pkt,
5251 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5252 &txd->rndis_pkt_dmap);
5255 "failed to allocate rndis_packet_msg, %d\n", i);
5259 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5260 txd->rndis_pkt_dmap,
5261 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5262 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5266 "failed to load rndis_packet_msg, %d\n", i);
5267 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5268 txd->rndis_pkt, txd->rndis_pkt_dmap);
5272 /* DMA map for TX data. */
5273 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5277 "failed to allocate tx data dmamap\n");
5278 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5279 txd->rndis_pkt_dmap);
5280 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5281 txd->rndis_pkt, txd->rndis_pkt_dmap);
5285 /* All set; put it on the list. */
5286 txd->flags |= HN_TXD_FLAG_ONLIST;
5287 #ifndef HN_USE_TXDESC_BUFRING
5288 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5290 buf_ring_enqueue(txr->hn_txdesc_br, txd);
5293 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5295 if (sc->hn_tx_sysctl_tree != NULL) {
5296 struct sysctl_oid_list *child;
5297 struct sysctl_ctx_list *ctx;
5301 * Create per TX ring sysctl tree:
5302 * dev.hn.UNIT.tx.RINGID
5304 ctx = device_get_sysctl_ctx(dev);
5305 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5307 snprintf(name, sizeof(name), "%d", id);
5308 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5309 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5311 if (txr->hn_tx_sysctl_tree != NULL) {
5312 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5315 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5316 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5317 "# of available TX descs");
5319 #ifdef HN_IFSTART_SUPPORT
5320 if (!hn_use_if_start)
5323 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5324 CTLFLAG_RD, &txr->hn_oactive, 0,
5327 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5328 CTLFLAG_RW, &txr->hn_pkts,
5329 "# of packets transmitted");
5330 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5331 CTLFLAG_RW, &txr->hn_sends, "# of sends");
5339 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5341 struct hn_tx_ring *txr = txd->txr;
5343 KASSERT(txd->m == NULL, ("still has mbuf installed"));
5344 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5346 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5347 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5348 txd->rndis_pkt_dmap);
5349 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5353 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5356 KASSERT(txd->refs == 0 || txd->refs == 1,
5357 ("invalid txd refs %d", txd->refs));
5359 /* Aggregated txds will be freed by their aggregating txd. */
5360 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5363 freed = hn_txdesc_put(txr, txd);
5364 KASSERT(freed, ("can't free txdesc"));
5369 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5373 if (txr->hn_txdesc == NULL)
5378 * Because the freeing of aggregated txds will be deferred
5379 * to the aggregating txd, two passes are used here:
5380 * - The first pass GCes any pending txds. This GC is necessary,
5381 * since if the channels are revoked, hypervisor will not
5382 * deliver send-done for all pending txds.
5383 * - The second pass frees the busdma resources, i.e. after all txds
5386 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5387 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5388 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5389 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5391 if (txr->hn_tx_data_dtag != NULL)
5392 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5393 if (txr->hn_tx_rndis_dtag != NULL)
5394 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5396 #ifdef HN_USE_TXDESC_BUFRING
5397 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5400 free(txr->hn_txdesc, M_DEVBUF);
5401 txr->hn_txdesc = NULL;
5403 if (txr->hn_mbuf_br != NULL)
5404 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5406 #ifndef HN_USE_TXDESC_BUFRING
5407 mtx_destroy(&txr->hn_txlist_spin);
5409 mtx_destroy(&txr->hn_tx_lock);
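/*
 * Allocate the chimney sending buffer (TXBUF) shared by all
 * channels, create every TX ring and publish the dev.hn.UNIT.tx
 * sysctl tree.
 */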
5413 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5415 struct sysctl_oid_list *child;
5416 struct sysctl_ctx_list *ctx;
5420 * Create TXBUF for chimney sending.
5422 * NOTE: It is shared by all channels.
5424 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5425 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5426 BUS_DMA_WAITOK | BUS_DMA_ZERO);
5427 if (sc->hn_chim == NULL) {
5428 device_printf(sc->hn_dev, "allocate txbuf failed\n");
5432 sc->hn_tx_ring_cnt = ring_cnt;
5433 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5435 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5436 M_DEVBUF, M_WAITOK | M_ZERO);
5438 ctx = device_get_sysctl_ctx(sc->hn_dev);
5439 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5441 /* Create dev.hn.UNIT.tx sysctl tree */
5442 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5443 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5445 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5448 error = hn_tx_ring_create(sc, i);
5453 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5454 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5455 __offsetof(struct hn_tx_ring, hn_no_txdescs),
5456 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5457 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5458 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5459 __offsetof(struct hn_tx_ring, hn_send_failed),
5460 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
5461 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5462 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5463 __offsetof(struct hn_tx_ring, hn_txdma_failed),
5464 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
5465 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5466 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5467 __offsetof(struct hn_tx_ring, hn_flush_failed),
5468 hn_tx_stat_ulong_sysctl, "LU",
5469 "# of packet transmission aggregation flush failure");
5470 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5471 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5472 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5473 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbufs collapsed");
5474 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5475 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5476 __offsetof(struct hn_tx_ring, hn_tx_chimney),
5477 hn_tx_stat_ulong_sysctl, "LU", "# of chimney sends");
5478 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5479 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5480 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5481 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5482 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5483 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5484 "# of total TX descs");
5485 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5486 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5487 "Chimney send packet size upper boundary");
5488 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5489 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5490 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5491 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5492 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5493 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5494 hn_tx_conf_int_sysctl, "I",
5495 "Size of the packet for direct transmission");
5496 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5497 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5498 __offsetof(struct hn_tx_ring, hn_sched_tx),
5499 hn_tx_conf_int_sysctl, "I",
5500 "Always schedule transmission "
5501 "instead of doing direct transmission");
5502 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5503 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5504 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5505 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5506 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5507 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5508 "Applied packet transmission aggregation size");
5509 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5510 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5511 hn_txagg_pktmax_sysctl, "I",
5512 "Applied packet transmission aggregation packets");
5513 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5514 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5515 hn_txagg_align_sysctl, "I",
5516 "Applied packet transmission aggregation alignment");
5522 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5526 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5527 sc->hn_tx_ring[i].hn_chim_size = chim_size;
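/*
 * Clamp the interface's TSO size limit between the minimum implied
 * by hn_ndis_tso_sgmin and the limits reported by NDIS (and by the
 * transparent VF, if one is ready).
 */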
5531 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5533 struct ifnet *ifp = sc->hn_ifp;
5539 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5542 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5543 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5544 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5546 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5547 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5548 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5550 if (tso_maxlen < tso_minlen)
5551 tso_maxlen = tso_minlen;
5552 else if (tso_maxlen > IP_MAXPACKET)
5553 tso_maxlen = IP_MAXPACKET;
5554 if (tso_maxlen > sc->hn_ndis_tso_szmax)
5555 tso_maxlen = sc->hn_ndis_tso_szmax;
5556 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5558 if (hn_xpnt_vf_isready(sc)) {
5559 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5560 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5562 ifp->if_hw_tsomax = hw_tsomax;
5564 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
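/*
 * Propagate negotiated capabilities to the TX rings: the effective
 * chimney size and the checksum-offload assist flags each ring may
 * advertise.
 */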
5568 hn_fixup_tx_data(struct hn_softc *sc)
5570 uint64_t csum_assist;
5573 hn_set_chim_size(sc, sc->hn_chim_szmax);
5574 if (hn_tx_chimney_size > 0 &&
5575 hn_tx_chimney_size < sc->hn_chim_szmax)
5576 hn_set_chim_size(sc, hn_tx_chimney_size);
5579 if (sc->hn_caps & HN_CAP_IPCS)
5580 csum_assist |= CSUM_IP;
5581 if (sc->hn_caps & HN_CAP_TCP4CS)
5582 csum_assist |= CSUM_IP_TCP;
5583 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5584 csum_assist |= CSUM_IP_UDP;
5585 if (sc->hn_caps & HN_CAP_TCP6CS)
5586 csum_assist |= CSUM_IP6_TCP;
5587 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5588 csum_assist |= CSUM_IP6_UDP;
5589 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5590 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5592 if (sc->hn_caps & HN_CAP_HASHVAL) {
5594 * Support HASHVAL pktinfo on TX path.
5597 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5598 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5599 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5604 hn_fixup_rx_data(struct hn_softc *sc)
5607 if (sc->hn_caps & HN_CAP_UDPHASH) {
5610 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
5611 sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH;
5616 hn_destroy_tx_data(struct hn_softc *sc)
5620 if (sc->hn_chim != NULL) {
5621 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5622 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5624 device_printf(sc->hn_dev,
5625 "chimney sending buffer is referenced");
5630 if (sc->hn_tx_ring_cnt == 0)
5633 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5634 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5636 free(sc->hn_tx_ring, M_DEVBUF);
5637 sc->hn_tx_ring = NULL;
5639 sc->hn_tx_ring_cnt = 0;
5640 sc->hn_tx_ring_inuse = 0;
5643 #ifdef HN_IFSTART_SUPPORT
5646 hn_start_taskfunc(void *xtxr, int pending __unused)
5648 struct hn_tx_ring *txr = xtxr;
5650 mtx_lock(&txr->hn_tx_lock);
5651 hn_start_locked(txr, 0);
5652 mtx_unlock(&txr->hn_tx_lock);
5656 hn_start_locked(struct hn_tx_ring *txr, int len)
5658 struct hn_softc *sc = txr->hn_sc;
5659 struct ifnet *ifp = sc->hn_ifp;
5662 KASSERT(hn_use_if_start,
5663 ("hn_start_locked is called, when if_start is disabled"));
5664 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5665 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5666 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5668 if (__predict_false(txr->hn_suspended))
5671 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5675 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5676 struct hn_txdesc *txd;
5677 struct mbuf *m_head;
5680 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5684 if (len > 0 && m_head->m_pkthdr.len > len) {
5686 * Sending this packet could be time consuming; let
5687 * callers dispatch it (and any follow-up
5688 * packets) to the tx taskqueue.
5690 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5695 #if defined(INET6) || defined(INET)
5696 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5697 m_head = hn_tso_fixup(m_head);
5698 if (__predict_false(m_head == NULL)) {
5699 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5702 } else if (m_head->m_pkthdr.csum_flags &
5703 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5704 m_head = hn_set_hlen(m_head);
5705 if (__predict_false(m_head == NULL)) {
5706 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5712 txd = hn_txdesc_get(txr);
5714 txr->hn_no_txdescs++;
5715 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5716 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5720 error = hn_encap(ifp, txr, txd, &m_head);
5722 /* Both txd and m_head are freed */
5723 KASSERT(txr->hn_agg_txd == NULL,
5724 ("encap failed w/ pending aggregating txdesc"));
5728 if (txr->hn_agg_pktleft == 0) {
5729 if (txr->hn_agg_txd != NULL) {
5730 KASSERT(m_head == NULL,
5731 ("pending mbuf for aggregating txdesc"));
5732 error = hn_flush_txagg(ifp, txr);
5733 if (__predict_false(error)) {
5734 atomic_set_int(&ifp->if_drv_flags,
5739 KASSERT(m_head != NULL, ("mbuf was freed"));
5740 error = hn_txpkt(ifp, txr, txd);
5741 if (__predict_false(error)) {
5742 /* txd is freed, but m_head is not */
5743 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5744 atomic_set_int(&ifp->if_drv_flags,
5752 KASSERT(txr->hn_agg_txd != NULL,
5753 ("no aggregating txdesc"));
5754 KASSERT(m_head == NULL,
5755 ("pending mbuf for aggregating txdesc"));
5760 /* Flush pending aggregated transmission. */
5761 if (txr->hn_agg_txd != NULL)
5762 hn_flush_txagg(ifp, txr);
5767 hn_start(struct ifnet *ifp)
5769 struct hn_softc *sc = ifp->if_softc;
5770 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5772 if (txr->hn_sched_tx)
5775 if (mtx_trylock(&txr->hn_tx_lock)) {
5778 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5779 mtx_unlock(&txr->hn_tx_lock);
5784 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5788 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5790 struct hn_tx_ring *txr = xtxr;
5792 mtx_lock(&txr->hn_tx_lock);
5793 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5794 hn_start_locked(txr, 0);
5795 mtx_unlock(&txr->hn_tx_lock);
5799 hn_start_txeof(struct hn_tx_ring *txr)
5801 struct hn_softc *sc = txr->hn_sc;
5802 struct ifnet *ifp = sc->hn_ifp;
5804 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5806 if (txr->hn_sched_tx)
5809 if (mtx_trylock(&txr->hn_tx_lock)) {
5812 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5813 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5814 mtx_unlock(&txr->hn_tx_lock);
5816 taskqueue_enqueue(txr->hn_tx_taskq,
5822 * Release the OACTIVE earlier, in the hope that
5823 * others could catch up.  The task will clear the
5824 * flag again with the hn_tx_lock held to avoid possible races.
5827 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5828 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5832 #endif /* HN_IFSTART_SUPPORT */
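/*
 * Drain the mbufs queued on this ring's buf_ring.  A packet larger
 * than 'len' is put back and a non-zero value is returned, asking
 * the caller to defer the rest of the work to the TX taskqueue.
 */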
5835 hn_xmit(struct hn_tx_ring *txr, int len)
5837 struct hn_softc *sc = txr->hn_sc;
5838 struct ifnet *ifp = sc->hn_ifp;
5839 struct mbuf *m_head;
5842 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5843 #ifdef HN_IFSTART_SUPPORT
5844 KASSERT(hn_use_if_start == 0,
5845 ("hn_xmit is called, when if_start is enabled"));
5847 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5849 if (__predict_false(txr->hn_suspended))
5852 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5855 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5856 struct hn_txdesc *txd;
5859 if (len > 0 && m_head->m_pkthdr.len > len) {
5861 * Sending this packet could be time consuming; let
5862 * callers dispatch it (and any follow-up
5863 * packets) to the tx taskqueue.
5865 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5870 txd = hn_txdesc_get(txr);
5872 txr->hn_no_txdescs++;
5873 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5874 txr->hn_oactive = 1;
5878 error = hn_encap(ifp, txr, txd, &m_head);
5880 /* Both txd and m_head are freed; discard */
5881 KASSERT(txr->hn_agg_txd == NULL,
5882 ("encap failed w/ pending aggregating txdesc"));
5883 drbr_advance(ifp, txr->hn_mbuf_br);
5887 if (txr->hn_agg_pktleft == 0) {
5888 if (txr->hn_agg_txd != NULL) {
5889 KASSERT(m_head == NULL,
5890 ("pending mbuf for aggregating txdesc"));
5891 error = hn_flush_txagg(ifp, txr);
5892 if (__predict_false(error)) {
5893 txr->hn_oactive = 1;
5897 KASSERT(m_head != NULL, ("mbuf was freed"));
5898 error = hn_txpkt(ifp, txr, txd);
5899 if (__predict_false(error)) {
5900 /* txd is freed, but m_head is not */
5901 drbr_putback(ifp, txr->hn_mbuf_br,
5903 txr->hn_oactive = 1;
5910 KASSERT(txr->hn_agg_txd != NULL,
5911 ("no aggregating txdesc"));
5912 KASSERT(m_head == NULL,
5913 ("pending mbuf for aggregating txdesc"));
5918 drbr_advance(ifp, txr->hn_mbuf_br);
5921 /* Flush pending aggregated transmission. */
5922 if (txr->hn_agg_txd != NULL)
5923 hn_flush_txagg(ifp, txr);
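/*
 * if_transmit method.  When the transparent VF is active the packet
 * is handed directly to the VF (with an optional BPF tap); otherwise
 * a TX ring is selected by flowid and the packet is queued on its
 * buf_ring.
 */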
5928 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5930 struct hn_softc *sc = ifp->if_softc;
5931 struct hn_tx_ring *txr;
5934 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5935 struct rm_priotracker pt;
5937 rm_rlock(&sc->hn_vf_lock, &pt);
5938 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5939 struct mbuf *m_bpf = NULL;
5942 obytes = m->m_pkthdr.len;
5943 if (m->m_flags & M_MCAST)
5946 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5947 if (bpf_peers_present(ifp->if_bpf)) {
5948 m_bpf = m_copypacket(m, M_NOWAIT);
5949 if (m_bpf == NULL) {
5951 * Failed to grab a shallow
5954 ETHER_BPF_MTAP(ifp, m);
5958 ETHER_BPF_MTAP(ifp, m);
5961 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5962 rm_runlock(&sc->hn_vf_lock, &pt);
5964 if (m_bpf != NULL) {
5966 ETHER_BPF_MTAP(ifp, m_bpf);
5970 if (error == ENOBUFS) {
5971 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5973 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5975 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5976 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5978 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5984 rm_runlock(&sc->hn_vf_lock, &pt);
5987 #if defined(INET6) || defined(INET)
5989 * Perform TSO packet header fixup or get l2/l3 header length now,
5990 * since packet headers should be cache-hot.
5992 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5993 m = hn_tso_fixup(m);
5994 if (__predict_false(m == NULL)) {
5995 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5998 } else if (m->m_pkthdr.csum_flags &
5999 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
6001 if (__predict_false(m == NULL)) {
6002 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
6009 * Select the TX ring based on flowid
6011 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
6012 #if defined(INET6) || defined(INET)
6015 if (m->m_pkthdr.len < 128 &&
6016 (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
6017 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
6018 m = hn_check_tcpsyn(m, &tcpsyn);
6019 if (__predict_false(m == NULL)) {
6021 IFCOUNTER_OERRORS, 1);
6026 const int tcpsyn = 0;
6031 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
6033 txr = &sc->hn_tx_ring[idx];
6035 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
6037 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
6041 if (txr->hn_oactive)
6044 if (txr->hn_sched_tx)
6047 if (mtx_trylock(&txr->hn_tx_lock)) {
6050 sched = hn_xmit(txr, txr->hn_direct_tx_size);
6051 mtx_unlock(&txr->hn_tx_lock);
6056 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
6061 hn_tx_ring_qflush(struct hn_tx_ring *txr)
6065 mtx_lock(&txr->hn_tx_lock);
6066 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
6068 mtx_unlock(&txr->hn_tx_lock);
6072 hn_xmit_qflush(struct ifnet *ifp)
6074 struct hn_softc *sc = ifp->if_softc;
6075 struct rm_priotracker pt;
6078 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6079 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6082 rm_rlock(&sc->hn_vf_lock, &pt);
6083 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6084 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6085 rm_runlock(&sc->hn_vf_lock, &pt);
6089 hn_xmit_txeof(struct hn_tx_ring *txr)
6092 if (txr->hn_sched_tx)
6095 if (mtx_trylock(&txr->hn_tx_lock)) {
6098 txr->hn_oactive = 0;
6099 sched = hn_xmit(txr, txr->hn_direct_tx_size);
6100 mtx_unlock(&txr->hn_tx_lock);
6102 taskqueue_enqueue(txr->hn_tx_taskq,
6108 * Release the oactive earlier, in the hope that
6109 * others could catch up.  The task will clear
6110 * oactive again with the hn_tx_lock held to avoid possible races.
6113 txr->hn_oactive = 0;
6114 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6119 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6121 struct hn_tx_ring *txr = xtxr;
6123 mtx_lock(&txr->hn_tx_lock);
6125 mtx_unlock(&txr->hn_tx_lock);
6129 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6131 struct hn_tx_ring *txr = xtxr;
6133 mtx_lock(&txr->hn_tx_lock);
6134 txr->hn_oactive = 0;
6136 mtx_unlock(&txr->hn_tx_lock);
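/*
 * Link a vmbus channel to the RX/TX ring matching its sub-channel
 * index, bind it to a CPU, and open it on the ring's bufring.
 */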
6140 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6142 struct vmbus_chan_br cbr;
6143 struct hn_rx_ring *rxr;
6144 struct hn_tx_ring *txr = NULL;
6147 idx = vmbus_chan_subidx(chan);
6150 * Link this channel to RX/TX ring.
6152 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6153 ("invalid channel index %d, should > 0 && < %d",
6154 idx, sc->hn_rx_ring_inuse));
6155 rxr = &sc->hn_rx_ring[idx];
6156 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6157 ("RX ring %d already attached", idx));
6158 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6159 rxr->hn_chan = chan;
6162 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6163 idx, vmbus_chan_id(chan));
6166 if (idx < sc->hn_tx_ring_inuse) {
6167 txr = &sc->hn_tx_ring[idx];
6168 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6169 ("TX ring %d already attached", idx));
6170 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6172 txr->hn_chan = chan;
6174 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6175 idx, vmbus_chan_id(chan));
6179 /* Bind this channel to a proper CPU. */
6180 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6185 cbr.cbr = rxr->hn_br;
6186 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6187 cbr.cbr_txsz = HN_TXBR_SIZE;
6188 cbr.cbr_rxsz = HN_RXBR_SIZE;
6189 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6191 if (error == EISCONN) {
6192 if_printf(sc->hn_ifp, "bufring is connected after "
6193 "chan%u open failure\n", vmbus_chan_id(chan));
6194 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6196 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6197 vmbus_chan_id(chan), error);
6204 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6206 struct hn_rx_ring *rxr;
6209 idx = vmbus_chan_subidx(chan);
6212 * Link this channel to RX/TX ring.
6214 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6215 ("invalid channel index %d, should > 0 && < %d",
6216 idx, sc->hn_rx_ring_inuse));
6217 rxr = &sc->hn_rx_ring[idx];
6218 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6219 ("RX ring %d is not attached", idx));
6220 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6222 if (idx < sc->hn_tx_ring_inuse) {
6223 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6225 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6226 ("TX ring %d is not attached attached", idx));
6227 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6231 * Close this channel.
6234 * Channel closing does _not_ destroy the target channel.
6236 error = vmbus_chan_close_direct(chan);
6237 if (error == EISCONN) {
6238 if_printf(sc->hn_ifp, "chan%u bufring is connected "
6239 "after being closed\n", vmbus_chan_id(chan));
6240 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6242 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6243 vmbus_chan_id(chan), error);
6248 hn_attach_subchans(struct hn_softc *sc)
6250 struct vmbus_channel **subchans;
6251 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6254 KASSERT(subchan_cnt > 0, ("no sub-channels"));
6256 /* Attach the sub-channels. */
6257 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6258 for (i = 0; i < subchan_cnt; ++i) {
6261 error1 = hn_chan_attach(sc, subchans[i]);
6264 /* Move on; all channels will be detached later. */
6267 vmbus_subchan_rel(subchans, subchan_cnt);
6270 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6273 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6281 hn_detach_allchans(struct hn_softc *sc)
6283 struct vmbus_channel **subchans;
6284 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6287 if (subchan_cnt == 0)
6290 /* Detach the sub-channels. */
6291 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6292 for (i = 0; i < subchan_cnt; ++i)
6293 hn_chan_detach(sc, subchans[i]);
6294 vmbus_subchan_rel(subchans, subchan_cnt);
6298 * Detach the primary channel, _after_ all sub-channels
6301 hn_chan_detach(sc, sc->hn_prichan);
6303 /* Wait for sub-channels to be destroyed, if any. */
6304 vmbus_subchan_drain(sc->hn_prichan);
6307 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6308 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6309 HN_RX_FLAG_ATTACHED) == 0,
6310 ("%dth RX ring is still attached", i));
6312 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6313 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6314 HN_TX_FLAG_ATTACHED) == 0,
6315 ("%dth TX ring is still attached", i));
6321 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6323 struct vmbus_channel **subchans;
6324 int nchan, rxr_cnt, error;
6326 nchan = *nsubch + 1;
6329 * Multiple RX/TX rings are not requested.
6336 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6339 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6341 /* No RSS; this is benign. */
6346 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6350 if (nchan > rxr_cnt)
6353 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6359 * Allocate sub-channels from NVS.
6361 *nsubch = nchan - 1;
6362 error = hn_nvs_alloc_subchans(sc, nsubch);
6363 if (error || *nsubch == 0) {
6364 /* Failed to allocate sub-channels. */
6370 * Wait for all sub-channels to become ready before moving on.
6372 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6373 vmbus_subchan_rel(subchans, *nsubch);
6378 hn_synth_attachable(const struct hn_softc *sc)
6382 if (sc->hn_flags & HN_FLAG_ERRORS)
6385 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6386 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6388 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6395 * Make sure that the RX filter is zero after the successful
6396 * RNDIS initialization.
6399 * Under certain conditions on certain versions of Hyper-V,
6400 * the RNDIS rxfilter is _not_ zero on the hypervisor side
6401 * after the successful RNDIS initialization, which breaks
6402 * the assumption of any following code (well, it breaks the
6403 * RNDIS API contract actually). Clear the RNDIS rxfilter
6404 * explicitly, drain packets sneaking through, and drain the
6405 * interrupt taskqueues scheduled due to the stealth packets.
6408 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6412 hn_drain_rxtx(sc, nchan);
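/*
 * Attach the synthetic parts: the primary channel, NVS and RNDIS,
 * then the sub-channels, and finally the RSS key/indirect table.
 * On failure everything attached so far is rolled back.
 */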
6416 hn_synth_attach(struct hn_softc *sc, int mtu)
6418 #define ATTACHED_NVS 0x0002
6419 #define ATTACHED_RNDIS 0x0004
6421 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6422 int error, nsubch, nchan = 1, i, rndis_inited;
6423 uint32_t old_caps, attached = 0;
6425 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6426 ("synthetic parts were attached"));
6428 if (!hn_synth_attachable(sc))
6431 /* Save capabilities for later verification. */
6432 old_caps = sc->hn_caps;
6435 /* Clear RSS stuffs. */
6436 sc->hn_rss_ind_size = 0;
6437 sc->hn_rss_hash = 0;
6438 sc->hn_rss_hcap = 0;
6441 * Attach the primary channel _before_ attaching NVS and RNDIS.
6443 error = hn_chan_attach(sc, sc->hn_prichan);
6450 error = hn_nvs_attach(sc, mtu);
6453 attached |= ATTACHED_NVS;
6456 * Attach RNDIS _after_ NVS is attached.
6458 error = hn_rndis_attach(sc, mtu, &rndis_inited);
6460 attached |= ATTACHED_RNDIS;
6465 * Make sure capabilities are not changed.
6467 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6468 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6469 old_caps, sc->hn_caps);
6475 * Allocate sub-channels for multi-TX/RX rings.
6478 * The # of RX rings that can be used is equivalent to the # of
6479 * channels to be requested.
6481 nsubch = sc->hn_rx_ring_cnt - 1;
6482 error = hn_synth_alloc_subchans(sc, &nsubch);
6485 /* NOTE: _Full_ synthetic parts detach is required now. */
6486 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6489 * Set the # of TX/RX rings that could be used according to
6490 * the # of channels that NVS offered.
6493 hn_set_ring_inuse(sc, nchan);
6495 /* Only the primary channel can be used; done */
6500 * Attach the sub-channels.
6502 * NOTE: hn_set_ring_inuse() _must_ have been called.
6504 error = hn_attach_subchans(sc);
6509 * Configure RSS key and indirect table _after_ all sub-channels
6512 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6514 * RSS key is not set yet; set it to the default RSS key.
6517 if_printf(sc->hn_ifp, "setup default RSS key\n");
6518 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6519 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6522 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6524 * RSS indirect table is not set yet; set it up in round-
6528 if_printf(sc->hn_ifp, "setup default RSS indirect "
6531 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
6532 rss->rss_ind[i] = i % nchan;
6533 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6536 * # of usable channels may be changed, so we have to
6537 * make sure that all entries in RSS indirect table
6540 * NOTE: hn_set_ring_inuse() _must_ have been called.
6542 hn_rss_ind_fixup(sc);
6545 sc->hn_rss_hash = sc->hn_rss_hcap;
6546 if ((sc->hn_flags & HN_FLAG_RXVF) ||
6547 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6548 /* NOTE: Don't reconfigure RSS; will do immediately. */
6549 hn_vf_rss_fixup(sc, false);
6551 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6556 * Fixup transmission aggregation setup.
6559 hn_rndis_init_fixat(sc, nchan);
6563 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6564 hn_rndis_init_fixat(sc, nchan);
6565 hn_synth_detach(sc);
6567 if (attached & ATTACHED_RNDIS) {
6568 hn_rndis_init_fixat(sc, nchan);
6569 hn_rndis_detach(sc);
6571 if (attached & ATTACHED_NVS)
6573 hn_chan_detach(sc, sc->hn_prichan);
6574 /* Restore old capabilities. */
6575 sc->hn_caps = old_caps;
6579 #undef ATTACHED_RNDIS
6585 * The interface must have been suspended through hn_suspend() before
6586 * this function gets called.
6589 hn_synth_detach(struct hn_softc *sc)
6592 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6593 ("synthetic parts were not attached"));
6595 /* Detach the RNDIS first. */
6596 hn_rndis_detach(sc);
6601 /* Detach all of the channels. */
6602 hn_detach_allchans(sc);
6604 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6608 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6610 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6611 ("invalid ring count %d", ring_cnt));
6613 if (sc->hn_tx_ring_cnt > ring_cnt)
6614 sc->hn_tx_ring_inuse = ring_cnt;
6616 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6617 sc->hn_rx_ring_inuse = ring_cnt;
6620 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6621 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6626 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6631 * The TX bufring will not be drained by the hypervisor,
6632 * if the primary channel is revoked.
6634 while (!vmbus_chan_rx_empty(chan) ||
6635 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6636 !vmbus_chan_tx_empty(chan)))
6638 vmbus_chan_intr_drain(chan);
6642 hn_disable_rx(struct hn_softc *sc)
6646 * Disable RX by clearing RX filter forcefully.
6648 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6649 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6652 * Give RNDIS enough time to flush all pending data packets.
6654 pause("waitrx", (200 * hz) / 1000);
6659 * RX/TX _must_ have been suspended/disabled, before this function
6663 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6665 struct vmbus_channel **subch = NULL;
6669 * Drain RX/TX bufrings and interrupts.
6673 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6675 if (subch != NULL) {
6678 for (i = 0; i < nsubch; ++i)
6679 hn_chan_drain(sc, subch[i]);
6681 hn_chan_drain(sc, sc->hn_prichan);
6684 vmbus_subchan_rel(subch, nsubch);
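/*
 * Quiesce the data path: mark all in-use TX rings suspended, wait
 * for pending sends to complete, drain the channel bufrings and
 * interrupts, then drain the TX taskqueues.
 */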
6688 hn_suspend_data(struct hn_softc *sc)
6690 struct hn_tx_ring *txr;
6698 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6699 txr = &sc->hn_tx_ring[i];
6701 mtx_lock(&txr->hn_tx_lock);
6702 txr->hn_suspended = 1;
6703 mtx_unlock(&txr->hn_tx_lock);
6704 /* No one is able to send more packets now. */
6707 * Wait for all pending sends to finish.
6710 * We will _not_ receive all pending send-done, if the
6711 * primary channel is revoked.
6713 while (hn_tx_ring_pending(txr) &&
6714 !vmbus_chan_is_revoked(sc->hn_prichan))
6715 pause("hnwtx", 1 /* 1 tick */);
6726 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6729 * Drain any pending TX tasks.
6732 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6733 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6735 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6736 txr = &sc->hn_tx_ring[i];
6738 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6739 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6744 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6747 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6751 hn_suspend_mgmt(struct hn_softc *sc)
6758 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6759 * through hn_mgmt_taskq.
6761 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6762 vmbus_chan_run_task(sc->hn_prichan, &task);
6765 * Make sure that all pending management tasks are completed.
6767 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6768 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6769 taskqueue_drain_all(sc->hn_mgmt_taskq0);
6773 hn_suspend(struct hn_softc *sc)
6776 /* Disable polling. */
6780 * If the non-transparent mode VF is activated, the synthetic
6781 * device is receiving packets, so the data path of the
6782 * synthetic device must be suspended.
6784 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6785 (sc->hn_flags & HN_FLAG_RXVF))
6786 hn_suspend_data(sc);
6787 hn_suspend_mgmt(sc);
6791 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6795 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6796 ("invalid TX ring count %d", tx_ring_cnt));
6798 for (i = 0; i < tx_ring_cnt; ++i) {
6799 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6801 mtx_lock(&txr->hn_tx_lock);
6802 txr->hn_suspended = 0;
6803 mtx_unlock(&txr->hn_tx_lock);
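/*
 * Resume the data path: reprogram the RX filter, clear the suspend
 * flag on all TX rings and kick their txeof tasks so that stalled
 * rings restart transmission.
 */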
6808 hn_resume_data(struct hn_softc *sc)
6817 hn_rxfilter_config(sc);
6820 * Make sure to clear suspend status on "all" TX rings,
6821 * since hn_tx_ring_inuse can be changed after
6822 * hn_suspend_data().
6824 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6826 #ifdef HN_IFSTART_SUPPORT
6827 if (!hn_use_if_start)
6831 * Flush unused drbrs, since hn_tx_ring_inuse may be
6834 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6835 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6841 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6842 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6845 * Use txeof task, so that any pending oactive can be
6848 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6853 hn_resume_mgmt(struct hn_softc *sc)
6856 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6859 * Kick off network change detection, if it was pending.
6860 * If no network change was pending, start link status
6861 * checks, which are more lightweight than network change
6864 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6865 hn_change_network(sc);
6867 hn_update_link_status(sc);
6871 hn_resume(struct hn_softc *sc)
6875 * If the non-transparent mode VF is activated, the synthetic
6876 * device has to receive packets, so the data path of the
6877 * synthetic device must be resumed.
6879 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6880 (sc->hn_flags & HN_FLAG_RXVF))
6884 * Don't resume link status change if VF is attached/activated.
6885 * - In the non-transparent VF mode, the synthetic device marks
6886 * link down until the VF is deactivated; i.e. VF is down.
6887 * - In transparent VF mode, VF's media status is used until
6888 * the VF is detached.
6890 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6891 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6895 * Re-enable polling if this interface is running and
6896 * the polling is requested.
6898 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6899 hn_polling(sc, sc->hn_pollhz);
6903 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6905 const struct rndis_status_msg *msg;
6908 if (dlen < sizeof(*msg)) {
6909 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6914 switch (msg->rm_status) {
6915 case RNDIS_STATUS_MEDIA_CONNECT:
6916 case RNDIS_STATUS_MEDIA_DISCONNECT:
6917 hn_update_link_status(sc);
6920 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6921 case RNDIS_STATUS_LINK_SPEED_CHANGE:
6922 /* Not really useful; ignore. */
6925 case RNDIS_STATUS_NETWORK_CHANGE:
6926 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6927 if (dlen < ofs + msg->rm_stbuflen ||
6928 msg->rm_stbuflen < sizeof(uint32_t)) {
6929 if_printf(sc->hn_ifp, "network changed\n");
6933 memcpy(&change, ((const uint8_t *)msg) + ofs,
6935 if_printf(sc->hn_ifp, "network changed, change %u\n",
6938 hn_change_network(sc);
6942 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
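/*
 * Walk the RNDIS per-packet-info list, extracting the VLAN,
 * checksum and hash metadata into 'info'; returns an error as soon
 * as a malformed element is seen.
 */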
6949 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6951 const struct rndis_pktinfo *pi = info_data;
6954 while (info_dlen != 0) {
6958 if (__predict_false(info_dlen < sizeof(*pi)))
6960 if (__predict_false(info_dlen < pi->rm_size))
6962 info_dlen -= pi->rm_size;
6964 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
6966 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
6968 dlen = pi->rm_size - pi->rm_pktinfooffset;
6971 switch (pi->rm_type) {
6972 case NDIS_PKTINFO_TYPE_VLAN:
6973 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
6975 info->vlan_info = *((const uint32_t *)data);
6976 mask |= HN_RXINFO_VLAN;
6979 case NDIS_PKTINFO_TYPE_CSUM:
6980 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
6982 info->csum_info = *((const uint32_t *)data);
6983 mask |= HN_RXINFO_CSUM;
6986 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
6987 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
6989 info->hash_value = *((const uint32_t *)data);
6990 mask |= HN_RXINFO_HASHVAL;
6993 case HN_NDIS_PKTINFO_TYPE_HASHINF:
6994 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
6996 info->hash_info = *((const uint32_t *)data);
6997 mask |= HN_RXINFO_HASHINF;
7004 if (mask == HN_RXINFO_ALL) {
7005 /* All found; done */
7009 pi = (const struct rndis_pktinfo *)
7010 ((const uint8_t *)pi + pi->rm_size);
7015 * - If there is no hash value, invalidate the hash info.
7017 if ((mask & HN_RXINFO_HASHVAL) == 0)
7018 info->hash_info = HN_NDIS_HASH_INFO_INVALID;
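/*
 * Check whether [off, off + len) overlaps
 * [check_off, check_off + check_len).
 */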
7022 static __inline bool
7023 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
7026 if (off < check_off) {
7027 if (__predict_true(off + len <= check_off))
7029 } else if (off > check_off) {
7030 if (__predict_true(check_off + check_len <= off))
7037 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
7039 const struct rndis_packet_msg *pkt;
7040 struct hn_rxinfo info;
7041 int data_off, pktinfo_off, data_len, pktinfo_len;
7046 if (__predict_false(dlen < sizeof(*pkt))) {
7047 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
7052 if (__predict_false(dlen < pkt->rm_len)) {
7053 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
7054 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
7057 if (__predict_false(pkt->rm_len <
7058 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
7059 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
7060 "msglen %u, data %u, oob %u, pktinfo %u\n",
7061 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
7062 pkt->rm_pktinfolen);
7065 if (__predict_false(pkt->rm_datalen == 0)) {
7066 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
7073 #define IS_OFFSET_INVALID(ofs) \
7074 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
7075 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
7077 /* XXX Hyper-V does not meet data offset alignment requirement */
7078 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7079 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7080 "data offset %u\n", pkt->rm_dataoffset);
7083 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7084 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7085 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7086 "oob offset %u\n", pkt->rm_oobdataoffset);
7089 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7090 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7091 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7092 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7096 #undef IS_OFFSET_INVALID
7098 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7099 data_len = pkt->rm_datalen;
7100 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7101 pktinfo_len = pkt->rm_pktinfolen;
7104 * Check OOB coverage.
7106 if (__predict_false(pkt->rm_oobdatalen != 0)) {
7107 int oob_off, oob_len;
7109 if_printf(rxr->hn_ifp, "got oobdata\n");
7110 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7111 oob_len = pkt->rm_oobdatalen;
7113 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7114 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7115 "oob overflow, msglen %u, oob abs %d len %d\n",
7116 pkt->rm_len, oob_off, oob_len);
7121 * Check against data.
7123 if (hn_rndis_check_overlap(oob_off, oob_len,
7124 data_off, data_len)) {
7125 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7126 "oob overlaps data, oob abs %d len %d, "
7127 "data abs %d len %d\n",
7128 oob_off, oob_len, data_off, data_len);
7133 * Check against pktinfo.
7135 if (pktinfo_len != 0 &&
7136 hn_rndis_check_overlap(oob_off, oob_len,
7137 pktinfo_off, pktinfo_len)) {
7138 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7139 "oob overlaps pktinfo, oob abs %d len %d, "
7140 "pktinfo abs %d len %d\n",
7141 oob_off, oob_len, pktinfo_off, pktinfo_len);
7147 * Check per-packet-info coverage and find useful per-packet-info.
7149 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7150 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7151 info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7152 if (__predict_true(pktinfo_len != 0)) {
7156 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7157 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7158 "pktinfo overflow, msglen %u, "
7159 "pktinfo abs %d len %d\n",
7160 pkt->rm_len, pktinfo_off, pktinfo_len);
7165 * Check packet info coverage.
7167 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7168 data_off, data_len);
7169 if (__predict_false(overlap)) {
7170 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7171 "pktinfo overlap data, pktinfo abs %d len %d, "
7172 "data abs %d len %d\n",
7173 pktinfo_off, pktinfo_len, data_off, data_len);
7178 * Find useful per-packet-info.
7180 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7181 pktinfo_len, &info);
7182 if (__predict_false(error)) {
7183 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7189 if (__predict_false(data_off + data_len > pkt->rm_len)) {
7190 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7191 "data overflow, msglen %u, data abs %d len %d\n",
7192 pkt->rm_len, data_off, data_len);
7195 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
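/*
 * Dispatch one RNDIS message: data messages take the hot path,
 * while status indications and control completions are handled on
 * the slow paths.
 */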
static __inline void
hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_msghdr *hdr;

	if (__predict_false(dlen < sizeof(*hdr))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
		return;
	}
	hdr = data;
	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
		/* Hot data path. */
		hn_rndis_rx_data(rxr, data, dlen);
		/* Done! */
		return;
	}
	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
	else
		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
}
static void
hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
{
	const struct hn_nvs_hdr *hdr;

	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
		if_printf(sc->hn_ifp, "invalid nvs notify\n");
		return;
	}
	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
		/* Useless; ignore */
		return;
	}
	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
}
static void
hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt)
{
	struct hn_nvs_sendctx *sndc;

	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
	    VMBUS_CHANPKT_DATALEN(pkt));
	/*
	 * NOTE:
	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
	 * its callback.
	 */
}
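/*
 * Handle an RXBUF channel packet: validate the NVS/RNDIS framing, walk
 * the receive-buffer ranges, each of which carries one RNDIS message
 * (one Ethernet frame), then ack the RXBUF back to the hypervisor.
 */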
static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
	/* Make sure that this is a RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}
	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}
	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}
	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}
		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
	}
	/*
	 * Ack the consumed RXBUF associated w/ this channel packet,
	 * so that this RXBUF can be recycled by the hypervisor.
	 */
	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}
static void
hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    uint64_t tid)
{
	struct hn_nvs_rndis_ack ack;
	int retries, error;

	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
	ack.nvs_status = HN_NVS_STATUS_OK;

	retries = 0;
again:
	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
	if (__predict_false(error == EAGAIN)) {
		/*
		 * NOTE:
		 * This should _not_ happen in real world, since the
		 * consumption of the TX bufring from the TX path is
		 * controlled.
		 */
		if (rxr->hn_ack_failed == 0)
			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
		rxr->hn_ack_failed++;
		retries++;
		if (retries < 10) {
			DELAY(100);
			goto again;
		}
		/* RXBUF leaks! */
		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
	}
}
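/*
 * Per-channel receive callback: drain every pending channel packet,
 * growing the packet buffer on ENOBUFS, and dispatch each packet by
 * type (completion, RXBUF, inband notify).
 */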
static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
	struct hn_rx_ring *rxr = xrxr;
	struct hn_softc *sc = rxr->hn_ifp->if_softc;

	for (;;) {
		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
		int error, pktlen;

		pktlen = rxr->hn_pktbuf_len;
		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
		if (__predict_false(error == ENOBUFS)) {
			void *nbuf;
			int nlen;
			/*
			 * Expand channel packet buffer.
			 *
			 * XXX
			 * Use M_WAITOK here, since allocation failure
			 * is fatal.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);
			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;

		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}
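/*
 * Module-wide initialization, run at SYSINIT time: allocate counters,
 * sanitize the transparent-VF and TX taskqueue tunables, set up the VF
 * map, and create the global TX taskqueues when that mode is selected
 * on a Hyper-V guest.
 */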
static void
hn_sysinit(void *arg __unused)
{
	int i;

	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);
#ifdef HN_IFSTART_SUPPORT
	/*
	 * Don't use ifnet.if_start if transparent VF mode is requested;
	 * mainly due to the IFF_DRV_OACTIVE flag.
	 */
	if (hn_xpnt_vf && hn_use_if_start) {
		hn_use_if_start = 0;
		printf("hn: transparent VF mode, if_transmit will be used "
		    "instead of if_start\n");
	}
#endif
	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
		printf("hn: invalid transparent VF attach routine "
		    "wait timeout %d, reset to %d\n",
		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
	}
	/*
	 * Initialize VF map.
	 */
	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
	    M_WAITOK | M_ZERO);
	/*
	 * Fix the # of TX taskqueues.
	 */
	if (hn_tx_taskq_cnt <= 0)
		hn_tx_taskq_cnt = 1;
	else if (hn_tx_taskq_cnt > mp_ncpus)
		hn_tx_taskq_cnt = mp_ncpus;
	/*
	 * Fix the TX taskqueue mode.
	 */
	switch (hn_tx_taskq_mode) {
	case HN_TX_TASKQ_M_INDEP:
	case HN_TX_TASKQ_M_GLOBAL:
	case HN_TX_TASKQ_M_EVTTQ:
		break;
	default:
		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
		break;
	}
	if (vm_guest != VM_GUEST_HV)
		return;

	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
		return;
	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
	    M_DEVBUF, M_WAITOK);
	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
		    "hn tx%d", i);
	}
}
SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
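/*
 * Module-wide teardown, run at SYSUNINIT time: release the TX
 * taskqueues, the VF map and its lock, and the UDP checksum fixup
 * counter allocated in hn_sysinit().
 */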
static void
hn_sysuninit(void *arg __unused)
{

	if (hn_tx_taskque != NULL) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(hn_tx_taskque[i]);
		free(hn_tx_taskque, M_DEVBUF);
	}
	if (hn_vfmap != NULL)
		free(hn_vfmap, M_DEVBUF);
	rm_destroy(&hn_vfmap_lock);

	counter_u64_free(hn_udpcs_fixup);
}
SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);