/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/ethernet.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"
#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

#define HN_VFMAP_SIZE_DEF		8

#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512
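
/*
 * Note (added, derived from the macro below): HN_RNDIS_PKT_LEN is the
 * worst-case size of the RNDIS packet message for one frame -- the fixed
 * rndis_packet_msg header plus one per-packet-info record each for the
 * hash value, VLAN tag, LSOv2 and TX checksum metadata that may ride
 * along with a transmitted packet.
 */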
#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)					\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);	/* assumed backoff; loop body elided */ \
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
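
/*
 * Illustrative arithmetic for the two macros above (example values
 * assumed, not from the original source): for a 1514-byte frame,
 * HN_PKTSIZE() is roundup2(1514 + HN_RNDIS_PKT_LEN, align).
 * HN_PKTSIZE_MIN() substitutes the smallest VLAN-tagged frame without
 * FCS (64 + 4 - 4 = 64 bytes) for the mbuf length.
 */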
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
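
/*
 * Note (added): HN_RING_IDX2CPU() above spreads a device's rings over
 * the CPUs round-robin, starting from the device's leader CPU
 * (sc->hn_cpu); ring `idx' is bound to CPU (hn_cpu + idx) % mp_ncpus.
 */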
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004
struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};
#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
static int	hn_probe(device_t);
static int	hn_attach(device_t);
static int	hn_detach(device_t);
static int	hn_shutdown(device_t);
static void	hn_chan_callback(struct vmbus_channel *, void *);

static void	hn_init(void *);
static int	hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void	hn_start(struct ifnet *);
#endif
static int	hn_transmit(struct ifnet *, struct mbuf *);
static void	hn_xmit_qflush(struct ifnet *);
static int	hn_ifmedia_upd(struct ifnet *);
static void	hn_ifmedia_sts(struct ifnet *, struct ifmediareq *);

static void	hn_ifnet_event(void *, struct ifnet *, int);
static void	hn_ifaddr_event(void *, struct ifnet *);
static void	hn_ifnet_attevent(void *, struct ifnet *);
static void	hn_ifnet_detevent(void *, struct ifnet *);
static void	hn_ifnet_lnkevent(void *, struct ifnet *, int);

static bool	hn_ismyvf(const struct hn_softc *, const struct ifnet *);
static void	hn_rxvf_change(struct hn_softc *, struct ifnet *, bool);
static void	hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void	hn_rxvf_set_task(void *, int);
static void	hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
static int	hn_xpnt_vf_iocsetflags(struct hn_softc *);
static int	hn_xpnt_vf_iocsetcaps(struct hn_softc *, struct ifreq *);
static void	hn_xpnt_vf_saveifflags(struct hn_softc *);
static bool	hn_xpnt_vf_isready(struct hn_softc *);
static void	hn_xpnt_vf_setready(struct hn_softc *);
static void	hn_xpnt_vf_init_taskfunc(void *, int);
static void	hn_xpnt_vf_init(struct hn_softc *);
static void	hn_xpnt_vf_setenable(struct hn_softc *);
static void	hn_xpnt_vf_setdisable(struct hn_softc *, bool);
static void	hn_vf_rss_fixup(struct hn_softc *, bool);
static void	hn_vf_rss_restore(struct hn_softc *);

static int	hn_rndis_rxinfo(const void *, int, struct hn_rxinfo *);
static void	hn_rndis_rx_data(struct hn_rx_ring *, const void *, int);
static void	hn_rndis_rx_status(struct hn_softc *, const void *, int);
static void	hn_rndis_init_fixat(struct hn_softc *, int);

static void	hn_nvs_handle_notify(struct hn_softc *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_handle_comp(struct hn_softc *,
		    struct vmbus_channel *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_handle_rxbuf(struct hn_rx_ring *,
		    struct vmbus_channel *,
		    const struct vmbus_chanpkt_hdr *);
static void	hn_nvs_ack_rxbuf(struct hn_rx_ring *,
		    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int	hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int	hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int	hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int	hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
static int	hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);

static void	hn_stop(struct hn_softc *, bool);
static void	hn_init_locked(struct hn_softc *);
static int	hn_chan_attach(struct hn_softc *, struct vmbus_channel *);
static void	hn_chan_detach(struct hn_softc *, struct vmbus_channel *);
static int	hn_attach_subchans(struct hn_softc *);
static void	hn_detach_allchans(struct hn_softc *);
static void	hn_chan_rollup(struct hn_rx_ring *, struct hn_tx_ring *);
static void	hn_set_ring_inuse(struct hn_softc *, int);
static int	hn_synth_attach(struct hn_softc *, int);
static void	hn_synth_detach(struct hn_softc *);
static int	hn_synth_alloc_subchans(struct hn_softc *, int *);
static bool	hn_synth_attachable(const struct hn_softc *);
static void	hn_suspend(struct hn_softc *);
static void	hn_suspend_data(struct hn_softc *);
static void	hn_suspend_mgmt(struct hn_softc *);
static void	hn_resume(struct hn_softc *);
static void	hn_resume_data(struct hn_softc *);
static void	hn_resume_mgmt(struct hn_softc *);
static void	hn_suspend_mgmt_taskfunc(void *, int);
static void	hn_chan_drain(struct hn_softc *, struct vmbus_channel *);
static void	hn_disable_rx(struct hn_softc *);
static void	hn_drain_rxtx(struct hn_softc *, int);
static void	hn_polling(struct hn_softc *, u_int);
static void	hn_chan_polling(struct vmbus_channel *, u_int);
static void	hn_mtu_change_fixup(struct hn_softc *);

static void	hn_update_link_status(struct hn_softc *);
static void	hn_change_network(struct hn_softc *);
static void	hn_link_taskfunc(void *, int);
static void	hn_netchg_init_taskfunc(void *, int);
static void	hn_netchg_status_taskfunc(void *, int);
static void	hn_link_status(struct hn_softc *);

static int	hn_create_rx_data(struct hn_softc *, int);
static void	hn_destroy_rx_data(struct hn_softc *);
static int	hn_check_iplen(const struct mbuf *, int);
static int	hn_set_rxfilter(struct hn_softc *, uint32_t);
static int	hn_rxfilter_config(struct hn_softc *);
static int	hn_rss_reconfig(struct hn_softc *);
static void	hn_rss_ind_fixup(struct hn_softc *);
static void	hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
static int	hn_rxpkt(struct hn_rx_ring *, const void *, int,
		    const struct hn_rxinfo *);
static uint32_t	hn_rss_type_fromndis(uint32_t);
static uint32_t	hn_rss_type_tondis(uint32_t);

static int	hn_tx_ring_create(struct hn_softc *, int);
static void	hn_tx_ring_destroy(struct hn_tx_ring *);
static int	hn_create_tx_data(struct hn_softc *, int);
static void	hn_fixup_tx_data(struct hn_softc *);
static void	hn_destroy_tx_data(struct hn_softc *);
static void	hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void	hn_txdesc_gc(struct hn_tx_ring *, struct hn_txdesc *);
static int	hn_encap(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *, struct mbuf **);
static int	hn_txpkt(struct ifnet *, struct hn_tx_ring *,
		    struct hn_txdesc *);
static void	hn_set_chim_size(struct hn_softc *, int);
static void	hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool	hn_tx_ring_pending(struct hn_tx_ring *);
static void	hn_tx_ring_qflush(struct hn_tx_ring *);
static void	hn_resume_tx(struct hn_softc *, int);
static void	hn_set_txagg(struct hn_softc *);
static void	*hn_try_txagg(struct ifnet *,
		    struct hn_tx_ring *, struct hn_txdesc *, int);
static int	hn_get_txswq_depth(const struct hn_tx_ring *);
static void	hn_txpkt_done(struct hn_nvs_sendctx *,
		    struct hn_softc *, struct vmbus_channel *,
		    const void *, int);
static int	hn_txpkt_sglist(struct hn_tx_ring *, struct hn_txdesc *);
static int	hn_txpkt_chim(struct hn_tx_ring *, struct hn_txdesc *);
static int	hn_xmit(struct hn_tx_ring *, int);
static void	hn_xmit_taskfunc(void *, int);
static void	hn_xmit_txeof(struct hn_tx_ring *);
static void	hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int	hn_start_locked(struct hn_tx_ring *, int);
static void	hn_start_taskfunc(void *, int);
static void	hn_start_txeof(struct hn_tx_ring *);
static void	hn_start_txeof_taskfunc(void *, int);
#endif
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");
/* Trust TCP segment verification on the host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust TCP segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on the host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust UDP datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on the host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust IP packet verification on host side, "
    "when csum info is missing (global setting)");
/*
 * Offload UDP/IPv4 checksum.
 */
static int hn_enable_udp4cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN,
    &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum");

/*
 * Offload UDP/IPv6 checksum.
 */
static int hn_enable_udp6cs = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN,
    &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum");

static counter_u64_t hn_udpcs_fixup;
SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW,
    &hn_udpcs_fixup, "# of UDP checksum fixup");

/*
 * See hn_set_hlen().
 *
 * This value is for Azure.  For Hyper-V, set this above
 * 65536 to disable UDP datagram checksum fixup.
 */
static int hn_udpcs_fixup_mtu = 1420;
SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN,
    &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold");
/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif
/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vflist_sysctl, "A", "VF list");

SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vfmap_sysctl, "A", "VF mapping");

static int hn_xpnt_vf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");
static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;	/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
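
/*
 * Note (added): the 40-byte key above is the de facto standard Toeplitz
 * RSS key published in Microsoft's RSS documentation and reused by many
 * NIC drivers, so a host and a VF hashing with the same key compute
 * identical hash values for the same flow.
 */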
static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};
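
/*
 * Note (added): the byte array above encodes the class ID of the Hyper-V
 * synthetic network device, f8615163-df3e-46c5-913f-f2d2f965ed0e, in the
 * GUID's on-wire order (the first three fields byte-swapped).
 */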
static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif
static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}
static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
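
/*
 * Note (added): hn_chim_alloc() below claims one slot of the host-shared
 * chimney sending buffer without taking a lock.  Each bitmap word is
 * scanned for a clear bit with ffsl(~word); the bit is then claimed with
 * an atomic test-and-set, and a lost race simply moves the scan along.
 * The paired hn_chim_free() releases a slot with an atomic clear.
 */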
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}
static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
#if defined(INET6) || defined(INET)

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)
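
/*
 * Usage note (added): PULLUP_HDR() guarantees that the first `len' bytes
 * of the packet are contiguous in the leading mbuf, so the header casts
 * in the functions below are safe.  On failure m_pullup() has already
 * freed the chain, which is why the callers document that m_head is gone
 * when these helpers fail.
 */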
/*
 * NOTE: If this function fails, the m_head will have been freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
}
/*
 * NOTE: If this function fails, the m_head will have been freed.
 */
static __inline struct mbuf *
hn_set_hlen(struct mbuf *m_head)
{
	const struct ether_vlan_header *evl;
	int ehlen;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;
	m_head->m_pkthdr.l2hlen = ehlen;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;
		m_head->m_pkthdr.l3hlen = iphlen;

		/*
		 * UDP checksum offload does not work in Azure, if the
		 * following conditions are met:
		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
		 * - IP_DF is not set in the IP hdr.
		 *
		 * Fall back to software checksum for these UDP datagrams.
		 */
		if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) &&
		    m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen &&
		    (ntohs(ip->ip_off) & IP_DF) == 0) {
			uint16_t off = ehlen + iphlen;

			counter_u64_add(hn_udpcs_fixup, 1);
			PULLUP_HDR(m_head, off + sizeof(struct udphdr));
			*(uint16_t *)(m_head->m_data + off +
			    m_head->m_pkthdr.csum_data) = in_cksum_skip(
			    m_head, m_head->m_pkthdr.len, off);
			m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP;
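			/*
			 * Note (added): in_cksum_skip() computes the
			 * checksum over the UDP header and payload starting
			 * at `off', and the result is stored at the offset
			 * the stack reserved for it (csum_data).  Clearing
			 * CSUM_IP_UDP afterwards keeps the host from
			 * offloading a datagram already checksummed here.
			 */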
		}
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}
		m_head->m_pkthdr.l3hlen = sizeof(*ip6);
	}
#endif
	return (m_head);
}
/*
 * NOTE: If this function fails, the m_head will have been freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct tcphdr *th;
	int ehlen, iphlen;

	*tcpsyn = 0;
	ehlen = m_head->m_pkthdr.l2hlen;
	iphlen = m_head->m_pkthdr.l3hlen;

	PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
	th = mtodo(m_head, ehlen + iphlen);
	if (th->th_flags & TH_SYN)
		*tcpsyn = 1;
	return (m_head);
}

#undef PULLUP_HDR

#endif	/* INET6 || INET */
static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * promiscuous mode.
	 */
	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return (EOPNOTSUPP);
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}
static void
hn_rxvf_set_task(void *xarg, int pending __unused)
{
	struct hn_rxvf_setarg *arg = xarg;

	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
}

static void
hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
{
	struct hn_rx_ring *rxr;
	struct hn_rxvf_setarg arg;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);
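
	/*
	 * Note (added): for rings that are in use, the update is not done
	 * directly; vmbus_chan_run_task() runs hn_rxvf_set_task() on each
	 * channel's task context, so the hn_rxvf_ifp switch is serialized
	 * with that channel's RX processing instead of racing it.
	 */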
	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			arg.rxr = rxr;
			arg.vf_ifp = vf_ifp;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_rxvf_ifp = vf_ifp;
		}
	}
}
static bool
hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
{
	const struct ifnet *hn_ifp;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		return (false);

	if (ifp->if_alloctype != IFT_ETHER)
		return (false);

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		return (false);

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		return (false);

	return (true);
}
static void
hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
{
	struct ifnet *hn_ifp;

	HN_LOCK_ASSERT(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		return;

	if (!hn_ismyvf(sc, ifp))
		return;
	hn_ifp = sc->hn_ifp;

	if (rxvf) {
		if (sc->hn_flags & HN_FLAG_RXVF)
			return;

		sc->hn_flags |= HN_FLAG_RXVF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_RXVF))
			return;

		sc->hn_flags &= ~HN_FLAG_RXVF;
		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);

	hn_rxvf_set(sc, rxvf ? ifp : NULL);

	if (rxvf) {
		hn_vf_rss_fixup(sc, true);
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_vf_rss_restore(sc);
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
	    rxvf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "datapath is switched %s %s\n",
		    rxvf ? "to" : "from", ifp->if_xname);
	}
}
static void
hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
{

	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;
	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
}

static void
hn_ifaddr_event(void *arg, struct ifnet *ifp)
{

	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
}
static int
hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
{
	struct ifnet *ifp, *vf_ifp;
	uint64_t tmp;
	int error;

	HN_LOCK_ASSERT(sc);

	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Fix up requested capabilities w/ supported capabilities,
	 * since the supported capabilities could have been changed.
	 */
	ifr->ifr_reqcap &= ifp->if_capabilities;
	/* Pass SIOCSIFCAP to VF. */
	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);

	/*
	 * NOTE:
	 * The error will be propagated to the callers; however, it
	 * is _not_ useful here.
	 */

	/*
	 * Merge VF's enabled capabilities.
	 */
	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
	if (ifp->if_capenable & IFCAP_TSO4)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
	if (ifp->if_capenable & IFCAP_TSO6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	return (error);
}
static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	struct ifnet *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
}
static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
}
static void
hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	struct ifnet *hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (vf_ifp->if_index < hn_vfmap_size)
		hn_ifp = hn_vfmap[vf_ifp->if_index];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/*
			 * Allow tapping on the VF.
			 */
			ETHER_BPF_MTAP(vf_ifp, mn);

			/*
			 * Update VF stats.
			 */
			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
				    mn->m_pkthdr.len);
			}
			/*
			 * XXX IFCOUNTER_IMCAST
			 * This stat updating is kinda invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check.  As of this write,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF a try in vain.
			 */

			/*
			 * Fix up rcvif and increase hn(4)'s ipackets.
			 */
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		/*
		 * Go through hn(4)'s if_input.
		 */
		hn_ifp->if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * mbuf chain.
		 */
		while (m != NULL) {
			mn = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
			m = mn;
		}
	}
}
static void
hn_mtu_change_fixup(struct hn_softc *sc)
{
	struct ifnet *ifp;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;

	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
#endif
}
static uint32_t
hn_rss_type_fromndis(uint32_t rss_hash)
{
	uint32_t types = 0;

	if (rss_hash & NDIS_HASH_IPV4)
		types |= RSS_TYPE_IPV4;
	if (rss_hash & NDIS_HASH_TCP_IPV4)
		types |= RSS_TYPE_TCP_IPV4;
	if (rss_hash & NDIS_HASH_IPV6)
		types |= RSS_TYPE_IPV6;
	if (rss_hash & NDIS_HASH_IPV6_EX)
		types |= RSS_TYPE_IPV6_EX;
	if (rss_hash & NDIS_HASH_TCP_IPV6)
		types |= RSS_TYPE_TCP_IPV6;
	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
		types |= RSS_TYPE_TCP_IPV6_EX;
	return (types);
}
static uint32_t
hn_rss_type_tondis(uint32_t types)
{
	uint32_t rss_hash = 0;

	KASSERT((types &
	    (RSS_TYPE_UDP_IPV4 | RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
	    ("UDP4, UDP6 and UDP6EX are not supported"));

	if (types & RSS_TYPE_IPV4)
		rss_hash |= NDIS_HASH_IPV4;
	if (types & RSS_TYPE_TCP_IPV4)
		rss_hash |= NDIS_HASH_TCP_IPV4;
	if (types & RSS_TYPE_IPV6)
		rss_hash |= NDIS_HASH_IPV6;
	if (types & RSS_TYPE_IPV6_EX)
		rss_hash |= NDIS_HASH_IPV6_EX;
	if (types & RSS_TYPE_TCP_IPV6)
		rss_hash |= NDIS_HASH_TCP_IPV6;
	if (types & RSS_TYPE_TCP_IPV6_EX)
		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
	return (rss_hash);
}
static void
hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
{
	int i;

	HN_LOCK_ASSERT(sc);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
}
static void
hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifrsshash ifrh;
	struct ifrsskey ifrk;
	int error;
	uint32_t my_types, diff_types, mbuf_types = 0;

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1) {
		/* No RSS on synthetic parts; done. */
		return;
	}
	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
		/* Synthetic parts do not support Toeplitz; done. */
		return;
	}

	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Extract VF's RSS key.  Only a 40 byte key for Toeplitz is
	 * supported.
	 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
	if (error) {
		if_printf(ifp, "%s SIOCGRSSKEY failed: %d\n",
		    vf_ifp->if_xname, error);
		return;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrk.ifrk_func);
		return;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    vf_ifp->if_xname, ifrk.ifrk_keylen);
		return;
	}

	/*
	 * Extract VF's RSS hash.  Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
	if (error) {
		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
		    vf_ifp->if_xname, error);
		return;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrh.ifrh_func);
		return;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed.  "
		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
		    ifrh.ifrh_types, my_types);
		return;
	}

	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on the RX path.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
	}

	/*
	 * Indirect table does not matter.
	 */

	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
	    hn_rss_type_tondis(my_types);
	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;

	if (reconf) {
		error = hn_rss_reconfig(sc);
		if (error) {
			/* XXX roll-back? */
			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
			/* XXX keep going. */
		}
	}

	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
}
static void
hn_vf_rss_restore(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1)
		return;

	/*
	 * Restore hash types.  Key does _not_ matter.
	 */
	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
		int error;

		sc->hn_rss_hash = sc->hn_rss_hcap;
		error = hn_rss_reconfig(sc);
		if (error) {
			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
			    error);
			/* XXX keep going. */
		}
	}

	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
}
static void
hn_xpnt_vf_setready(struct hn_softc *sc)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Mark the VF ready.
	 */
	sc->hn_vf_rdytick = 0;

	/*
	 * Save information for restoration.
	 */
	sc->hn_saved_caps = ifp->if_capabilities;
	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;

	/*
	 * Intersect supported/enabled capabilities.
	 *
	 * NOTE:
	 * if_hwassist is not changed here.
	 */
	ifp->if_capabilities &= vf_ifp->if_capabilities;
	ifp->if_capenable &= ifp->if_capabilities;

	/*
	 * Fix TSO settings.
	 */
	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;

	/*
	 * Change VF's enabled capabilities.
	 */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_reqcap = ifp->if_capenable;
	hn_xpnt_vf_iocsetcaps(sc, &ifr);

	if (ifp->if_mtu != ETHERMTU) {
		int error;

		/*
		 * Change VF's MTU.
		 */
		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
		ifr.ifr_mtu = ifp->if_mtu;
		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
		if (error) {
			if_printf(ifp, "%s SIOCSIFMTU %lu failed\n",
			    vf_ifp->if_xname, ifp->if_mtu);
			if (ifp->if_mtu > ETHERMTU) {
				if_printf(ifp, "change MTU to %d\n", ETHERMTU);

				/*
				 * XXX
				 * No need to adjust the synthetic parts' MTU;
				 * failure of the adjustment will cause us
				 * infinite headache.
				 */
				ifp->if_mtu = ETHERMTU;
				hn_mtu_change_fixup(sc);
			}
		}
	}
}
static bool
hn_xpnt_vf_isready(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);

	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
		return (false);

	if (sc->hn_vf_rdytick == 0)
		return (true);

	if (sc->hn_vf_rdytick > ticks)
		return (false);

	/* Mark VF as ready. */
	hn_xpnt_vf_setready(sc);
	return (true);
}
static void
hn_xpnt_vf_setenable(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
}
static void
hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
	if (clear_vf)
		sc->hn_vf_ifp = NULL;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
}
static void
hn_xpnt_vf_init(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));

	if (bootverbose) {
		if_printf(sc->hn_ifp, "try bringing up %s\n",
		    sc->hn_vf_ifp->if_xname);
	}

	/*
	 * Bring the VF up.
	 */
	hn_xpnt_vf_saveifflags(sc);
	sc->hn_vf_ifp->if_flags |= IFF_UP;
	error = hn_xpnt_vf_iocsetflags(sc);
	if (error) {
		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
		    sc->hn_vf_ifp->if_xname, error);
		return;
	}

	/*
	 * NOTE:
	 * Datapath setting must happen _after_ bringing the VF up.
	 */
	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);

	/*
	 * NOTE:
	 * Fix up RSS related bits _after_ the VF is brought up, since
	 * many VFs generate their RSS key during initialization.
	 */
	hn_vf_rss_fixup(sc, true);

	/* Mark transparent mode VF as enabled. */
	hn_xpnt_vf_setenable(sc);
}
static void
hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		goto done;
	if (sc->hn_vf_ifp == NULL)
		goto done;
	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
		goto done;

	if (sc->hn_vf_rdytick != 0) {
		/* Mark VF as ready. */
		hn_xpnt_vf_setready(sc);
	}

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
		/*
		 * Delayed VF initialization.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "delayed initialize %s\n",
			    sc->hn_vf_ifp->if_xname);
		}
		hn_xpnt_vf_init(sc);
	}
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    sc->hn_vf_ifp->if_xname);
		goto done;
	}

	if (hn_xpnt_vf && ifp->if_start != NULL) {
		/*
		 * ifnet.if_start is _not_ supported by transparent
		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
		 */
		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
		    "in transparent VF mode.\n", ifp->if_xname);
		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	if (ifp->if_index >= hn_vfmap_size) {
		struct ifnet **newmap;
		int newsize;

		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);

		memcpy(newmap, hn_vfmap,
		    sizeof(struct ifnet *) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);
		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}
	KASSERT(hn_vfmap[ifp->if_index] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
	hn_vfmap[ifp->if_index] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
	sc->hn_vf_ifp = ifp;
	rm_wunlock(&sc->hn_vf_lock);

	if (hn_xpnt_vf) {
		int wait_ticks;

		/*
		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
		 * Save vf_ifp's current if_input for later restoration.
		 */
		sc->hn_vf_input = ifp->if_input;
		ifp->if_input = hn_xpnt_vf_input;

		/*
		 * Stop link status management; use the VF's.
		 */
		hn_suspend_mgmt(sc);

		/*
		 * Give the VF some time to complete its attach routine.
		 */
		wait_ticks = hn_xpnt_vf_attwait * hz;
		sc->hn_vf_rdytick = ticks + wait_ticks;

		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
		    wait_ticks);
	}
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (sc->hn_vf_ifp == NULL)
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (hn_xpnt_vf) {
		/*
		 * Make sure that the delayed initialization is not running.
		 *
		 * NOTE:
		 * - This lock _must_ be released, since the hn_vf_init task
		 *   will try holding this lock.
		 * - It is safe to release this lock here, since the
		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
		 *
		 * XXX racy, if hn(4) ever detached.
		 */
		HN_UNLOCK(sc);
		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
		HN_LOCK(sc);

		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
		    sc->hn_ifp->if_xname));
		ifp->if_input = sc->hn_vf_input;
		sc->hn_vf_input = NULL;

		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);

		if (sc->hn_vf_rdytick == 0) {
			/*
			 * The VF was ready; restore some settings.
			 */
			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
			/*
			 * NOTE:
			 * There is _no_ need to fix up if_capenable and
			 * if_hwassist, since the if_capabilities before
			 * restoration was an intersection of the VF's
			 * if_capabilities and the synthetic device's
			 * if_capabilities.
			 */
			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
			sc->hn_ifp->if_hw_tsomaxsegcount =
			    sc->hn_saved_tsosegcnt;
			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
		}

		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			/*
			 * Restore RSS settings.
			 */
			hn_vf_rss_restore(sc);

			/*
			 * Resume link status management, which was suspended
			 * by hn_ifnet_attevent().
			 */
			hn_resume_mgmt(sc);
		}
	}

	/* Mark transparent mode VF as disabled. */
	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);

	rm_wlock(&hn_vfmap_lock);

	KASSERT(ifp->if_index < hn_vfmap_size,
	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
	if (hn_vfmap[ifp->if_index] != NULL) {
		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
		    ("%s: ifindex %d was mapped to %s",
		     ifp->if_xname, ifp->if_index,
		     hn_vfmap[ifp->if_index]->if_xname));
		hn_vfmap[ifp->if_index] = NULL;
	}

	rm_wunlock(&hn_vfmap_lock);
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_vf_ifp == ifp)
		if_link_state_change(sc->hn_ifp, link_state);
}
static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}
2052 struct hn_softc *sc = device_get_softc(dev);
2053 struct sysctl_oid_list *child;
2054 struct sysctl_ctx_list *ctx;
2055 uint8_t eaddr[ETHER_ADDR_LEN];
2056 struct ifnet *ifp = NULL;
2057 int error, ring_cnt, tx_ring_cnt;
2061 sc->hn_prichan = vmbus_get_channel(dev);
2063 rm_init(&sc->hn_vf_lock, "hnvf");
2064 if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
2065 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
2068 * Initialize these tunables once.
2070 sc->hn_agg_size = hn_tx_agg_size;
2071 sc->hn_agg_pkts = hn_tx_agg_pkts;
2074 * Setup taskqueue for transmission.
2076 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
2080 malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
2081 M_DEVBUF, M_WAITOK);
2082 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
2083 sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
2084 M_WAITOK, taskqueue_thread_enqueue,
2085 &sc->hn_tx_taskqs[i]);
2086 taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
2087 "%s tx%d", device_get_nameunit(dev), i);
2089 } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
2090 sc->hn_tx_taskqs = hn_tx_taskque;
2094 * Setup taskqueue for mangement tasks, e.g. link status.
2096 sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
2097 taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
2098 taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
2099 device_get_nameunit(dev));
2100 TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
2101 TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
2102 TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
2103 hn_netchg_status_taskfunc, sc);
2107 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
2109 sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
2110 taskqueue_thread_enqueue, &sc->hn_vf_taskq);
2111 taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
2112 device_get_nameunit(dev));
2113 TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
2114 hn_xpnt_vf_init_taskfunc, sc);
2118 * Allocate ifnet and setup its name earlier, so that if_printf
2119 * can be used by functions, which will be called after
2122 ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
2124 if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2127 * Initialize ifmedia earlier so that it can be unconditionally
2128 * destroyed, if error happened later on.
2130 ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
2133 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
2134 * to use (tx_ring_cnt).
2137 * The # of RX rings to use is same as the # of channels to use.
2139 ring_cnt = hn_chan_cnt;
2140 if (ring_cnt <= 0) {
2142 ring_cnt = mp_ncpus;
2143 if (ring_cnt > HN_RING_CNT_DEF_MAX)
2144 ring_cnt = HN_RING_CNT_DEF_MAX;
2145 } else if (ring_cnt > mp_ncpus) {
2146 ring_cnt = mp_ncpus;
2149 tx_ring_cnt = hn_tx_ring_cnt;
2150 if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
2151 tx_ring_cnt = ring_cnt;
2152 #ifdef HN_IFSTART_SUPPORT
2153 if (hn_use_if_start) {
2154 /* ifnet.if_start only needs one TX ring. */
2160 * Set the leader CPU for channels.
2162 sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
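
	/*
	 * Illustrative note (example values assumed, not from the source):
	 * hn_cpu_index hands out ring-leader CPUs round-robin across
	 * devices, and HN_RING_IDX2CPU() then binds ring `idx' of this
	 * device to CPU (hn_cpu + idx) % mp_ncpus.  E.g. with
	 * mp_ncpus == 8 and two devices using 4 rings each, the first
	 * device's rings land on CPUs 0-3 and the second's on CPUs 4-7.
	 */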
	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

	error = hn_rndis_get_mtu(sc, &mtu);
	if (error)
		mtu = ETHERMTU;
	else if (bootverbose)
		device_printf(dev, "RNDIS mtu %u\n", mtu);

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fix up TX stuff after synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);

	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
	    "max # of TSO segments");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
	    "max size of TSO segment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disable polling");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_vf_sysctl, "A", "Virtual Function's name");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2305 hn_xpnt_vf_enabled_sysctl, "I",
2306 "Transparent VF enabled");
2307 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2308 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2309 hn_xpnt_vf_accbpf_sysctl, "I",
2310 "Accurate BPF for transparent VF");
2314 * Set up the ifmedia, which was initialized earlier.
2316 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2317 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2318 /* XXX ifmedia_set really should do this for us */
2319 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2322 * Set up the ifnet for this interface.
2326 ifp->if_baudrate = IF_Gbps(10);
2328 /* if_baudrate is 32 bits on a 32-bit system. */
2329 ifp->if_baudrate = IF_Gbps(1);
2331 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2332 ifp->if_ioctl = hn_ioctl;
2333 ifp->if_init = hn_init;
2334 #ifdef HN_IFSTART_SUPPORT
2335 if (hn_use_if_start) {
2336 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2338 ifp->if_start = hn_start;
2339 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2340 ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2341 IFQ_SET_READY(&ifp->if_snd);
2345 ifp->if_transmit = hn_transmit;
2346 ifp->if_qflush = hn_xmit_qflush;
2349 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2351 /* We cannot distinguish IPv6 packets from IPv4 packets on the RX path. */
2352 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2354 if (sc->hn_caps & HN_CAP_VLAN) {
2355 /* XXX not sure about VLAN_MTU. */
2356 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2359 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2360 if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2361 ifp->if_capabilities |= IFCAP_TXCSUM;
2362 if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2363 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2364 if (sc->hn_caps & HN_CAP_TSO4) {
2365 ifp->if_capabilities |= IFCAP_TSO4;
2366 ifp->if_hwassist |= CSUM_IP_TSO;
2368 if (sc->hn_caps & HN_CAP_TSO6) {
2369 ifp->if_capabilities |= IFCAP_TSO6;
2370 ifp->if_hwassist |= CSUM_IP6_TSO;
2373 /* Enable all available capabilities by default. */
2374 ifp->if_capenable = ifp->if_capabilities;
2377 * Disable IPv6 TSO and TXCSUM by default, they still can
2378 * be enabled through SIOCSIFCAP.
2380 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2381 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
2383 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2385 * Lock hn_set_tso_maxsize() to simplify its
2389 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2391 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2392 ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2395 ether_ifattach(ifp, eaddr);
2397 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2398 if_printf(ifp, "TSO segcnt %u segsz %u\n",
2399 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2401 if (mtu < ETHERMTU) {
2402 if_printf(ifp, "fixup mtu %lu -> %u\n", ifp->if_mtu, mtu);
2406 /* Inform the upper layer about the long frame support. */
2407 ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2410 * Kick off link status check.
2412 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2413 hn_update_link_status(sc);
2416 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2417 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2418 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2419 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2421 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2422 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2427 * Subscribe to the ether_ifattach event instead of the ifnet_arrival
2428 * event, since the interface's LLADDR is needed; the LLADDR is not
2429 * yet available when the ifnet_arrival event is triggered.
2431 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2432 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2433 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2434 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2438 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2439 hn_synth_detach(sc);
2445 hn_detach(device_t dev)
2447 struct hn_softc *sc = device_get_softc(dev);
2448 struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2450 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2452 * In case vmbus missed the orphan handler
2455 vmbus_xact_ctx_orphan(sc->hn_xact);
2458 if (sc->hn_ifaddr_evthand != NULL)
2459 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2460 if (sc->hn_ifnet_evthand != NULL)
2461 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2462 if (sc->hn_ifnet_atthand != NULL) {
2463 EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2464 sc->hn_ifnet_atthand);
2466 if (sc->hn_ifnet_dethand != NULL) {
2467 EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2468 sc->hn_ifnet_dethand);
2470 if (sc->hn_ifnet_lnkhand != NULL)
2471 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2473 vf_ifp = sc->hn_vf_ifp;
2474 __compiler_membar();
2476 hn_ifnet_detevent(sc, vf_ifp);
2478 if (device_is_attached(dev)) {
2480 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2481 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2485 * hn_stop() only suspends data, so the management
2486 * tasks have to be suspended manually here.
2488 hn_suspend_mgmt(sc);
2489 hn_synth_detach(sc);
2492 ether_ifdetach(ifp);
2495 ifmedia_removeall(&sc->hn_media);
2496 hn_destroy_rx_data(sc);
2497 hn_destroy_tx_data(sc);
2499 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2502 for (i = 0; i < hn_tx_taskq_cnt; ++i)
2503 taskqueue_free(sc->hn_tx_taskqs[i]);
2504 free(sc->hn_tx_taskqs, M_DEVBUF);
2506 taskqueue_free(sc->hn_mgmt_taskq0);
2507 if (sc->hn_vf_taskq != NULL)
2508 taskqueue_free(sc->hn_vf_taskq);
2510 if (sc->hn_xact != NULL) {
2512 * Uninstall the orphan handler _before_ the xact is
2515 vmbus_chan_unset_orphan(sc->hn_prichan);
2516 vmbus_xact_ctx_destroy(sc->hn_xact);
2521 HN_LOCK_DESTROY(sc);
2522 rm_destroy(&sc->hn_vf_lock);
2527 hn_shutdown(device_t dev)
2534 hn_link_status(struct hn_softc *sc)
2536 uint32_t link_status;
2539 error = hn_rndis_get_linkstatus(sc, &link_status);
2541 /* XXX what to do? */
2545 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2546 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2548 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2549 if_link_state_change(sc->hn_ifp,
2550 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2551 LINK_STATE_UP : LINK_STATE_DOWN);
2555 hn_link_taskfunc(void *xsc, int pending __unused)
2557 struct hn_softc *sc = xsc;
2559 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2565 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2567 struct hn_softc *sc = xsc;
2569 /* Prevent any link status checks from running. */
2570 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2573 * Fake up a [link down --> link up] state change; a 5 second
2574 * delay is used, which closely simulates the miibus reaction
2575 * to a link down event.
2577 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2578 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2579 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2580 &sc->hn_netchg_status, 5 * hz);
2584 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2586 struct hn_softc *sc = xsc;
2588 /* Re-allow link status checks. */
2589 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2594 hn_update_link_status(struct hn_softc *sc)
2597 if (sc->hn_mgmt_taskq != NULL)
2598 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2602 hn_change_network(struct hn_softc *sc)
2605 if (sc->hn_mgmt_taskq != NULL)
2606 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2610 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2611 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2613 struct mbuf *m = *m_head;
2616 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2618 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2619 m, segs, nsegs, BUS_DMA_NOWAIT);
2620 if (error == EFBIG) {
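/*
 * EFBIG: the mbuf chain has more segments than the DMA map
 * allows; collapse it down to HN_TX_DATA_SEGCNT_MAX segments
 * and retry the load once.
 */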
2623 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2627 *m_head = m = m_new;
2628 txr->hn_tx_collapsed++;
2630 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2631 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2634 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2635 BUS_DMASYNC_PREWRITE);
2636 txd->flags |= HN_TXD_FLAG_DMAMAP;
2642 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2645 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2646 ("put an onlist txd %#x", txd->flags));
2647 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2648 ("put an onagg txd %#x", txd->flags));
2650 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2651 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2654 if (!STAILQ_EMPTY(&txd->agg_list)) {
2655 struct hn_txdesc *tmp_txd;
2657 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2660 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2661 ("resursive aggregation on aggregated txdesc"));
2662 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2663 ("not aggregated txdesc"));
2664 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2665 ("aggregated txdesc uses dmamap"));
2666 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2667 ("aggregated txdesc consumes "
2668 "chimney sending buffer"));
2669 KASSERT(tmp_txd->chim_size == 0,
2670 ("aggregated txdesc has non-zero "
2671 "chimney sending size"));
2673 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2674 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2675 freed = hn_txdesc_put(txr, tmp_txd);
2676 KASSERT(freed, ("failed to free aggregated txdesc"));
2680 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2681 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2682 ("chim txd uses dmamap"));
2683 hn_chim_free(txr->hn_sc, txd->chim_index);
2684 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2686 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2687 bus_dmamap_sync(txr->hn_tx_data_dtag,
2688 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2689 bus_dmamap_unload(txr->hn_tx_data_dtag,
2691 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2694 if (txd->m != NULL) {
2699 txd->flags |= HN_TXD_FLAG_ONLIST;
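/*
 * Return the txdesc to the free store: either the spinlock
 * protected SLIST or the lockless buf_ring, depending on
 * HN_USE_TXDESC_BUFRING.
 */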
2700 #ifndef HN_USE_TXDESC_BUFRING
2701 mtx_lock_spin(&txr->hn_txlist_spin);
2702 KASSERT(txr->hn_txdesc_avail >= 0 &&
2703 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2704 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2705 txr->hn_txdesc_avail++;
2706 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2707 mtx_unlock_spin(&txr->hn_txlist_spin);
2708 #else /* HN_USE_TXDESC_BUFRING */
2710 atomic_add_int(&txr->hn_txdesc_avail, 1);
2712 buf_ring_enqueue(txr->hn_txdesc_br, txd);
2713 #endif /* !HN_USE_TXDESC_BUFRING */
2718 static __inline struct hn_txdesc *
2719 hn_txdesc_get(struct hn_tx_ring *txr)
2721 struct hn_txdesc *txd;
2723 #ifndef HN_USE_TXDESC_BUFRING
2724 mtx_lock_spin(&txr->hn_txlist_spin);
2725 txd = SLIST_FIRST(&txr->hn_txlist);
2727 KASSERT(txr->hn_txdesc_avail > 0,
2728 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2729 txr->hn_txdesc_avail--;
2730 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2732 mtx_unlock_spin(&txr->hn_txlist_spin);
2734 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2738 #ifdef HN_USE_TXDESC_BUFRING
2740 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2742 #endif /* HN_USE_TXDESC_BUFRING */
2743 KASSERT(txd->m == NULL && txd->refs == 0 &&
2744 STAILQ_EMPTY(&txd->agg_list) &&
2745 txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2746 txd->chim_size == 0 &&
2747 (txd->flags & HN_TXD_FLAG_ONLIST) &&
2748 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2749 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2750 txd->flags &= ~HN_TXD_FLAG_ONLIST;
2756 static __inline void
2757 hn_txdesc_hold(struct hn_txdesc *txd)
2760 /* 0->1 transition will never work */
2761 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2762 atomic_add_int(&txd->refs, 1);
2765 static __inline void
2766 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2769 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2770 ("recursive aggregation on aggregating txdesc"));
2772 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2773 ("already aggregated"));
2774 KASSERT(STAILQ_EMPTY(&txd->agg_list),
2775 ("recursive aggregation on to-be-aggregated txdesc"));
2777 txd->flags |= HN_TXD_FLAG_ONAGG;
2778 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2782 hn_tx_ring_pending(struct hn_tx_ring *txr)
2784 bool pending = false;
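/*
 * The ring has pending txdescs if not all of them are on the
 * free store, i.e. some sends are still outstanding.
 */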
2786 #ifndef HN_USE_TXDESC_BUFRING
2787 mtx_lock_spin(&txr->hn_txlist_spin);
2788 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2790 mtx_unlock_spin(&txr->hn_txlist_spin);
2792 if (!buf_ring_full(txr->hn_txdesc_br))
2798 static __inline void
2799 hn_txeof(struct hn_tx_ring *txr)
2801 txr->hn_has_txeof = 0;
2806 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2807 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2809 struct hn_txdesc *txd = sndc->hn_cbarg;
2810 struct hn_tx_ring *txr;
2813 KASSERT(txr->hn_chan == chan,
2814 ("channel mismatch, on chan%u, should be chan%u",
2815 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2817 txr->hn_has_txeof = 1;
2818 hn_txdesc_put(txr, txd);
2820 ++txr->hn_txdone_cnt;
2821 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
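/*
 * Enough sends have completed; if this ring is marked
 * OACTIVE, run the TX completion processing early so that
 * transmission can resume promptly.
 */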
2822 txr->hn_txdone_cnt = 0;
2823 if (txr->hn_oactive)
2829 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2831 #if defined(INET) || defined(INET6)
2832 struct lro_ctrl *lro = &rxr->hn_lro;
2833 struct lro_entry *queued;
2835 while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
2836 SLIST_REMOVE_HEAD(&lro->lro_active, next);
2837 tcp_lro_flush(lro, queued);
2843 * 'txr' could be NULL, if multiple channels and the
2844 * ifnet.if_start method are used.
2846 if (txr == NULL || !txr->hn_has_txeof)
2849 txr->hn_txdone_cnt = 0;
2853 static __inline uint32_t
2854 hn_rndis_pktmsg_offset(uint32_t ofs)
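/*
 * RNDIS packet message offsets are counted from the
 * rm_dataoffset field, not from the beginning of the message;
 * convert the message-relative offset accordingly.
 */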
2857 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2858 ("invalid RNDIS packet msg offset %u", ofs));
2859 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2862 static __inline void *
2863 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2864 size_t pi_dlen, uint32_t pi_type)
2866 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2867 struct rndis_pktinfo *pi;
2869 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2870 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2873 * Per-packet-info does not move; it only grows.
2876 * rm_pktinfooffset in this phase counts from the beginning
2877 * of rndis_packet_msg.
2879 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2880 ("%u pktinfo overflows RNDIS packet msg", pi_type));
2881 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2882 pkt->rm_pktinfolen);
2883 pkt->rm_pktinfolen += pi_size;
2885 pi->rm_size = pi_size;
2886 pi->rm_type = pi_type;
2887 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2889 return (pi->rm_data);
2893 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2895 struct hn_txdesc *txd;
2899 txd = txr->hn_agg_txd;
2900 KASSERT(txd != NULL, ("no aggregate txdesc"));
2903 * Since hn_txpkt() will reset this temporary stat, save
2904 * it now, so that oerrors can be updated properly if
2905 * hn_txpkt() ever fails.
2907 pkts = txr->hn_stat_pkts;
2910 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2911 * failure, save it for later freeing if hn_txpkt() ever
2915 error = hn_txpkt(ifp, txr, txd);
2916 if (__predict_false(error)) {
2917 /* txd is freed, but m is not. */
2920 txr->hn_flush_failed++;
2921 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2924 /* Reset all aggregation states. */
2925 txr->hn_agg_txd = NULL;
2926 txr->hn_agg_szleft = 0;
2927 txr->hn_agg_pktleft = 0;
2928 txr->hn_agg_prevpkt = NULL;
2934 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2939 if (txr->hn_agg_txd != NULL) {
2940 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2941 struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2942 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2946 * Update the previous RNDIS packet's total length;
2947 * it can be increased due to the mandatory alignment
2948 * padding for this RNDIS packet. Also update the
2949 * aggregating txdesc's chimney sending buffer size
2953 * Zero-out the padding, as required by the RNDIS spec.
2956 pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2957 agg_txd->chim_size += pkt->rm_len - olen;
2959 /* Link this txdesc to the parent. */
2960 hn_txdesc_agg(agg_txd, txd);
2962 chim = (uint8_t *)pkt + pkt->rm_len;
2963 /* Save the current packet for later fixup. */
2964 txr->hn_agg_prevpkt = chim;
2966 txr->hn_agg_pktleft--;
2967 txr->hn_agg_szleft -= pktsize;
2968 if (txr->hn_agg_szleft <=
2969 HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2971 * Probably can't aggregate more packets;
2972 * flush this aggregating txdesc proactively.
2974 txr->hn_agg_pktleft = 0;
2979 hn_flush_txagg(ifp, txr);
2981 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
2983 txr->hn_tx_chimney_tried++;
2984 txd->chim_index = hn_chim_alloc(txr->hn_sc);
2985 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
2987 txr->hn_tx_chimney++;
2989 chim = txr->hn_sc->hn_chim +
2990 (txd->chim_index * txr->hn_sc->hn_chim_szmax);
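/* chim now points at this txdesc's slot in the chimney sending buffer. */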
2992 if (txr->hn_agg_pktmax > 1 &&
2993 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2994 txr->hn_agg_txd = txd;
2995 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
2996 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
2997 txr->hn_agg_prevpkt = chim;
3004 * If this function fails, then both txd and m_head0 will be freed.
3007 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
3008 struct mbuf **m_head0)
3010 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
3011 int error, nsegs, i;
3012 struct mbuf *m_head = *m_head0;
3013 struct rndis_packet_msg *pkt;
3016 int pkt_hlen, pkt_size;
3018 pkt = txd->rndis_pkt;
3019 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
3020 if (pkt_size < txr->hn_chim_size) {
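/*
 * The packet fits into the chimney sending buffer; try the
 * chimney (copy) sending path, possibly aggregating this
 * packet with others.
 */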
3021 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
3025 if (txr->hn_agg_txd != NULL)
3026 hn_flush_txagg(ifp, txr);
3029 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
3030 pkt->rm_len = m_head->m_pkthdr.len;
3031 pkt->rm_dataoffset = 0;
3032 pkt->rm_datalen = m_head->m_pkthdr.len;
3033 pkt->rm_oobdataoffset = 0;
3034 pkt->rm_oobdatalen = 0;
3035 pkt->rm_oobdataelements = 0;
3036 pkt->rm_pktinfooffset = sizeof(*pkt);
3037 pkt->rm_pktinfolen = 0;
3038 pkt->rm_vchandle = 0;
3039 pkt->rm_reserved = 0;
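/*
 * NOTE: rm_pktinfooffset is kept as an offset from the beginning
 * of the RNDIS message while pktinfo is appended; it and
 * rm_dataoffset are converted to RNDIS-style offsets (see
 * hn_rndis_pktmsg_offset()) once all pktinfo is in place.
 */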
3041 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
3043 * Set the hash value for this packet, so that the host can
3044 * dispatch the TX done event for this packet back to this TX
3047 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3048 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
3049 *pi_data = txr->hn_tx_idx;
3052 if (m_head->m_flags & M_VLANTAG) {
3053 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3054 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
3055 *pi_data = NDIS_VLAN_INFO_MAKE(
3056 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
3057 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
3058 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
3061 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3062 #if defined(INET6) || defined(INET)
3063 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3064 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3066 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3067 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(
3068 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3069 m_head->m_pkthdr.tso_segsz);
3072 #if defined(INET6) && defined(INET)
3077 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(
3078 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen,
3079 m_head->m_pkthdr.tso_segsz);
3082 #endif /* INET6 || INET */
3083 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3084 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3085 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3086 if (m_head->m_pkthdr.csum_flags &
3087 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3088 *pi_data = NDIS_TXCSUM_INFO_IPV6;
3090 *pi_data = NDIS_TXCSUM_INFO_IPV4;
3091 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3092 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
3095 if (m_head->m_pkthdr.csum_flags &
3096 (CSUM_IP_TCP | CSUM_IP6_TCP)) {
3097 *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS(
3098 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3099 } else if (m_head->m_pkthdr.csum_flags &
3100 (CSUM_IP_UDP | CSUM_IP6_UDP)) {
3101 *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS(
3102 m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen);
3106 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3107 /* Fixup RNDIS packet message total length */
3108 pkt->rm_len += pkt_hlen;
3109 /* Convert RNDIS packet message offsets */
3110 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3111 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3114 * Fast path: Chimney sending.
3117 struct hn_txdesc *tgt_txd = txd;
3119 if (txr->hn_agg_txd != NULL) {
3120 tgt_txd = txr->hn_agg_txd;
3126 KASSERT(pkt == chim,
3127 ("RNDIS pkt not in chimney sending buffer"));
3128 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3129 ("chimney sending buffer is not used"));
3130 tgt_txd->chim_size += pkt->rm_len;
3132 m_copydata(m_head, 0, m_head->m_pkthdr.len,
3133 ((uint8_t *)chim) + pkt_hlen);
3135 txr->hn_gpa_cnt = 0;
3136 txr->hn_sendpkt = hn_txpkt_chim;
3140 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3141 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3142 ("chimney buffer is used"));
3143 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3145 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3146 if (__predict_false(error)) {
3150 * This mbuf is not linked w/ the txd yet, so free it now.
3155 freed = hn_txdesc_put(txr, txd);
3157 ("fail to free txd upon txdma error"));
3159 txr->hn_txdma_failed++;
3160 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3165 /* +1 RNDIS packet message */
3166 txr->hn_gpa_cnt = nsegs + 1;
3168 /* send packet with page buffer */
3169 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3170 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3171 txr->hn_gpa[0].gpa_len = pkt_hlen;
3174 * Fill the page buffers with mbuf info after the page
3175 * buffer for RNDIS packet message.
3177 for (i = 0; i < nsegs; ++i) {
3178 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3180 gpa->gpa_page = atop(segs[i].ds_addr);
3181 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3182 gpa->gpa_len = segs[i].ds_len;
3185 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3187 txr->hn_sendpkt = hn_txpkt_sglist;
3191 /* Set the completion routine */
3192 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3194 /* Update temporary stats for later use. */
3195 txr->hn_stat_pkts++;
3196 txr->hn_stat_size += m_head->m_pkthdr.len;
3197 if (m_head->m_flags & M_MCAST)
3198 txr->hn_stat_mcasts++;
3205 * If this function fails, then txd will be freed, but the mbuf
3206 * associated w/ the txd will _not_ be freed.
3209 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3211 int error, send_failed = 0, has_bpf;
3214 has_bpf = bpf_peers_present(ifp->if_bpf);
3217 * Make sure that this txd and any aggregated txds are not
3218 * freed before ETHER_BPF_MTAP.
3220 hn_txdesc_hold(txd);
3222 error = txr->hn_sendpkt(txr, txd);
3225 const struct hn_txdesc *tmp_txd;
3227 ETHER_BPF_MTAP(ifp, txd->m);
3228 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3229 ETHER_BPF_MTAP(ifp, tmp_txd->m);
3232 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3233 #ifdef HN_IFSTART_SUPPORT
3234 if (!hn_use_if_start)
3237 if_inc_counter(ifp, IFCOUNTER_OBYTES,
3239 if (txr->hn_stat_mcasts != 0) {
3240 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3241 txr->hn_stat_mcasts);
3244 txr->hn_pkts += txr->hn_stat_pkts;
3248 hn_txdesc_put(txr, txd);
3250 if (__predict_false(error)) {
3254 * This should happen only "really rarely".
3256 * XXX Too many RX to be acked or too many sideband
3257 * commands to run? Ask netvsc_channel_rollup()
3258 * to kick start later.
3260 txr->hn_has_txeof = 1;
3262 txr->hn_send_failed++;
3265 * Try sending again after setting hn_has_txeof,
3266 * in case we missed the last
3267 * netvsc_channel_rollup().
3271 if_printf(ifp, "send failed\n");
3274 * Caller will perform further processing on the
3275 * associated mbuf, so don't free it in hn_txdesc_put();
3276 * only unload it from the DMA map in hn_txdesc_put(),
3280 freed = hn_txdesc_put(txr, txd);
3282 ("fail to free txd upon send error"));
3284 txr->hn_send_failed++;
3287 /* Reset temporary stats, after this sending is done. */
3288 txr->hn_stat_size = 0;
3289 txr->hn_stat_pkts = 0;
3290 txr->hn_stat_mcasts = 0;
3296 * Append the specified data to the indicated mbuf chain.
3297 * Extend the mbuf chain if the new data does not fit in
3300 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3301 * There should be an equivalent in the kernel mbuf code,
3302 * but there does not appear to be one yet.
3304 * Differs from m_append() in that additional mbufs are
3305 * allocated with cluster size MJUMPAGESIZE, and filled
3308 * Return 1 if able to complete the job; otherwise 0.
3311 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3314 int remainder, space;
3316 for (m = m0; m->m_next != NULL; m = m->m_next)
3319 space = M_TRAILINGSPACE(m);
3322 * Copy into available space.
3324 if (space > remainder)
3326 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3331 while (remainder > 0) {
3333 * Allocate a new mbuf; could check space
3334 * and allocate a cluster instead.
3336 n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
3339 n->m_len = min(MJUMPAGESIZE, remainder);
3340 bcopy(cp, mtod(n, caddr_t), n->m_len);
3342 remainder -= n->m_len;
3346 if (m0->m_flags & M_PKTHDR)
3347 m0->m_pkthdr.len += len - remainder;
3349 return (remainder == 0);
3352 #if defined(INET) || defined(INET6)
3354 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
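/*
 * Newer LRO can defer processing through a per-lro_ctrl mbuf
 * queue; otherwise fall back to direct tcp_lro_rx().
 */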
3356 #if __FreeBSD_version >= 1100095
3357 if (hn_lro_mbufq_depth) {
3358 tcp_lro_queue_mbuf(lc, m);
3362 return tcp_lro_rx(lc, m, 0);
3367 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3368 const struct hn_rxinfo *info)
3370 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3372 int size, do_lro = 0, do_csum = 1, is_vf = 0;
3373 int hash_type = M_HASHTYPE_NONE;
3376 if (rxr->hn_rxvf_ifp != NULL) {
3378 * Non-transparent mode VF; pretend this packet is from
3381 ifp = rxr->hn_rxvf_ifp;
3383 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3384 /* Transparent mode VF. */
3388 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3391 * See the NOTE of hn_rndis_init_fixat(). This
3392 * function can be reached immediately after the
3393 * RNDIS is initialized but before the ifnet is
3394 * set up on the hn_attach() path; drop the unexpected
3400 if (__predict_false(dlen < ETHER_HDR_LEN)) {
3401 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3405 if (dlen <= MHLEN) {
3406 m_new = m_gethdr(M_NOWAIT, MT_DATA);
3407 if (m_new == NULL) {
3408 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3411 memcpy(mtod(m_new, void *), data, dlen);
3412 m_new->m_pkthdr.len = m_new->m_len = dlen;
3413 rxr->hn_small_pkts++;
3416 * Get an mbuf with a cluster. For packets 2K or less,
3417 * get a standard 2K cluster. For anything larger, get a
3418 * 4K cluster. Any buffers larger than 4K can cause problems
3419 * if looped around to the Hyper-V TX channel, so avoid them.
3422 if (dlen > MCLBYTES) {
3424 size = MJUMPAGESIZE;
3427 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3428 if (m_new == NULL) {
3429 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3433 hv_m_append(m_new, dlen, data);
3435 m_new->m_pkthdr.rcvif = ifp;
3437 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3440 /* receive side checksum offload */
3441 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3442 /* IP csum offload */
3443 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3444 m_new->m_pkthdr.csum_flags |=
3445 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3449 /* TCP/UDP csum offload */
3450 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3451 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3452 m_new->m_pkthdr.csum_flags |=
3453 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3454 m_new->m_pkthdr.csum_data = 0xffff;
3455 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3463 * As of this writing (Oct 28th, 2016), the host side will turn
3464 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3465 * the do_lro setting here is actually _not_ accurate. We
3466 * depend on the RSS hash type check to reset do_lro.
3468 if ((info->csum_info &
3469 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3470 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3473 const struct ether_header *eh;
3478 /* Checked at the beginning of this function. */
3479 KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
3481 eh = mtod(m_new, struct ether_header *);
3482 etype = ntohs(eh->ether_type);
3483 if (etype == ETHERTYPE_VLAN) {
3484 const struct ether_vlan_header *evl;
3486 hoff = sizeof(*evl);
3487 if (m_new->m_len < hoff)
3489 evl = mtod(m_new, struct ether_vlan_header *);
3490 etype = ntohs(evl->evl_proto);
3493 if (etype == ETHERTYPE_IP) {
3496 pr = hn_check_iplen(m_new, hoff);
3497 if (pr == IPPROTO_TCP) {
3499 (rxr->hn_trust_hcsum &
3500 HN_TRUST_HCSUM_TCP)) {
3501 rxr->hn_csum_trusted++;
3502 m_new->m_pkthdr.csum_flags |=
3503 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3504 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3505 m_new->m_pkthdr.csum_data = 0xffff;
3508 } else if (pr == IPPROTO_UDP) {
3510 (rxr->hn_trust_hcsum &
3511 HN_TRUST_HCSUM_UDP)) {
3512 rxr->hn_csum_trusted++;
3513 m_new->m_pkthdr.csum_flags |=
3514 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3515 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3516 m_new->m_pkthdr.csum_data = 0xffff;
3518 } else if (pr != IPPROTO_DONE && do_csum &&
3519 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3520 rxr->hn_csum_trusted++;
3521 m_new->m_pkthdr.csum_flags |=
3522 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3527 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3528 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3529 NDIS_VLAN_INFO_ID(info->vlan_info),
3530 NDIS_VLAN_INFO_PRI(info->vlan_info),
3531 NDIS_VLAN_INFO_CFI(info->vlan_info));
3532 m_new->m_flags |= M_VLANTAG;
3536 * If the VF is activated (transparent/non-transparent mode does not
3541 * hn(4) will only receive broadcast packets, multicast packets,
3542 * TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3545 * For non-transparent, we definitely _cannot_ enable LRO at
3546 * all, since the LRO flush will use hn(4) as the receiving
3547 * interface; i.e. hn_ifp->if_input(hn_ifp, m).
3553 * If the VF is activated (transparent/non-transparent mode does not
3554 * matter here), do _not_ mess with unsupported hash types or
3557 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3559 m_new->m_pkthdr.flowid = info->hash_value;
3561 hash_type = M_HASHTYPE_OPAQUE;
3562 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3563 NDIS_HASH_FUNCTION_TOEPLITZ) {
3564 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3569 * do_lro is reset if the hash types are not TCP
3570 * related. See the comment in the above csum_flags
3574 case NDIS_HASH_IPV4:
3575 hash_type = M_HASHTYPE_RSS_IPV4;
3579 case NDIS_HASH_TCP_IPV4:
3580 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3583 case NDIS_HASH_IPV6:
3584 hash_type = M_HASHTYPE_RSS_IPV6;
3588 case NDIS_HASH_IPV6_EX:
3589 hash_type = M_HASHTYPE_RSS_IPV6_EX;
3593 case NDIS_HASH_TCP_IPV6:
3594 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3597 case NDIS_HASH_TCP_IPV6_EX:
3598 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3602 } else if (!is_vf) {
3603 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3605 M_HASHTYPE_SET(m_new, hash_type);
3607 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3608 if (hn_ifp != ifp) {
3609 const struct ether_header *eh;
3612 * Non-transparent mode VF is activated.
3616 * Allow tapping on hn(4).
3618 ETHER_BPF_MTAP(hn_ifp, m_new);
3621 * Update hn(4)'s stats.
3623 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3624 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3625 /* Checked at the beginning of this function. */
3626 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3627 eh = mtod(m_new, struct ether_header *);
3628 if (ETHER_IS_MULTICAST(eh->ether_dhost))
3629 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3633 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3634 #if defined(INET) || defined(INET6)
3635 struct lro_ctrl *lro = &rxr->hn_lro;
3638 rxr->hn_lro_tried++;
3639 if (hn_lro_rx(lro, m_new) == 0) {
3646 ifp->if_input(ifp, m_new);
3652 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3654 struct hn_softc *sc = ifp->if_softc;
3655 struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3656 struct ifnet *vf_ifp;
3657 int mask, error = 0;
3658 struct ifrsskey *ifrk;
3659 struct ifrsshash *ifrh;
3664 if (ifr->ifr_mtu > HN_MTU_MAX) {
3671 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3676 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3677 /* Can't change MTU */
3683 if (ifp->if_mtu == ifr->ifr_mtu) {
3688 if (hn_xpnt_vf_isready(sc)) {
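/*
 * The transparent VF and hn(4) share one MTU; apply the
 * new MTU to the VF first.
 */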
3689 vf_ifp = sc->hn_vf_ifp;
3691 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3692 sizeof(ifr_vf.ifr_name));
3693 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3697 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3698 vf_ifp->if_xname, ifr->ifr_mtu, error);
3704 * Suspend this interface before the synthetic parts
3710 * Detach the synthetic parts, i.e. NVS and RNDIS.
3712 hn_synth_detach(sc);
3715 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3716 * with the new MTU setting.
3718 error = hn_synth_attach(sc, ifr->ifr_mtu);
3724 error = hn_rndis_get_mtu(sc, &mtu);
3727 else if (bootverbose)
3728 if_printf(ifp, "RNDIS mtu %u\n", mtu);
3731 * Commit the requested MTU, after the synthetic parts
3732 * have been successfully attached.
3734 if (mtu >= ifr->ifr_mtu) {
3737 if_printf(ifp, "fixup mtu %d -> %u\n",
3743 * Synthetic parts' reattach may change the chimney
3744 * sending size; update it.
3746 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3747 hn_set_chim_size(sc, sc->hn_chim_szmax);
3750 * Make sure that various parameters based on MTU are
3751 * still valid, after the MTU change.
3753 hn_mtu_change_fixup(sc);
3756 * All done! Resume the interface now.
3760 if ((sc->hn_flags & HN_FLAG_RXVF) ||
3761 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3763 * Since we have reattached the NVS part,
3764 * change the datapath to VF again, in case
3765 * it was lost after the NVS was detached.
3767 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3776 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3781 if (hn_xpnt_vf_isready(sc))
3782 hn_xpnt_vf_saveifflags(sc);
3784 if (ifp->if_flags & IFF_UP) {
3785 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3787 * Caller might hold a mutex, e.g.
3788 * bpf; use busy-wait for the RNDIS
3792 hn_rxfilter_config(sc);
3795 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3796 error = hn_xpnt_vf_iocsetflags(sc);
3801 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3804 sc->hn_if_flags = ifp->if_flags;
3812 if (hn_xpnt_vf_isready(sc)) {
3814 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3815 sizeof(ifr_vf.ifr_name));
3816 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3822 * Fix up requested capabilities w/ supported capabilities,
3823 * since the supported capabilities could have been changed.
3825 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3828 if (mask & IFCAP_TXCSUM) {
3829 ifp->if_capenable ^= IFCAP_TXCSUM;
3830 if (ifp->if_capenable & IFCAP_TXCSUM)
3831 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3833 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3835 if (mask & IFCAP_TXCSUM_IPV6) {
3836 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3837 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3838 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3840 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3843 /* TODO: flip RNDIS offload parameters for RXCSUM. */
3844 if (mask & IFCAP_RXCSUM)
3845 ifp->if_capenable ^= IFCAP_RXCSUM;
3847 /* We cannot distinguish IPv6 packets from IPv4 packets on the RX path. */
3848 if (mask & IFCAP_RXCSUM_IPV6)
3849 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3852 if (mask & IFCAP_LRO)
3853 ifp->if_capenable ^= IFCAP_LRO;
3855 if (mask & IFCAP_TSO4) {
3856 ifp->if_capenable ^= IFCAP_TSO4;
3857 if (ifp->if_capenable & IFCAP_TSO4)
3858 ifp->if_hwassist |= CSUM_IP_TSO;
3860 ifp->if_hwassist &= ~CSUM_IP_TSO;
3862 if (mask & IFCAP_TSO6) {
3863 ifp->if_capenable ^= IFCAP_TSO6;
3864 if (ifp->if_capenable & IFCAP_TSO6)
3865 ifp->if_hwassist |= CSUM_IP6_TSO;
3867 ifp->if_hwassist &= ~CSUM_IP6_TSO;
3877 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3881 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3883 * Multicast uses mutex; use busy-wait for
3887 hn_rxfilter_config(sc);
3891 /* XXX vlan(4) style mcast addr maintenance */
3892 if (hn_xpnt_vf_isready(sc)) {
3895 old_if_flags = sc->hn_vf_ifp->if_flags;
3896 hn_xpnt_vf_saveifflags(sc);
3898 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3899 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3901 error = hn_xpnt_vf_iocsetflags(sc);
3910 if (hn_xpnt_vf_isready(sc)) {
3912 * SIOCGIFMEDIA expects ifmediareq, so don't
3913 * create and pass ifr_vf to the VF here; just
3914 * replace the ifr_name.
3916 vf_ifp = sc->hn_vf_ifp;
3917 strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3918 sizeof(ifr->ifr_name));
3919 error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3920 /* Restore the ifr_name. */
3921 strlcpy(ifr->ifr_name, ifp->if_xname,
3922 sizeof(ifr->ifr_name));
3927 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3930 case SIOCGIFRSSHASH:
3931 ifrh = (struct ifrsshash *)data;
3933 if (sc->hn_rx_ring_inuse == 1) {
3935 ifrh->ifrh_func = RSS_FUNC_NONE;
3936 ifrh->ifrh_types = 0;
3940 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3941 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3943 ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3944 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3949 ifrk = (struct ifrsskey *)data;
3951 if (sc->hn_rx_ring_inuse == 1) {
3953 ifrk->ifrk_func = RSS_FUNC_NONE;
3954 ifrk->ifrk_keylen = 0;
3957 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3958 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3960 ifrk->ifrk_func = RSS_FUNC_PRIVATE;
3961 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
3962 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
3963 NDIS_HASH_KEYSIZE_TOEPLITZ);
3968 error = ether_ioctl(ifp, cmd, data);
3975 hn_stop(struct hn_softc *sc, bool detaching)
3977 struct ifnet *ifp = sc->hn_ifp;
3982 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
3983 ("synthetic parts were not attached"));
3985 /* Clear RUNNING bit ASAP. */
3986 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3988 /* Disable polling. */
3991 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
3992 KASSERT(sc->hn_vf_ifp != NULL,
3993 ("%s: VF is not attached", ifp->if_xname));
3995 /* Mark transparent mode VF as disabled. */
3996 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
4000 * Datapath setting must happen _before_ bringing
4003 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
4006 * Bring the VF down.
4008 hn_xpnt_vf_saveifflags(sc);
4009 sc->hn_vf_ifp->if_flags &= ~IFF_UP;
4010 hn_xpnt_vf_iocsetflags(sc);
4013 /* Suspend data transfers. */
4014 hn_suspend_data(sc);
4016 /* Clear OACTIVE bit. */
4017 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4018 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4019 sc->hn_tx_ring[i].hn_oactive = 0;
4022 * If the non-transparent mode VF is active, make sure
4023 * that the RX filter still allows packet reception.
4025 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
4026 hn_rxfilter_config(sc);
4030 hn_init_locked(struct hn_softc *sc)
4032 struct ifnet *ifp = sc->hn_ifp;
4037 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
4040 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
4043 /* Configure RX filter */
4044 hn_rxfilter_config(sc);
4046 /* Clear OACTIVE bit. */
4047 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4048 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4049 sc->hn_tx_ring[i].hn_oactive = 0;
4051 /* Clear TX 'suspended' bit. */
4052 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
4054 if (hn_xpnt_vf_isready(sc)) {
4055 /* Initialize transparent VF. */
4056 hn_xpnt_vf_init(sc);
4059 /* Everything is ready; unleash! */
4060 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
4062 /* Re-enable polling if requested. */
4063 if (sc->hn_pollhz > 0)
4064 hn_polling(sc, sc->hn_pollhz);
4070 struct hn_softc *sc = xsc;
4077 #if __FreeBSD_version >= 1100099
4080 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4082 struct hn_softc *sc = arg1;
4083 unsigned int lenlim;
4086 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4087 error = sysctl_handle_int(oidp, &lenlim, 0, req);
4088 if (error || req->newptr == NULL)
4092 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4093 lenlim > TCP_LRO_LENGTH_MAX) {
4097 hn_set_lro_lenlim(sc, lenlim);
4104 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4106 struct hn_softc *sc = arg1;
4107 int ackcnt, error, i;
4110 * lro_ackcnt_lim is the append count limit;
4111 * +1 turns it into an aggregation limit.
4113 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4114 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4115 if (error || req->newptr == NULL)
4118 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4122 * Convert aggregation limit back to append
4127 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4128 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
4136 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4138 struct hn_softc *sc = arg1;
4143 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4146 error = sysctl_handle_int(oidp, &on, 0, req);
4147 if (error || req->newptr == NULL)
4151 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4152 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4155 rxr->hn_trust_hcsum |= hcsum;
4157 rxr->hn_trust_hcsum &= ~hcsum;
4164 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4166 struct hn_softc *sc = arg1;
4167 int chim_size, error;
4169 chim_size = sc->hn_tx_ring[0].hn_chim_size;
4170 error = sysctl_handle_int(oidp, &chim_size, 0, req);
4171 if (error || req->newptr == NULL)
4174 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4178 hn_set_chim_size(sc, chim_size);
4183 #if __FreeBSD_version < 1100095
4185 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4187 struct hn_softc *sc = arg1;
4188 int ofs = arg2, i, error;
4189 struct hn_rx_ring *rxr;
4193 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
4194 rxr = &sc->hn_rx_ring[i];
4195 stat += *((int *)((uint8_t *)rxr + ofs));
4198 error = sysctl_handle_64(oidp, &stat, 0, req);
4199 if (error || req->newptr == NULL)
4202 /* Zero out this stat. */
4203 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
4204 rxr = &sc->hn_rx_ring[i];
4205 *((int *)((uint8_t *)rxr + ofs)) = 0;
4211 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4213 struct hn_softc *sc = arg1;
4214 int ofs = arg2, i, error;
4215 struct hn_rx_ring *rxr;
4219 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4220 rxr = &sc->hn_rx_ring[i];
4221 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4224 error = sysctl_handle_64(oidp, &stat, 0, req);
4225 if (error || req->newptr == NULL)
4228 /* Zero out this stat. */
4229 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4230 rxr = &sc->hn_rx_ring[i];
4231 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4239 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4241 struct hn_softc *sc = arg1;
4242 int ofs = arg2, i, error;
4243 struct hn_rx_ring *rxr;
4247 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4248 rxr = &sc->hn_rx_ring[i];
4249 stat += *((u_long *)((uint8_t *)rxr + ofs));
4252 error = sysctl_handle_long(oidp, &stat, 0, req);
4253 if (error || req->newptr == NULL)
4256 /* Zero out this stat. */
4257 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4258 rxr = &sc->hn_rx_ring[i];
4259 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
4265 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4267 struct hn_softc *sc = arg1;
4268 int ofs = arg2, i, error;
4269 struct hn_tx_ring *txr;
4273 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4274 txr = &sc->hn_tx_ring[i];
4275 stat += *((u_long *)((uint8_t *)txr + ofs));
4278 error = sysctl_handle_long(oidp, &stat, 0, req);
4279 if (error || req->newptr == NULL)
4282 /* Zero out this stat. */
4283 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4284 txr = &sc->hn_tx_ring[i];
4285 *((u_long *)((uint8_t *)txr + ofs)) = 0;
4291 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4293 struct hn_softc *sc = arg1;
4294 int ofs = arg2, i, error, conf;
4295 struct hn_tx_ring *txr;
4297 txr = &sc->hn_tx_ring[0];
4298 conf = *((int *)((uint8_t *)txr + ofs));
4300 error = sysctl_handle_int(oidp, &conf, 0, req);
4301 if (error || req->newptr == NULL)
4305 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4306 txr = &sc->hn_tx_ring[i];
4307 *((int *)((uint8_t *)txr + ofs)) = conf;
4315 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4317 struct hn_softc *sc = arg1;
4320 size = sc->hn_agg_size;
4321 error = sysctl_handle_int(oidp, &size, 0, req);
4322 if (error || req->newptr == NULL)
4326 sc->hn_agg_size = size;
4334 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4336 struct hn_softc *sc = arg1;
4339 pkts = sc->hn_agg_pkts;
4340 error = sysctl_handle_int(oidp, &pkts, 0, req);
4341 if (error || req->newptr == NULL)
4345 sc->hn_agg_pkts = pkts;
4353 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4355 struct hn_softc *sc = arg1;
4358 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4359 return (sysctl_handle_int(oidp, &pkts, 0, req));
4363 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4365 struct hn_softc *sc = arg1;
4368 align = sc->hn_tx_ring[0].hn_agg_align;
4369 return (sysctl_handle_int(oidp, &align, 0, req));
4373 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4376 vmbus_chan_poll_disable(chan);
4378 vmbus_chan_poll_enable(chan, pollhz);
4382 hn_polling(struct hn_softc *sc, u_int pollhz)
4384 int nsubch = sc->hn_rx_ring_inuse - 1;
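/*
 * Each RX ring in use maps to a channel: apply the polling
 * configuration to the nsubch sub-channels and then to the
 * primary channel.
 */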
4389 struct vmbus_channel **subch;
4392 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4393 for (i = 0; i < nsubch; ++i)
4394 hn_chan_polling(subch[i], pollhz);
4395 vmbus_subchan_rel(subch, nsubch);
4397 hn_chan_polling(sc->hn_prichan, pollhz);
4401 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4403 struct hn_softc *sc = arg1;
4406 pollhz = sc->hn_pollhz;
4407 error = sysctl_handle_int(oidp, &pollhz, 0, req);
4408 if (error || req->newptr == NULL)
4412 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4416 if (sc->hn_pollhz != pollhz) {
4417 sc->hn_pollhz = pollhz;
4418 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4419 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4420 hn_polling(sc, sc->hn_pollhz);
4428 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4430 struct hn_softc *sc = arg1;
4433 snprintf(verstr, sizeof(verstr), "%u.%u",
4434 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4435 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4436 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4440 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4442 struct hn_softc *sc = arg1;
4449 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4450 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
4454 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4456 struct hn_softc *sc = arg1;
4457 char assist_str[128];
4461 hwassist = sc->hn_ifp->if_hwassist;
4463 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4464 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4468 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4470 struct hn_softc *sc = arg1;
4471 char filter_str[128];
4475 filter = sc->hn_rx_filter;
4477 snprintf(filter_str, sizeof(filter_str), "%b", filter,
4479 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4483 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4485 struct hn_softc *sc = arg1;
4490 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4491 if (error || req->newptr == NULL)
4494 if ((sc->hn_flags & HN_FLAG_RXVF) ||
4495 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4497 * The RSS key is synchronized w/ the VF's; don't allow users
4504 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4507 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4509 if (sc->hn_rx_ring_inuse > 1) {
4510 error = hn_rss_reconfig(sc);
4512 /* Not RSS capable, at least for now; just save the RSS key. */
4521 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4523 struct hn_softc *sc = arg1;
4528 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4529 if (error || req->newptr == NULL)
4533 * Don't allow RSS indirect table changes, if this interface
4534 * is not currently RSS capable.
4536 if (sc->hn_rx_ring_inuse == 1) {
4541 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4544 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4546 hn_rss_ind_fixup(sc);
4547 error = hn_rss_reconfig(sc);
4554 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4556 struct hn_softc *sc = arg1;
4561 hash = sc->hn_rss_hash;
4563 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4564 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4568 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4570 struct hn_softc *sc = arg1;
4575 hash = sc->hn_rss_hcap;
4577 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4578 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4582 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4584 struct hn_softc *sc = arg1;
4589 hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4591 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4592 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4596 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4598 struct hn_softc *sc = arg1;
4599 char vf_name[IFNAMSIZ + 1];
4600 struct ifnet *vf_ifp;
4604 vf_ifp = sc->hn_vf_ifp;
4606 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4608 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4612 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4614 struct hn_softc *sc = arg1;
4615 char vf_name[IFNAMSIZ + 1];
4616 struct ifnet *vf_ifp;
4620 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4622 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4624 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4628 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4630 struct rm_priotracker pt;
4635 error = sysctl_wire_old_buffer(req, 0);
4639 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4643 rm_rlock(&hn_vfmap_lock, &pt);
4646 for (i = 0; i < hn_vfmap_size; ++i) {
4649 if (hn_vfmap[i] == NULL)
4652 ifp = ifnet_byindex(i);
4655 sbuf_printf(sb, "%s", ifp->if_xname);
4657 sbuf_printf(sb, " %s", ifp->if_xname);
4662 rm_runlock(&hn_vfmap_lock, &pt);
4664 error = sbuf_finish(sb);
4670 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4672 struct rm_priotracker pt;
4677 error = sysctl_wire_old_buffer(req, 0);
4681 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4685 rm_rlock(&hn_vfmap_lock, &pt);
4688 for (i = 0; i < hn_vfmap_size; ++i) {
4689 struct ifnet *ifp, *hn_ifp;
4691 hn_ifp = hn_vfmap[i];
4695 ifp = ifnet_byindex(i);
4698 sbuf_printf(sb, "%s:%s", ifp->if_xname,
4701 sbuf_printf(sb, " %s:%s", ifp->if_xname,
4708 rm_runlock(&hn_vfmap_lock, &pt);
4710 error = sbuf_finish(sb);
4716 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4718 struct hn_softc *sc = arg1;
4719 int error, onoff = 0;
4721 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4723 error = sysctl_handle_int(oidp, &onoff, 0, req);
4724 if (error || req->newptr == NULL)
4728 /* NOTE: hn_vf_lock for hn_transmit() */
4729 rm_wlock(&sc->hn_vf_lock);
4731 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4733 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4734 rm_wunlock(&sc->hn_vf_lock);
4741 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4743 struct hn_softc *sc = arg1;
4746 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4748 return (sysctl_handle_int(oidp, &enabled, 0, req));
4752 hn_check_iplen(const struct mbuf *m, int hoff)
4754 const struct ip *ip;
4755 int len, iphlen, iplen;
4756 const struct tcphdr *th;
4757 int thoff; /* TCP data offset */
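/*
 * Verify that the complete IP and TCP/UDP headers reside in
 * the leading mbuf; IPPROTO_DONE is returned when the packet
 * fails these checks.
 */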
4759 len = hoff + sizeof(struct ip);
4761 /* The packet must be at least the size of an IP header. */
4762 if (m->m_pkthdr.len < len)
4763 return IPPROTO_DONE;
4765 /* The fixed IP header must reside completely in the first mbuf. */
4767 return IPPROTO_DONE;
4769 ip = mtodo(m, hoff);
4771 /* Bound check the packet's stated IP header length. */
4772 iphlen = ip->ip_hl << 2;
4773 if (iphlen < sizeof(struct ip)) /* minimum header length */
4774 return IPPROTO_DONE;
4776 /* The full IP header must reside completely in the first mbuf. */
4777 if (m->m_len < hoff + iphlen)
4778 return IPPROTO_DONE;
4780 iplen = ntohs(ip->ip_len);
4783 * Check that the amount of data in the buffers is
4784 * at least as much as the IP header would have us expect.
4786 if (m->m_pkthdr.len < hoff + iplen)
4787 return IPPROTO_DONE;
4790 * Ignore IP fragments.
4792 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4793 return IPPROTO_DONE;
4796 * The TCP/IP or UDP/IP header must be entirely contained within
4797 * the first fragment of a packet.
4801 if (iplen < iphlen + sizeof(struct tcphdr))
4802 return IPPROTO_DONE;
4803 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4804 return IPPROTO_DONE;
4805 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4806 thoff = th->th_off << 2;
4807 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4808 return IPPROTO_DONE;
4809 if (m->m_len < hoff + iphlen + thoff)
4810 return IPPROTO_DONE;
4813 if (iplen < iphlen + sizeof(struct udphdr))
4814 return IPPROTO_DONE;
4815 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4816 return IPPROTO_DONE;
4820 return IPPROTO_DONE;
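/*
 * To make the checks above concrete: for a TCP segment with no IP or
 * TCP options, hn_check_iplen() demands at least hoff + 20 (fixed IP
 * header) + 20 (fixed TCP header) bytes, resident and contiguous in
 * the first mbuf, with consistent ip_len/ip_hl/th_off fields and no
 * fragmentation, before it will return the packet's protocol; any
 * violation yields IPPROTO_DONE.  (The 20-byte figures assume minimum
 * headers; options grow iphlen/thoff accordingly.)
 */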
4827 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4829 struct sysctl_oid_list *child;
4830 struct sysctl_ctx_list *ctx;
4831 device_t dev = sc->hn_dev;
4832 #if defined(INET) || defined(INET6)
4833 #if __FreeBSD_version >= 1100095
4840 * Create RXBUF for reception.
4843 * - It is shared by all channels.
4844 * - A large enough buffer is allocated; certain versions of NVS
4845 * may further limit the usable space.
4847 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4848 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4849 BUS_DMA_WAITOK | BUS_DMA_ZERO);
4850 if (sc->hn_rxbuf == NULL) {
4851 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4855 sc->hn_rx_ring_cnt = ring_cnt;
4856 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4858 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4859 M_DEVBUF, M_WAITOK | M_ZERO);
4861 #if defined(INET) || defined(INET6)
4862 #if __FreeBSD_version >= 1100095
4863 lroent_cnt = hn_lro_entry_count;
4864 if (lroent_cnt < TCP_LRO_ENTRIES)
4865 lroent_cnt = TCP_LRO_ENTRIES;
4867 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4869 #endif /* INET || INET6 */
4871 ctx = device_get_sysctl_ctx(dev);
4872 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4874 /* Create dev.hn.UNIT.rx sysctl tree */
4875 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4876 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4878 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4879 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4881 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4882 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4883 &rxr->hn_br_dma, BUS_DMA_WAITOK);
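/*
 * NOTE:
 * Each channel gets one contiguous DMA block holding both bufrings:
 * HN_TXBR_SIZE bytes of TX bufring followed by HN_RXBR_SIZE bytes
 * of RX bufring.  hn_chan_attach() later hands this same split to
 * vmbus_chan_open_br() through cbr_txsz/cbr_rxsz.
 */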
4884 if (rxr->hn_br == NULL) {
4885 device_printf(dev, "allocate bufring failed\n");
4889 if (hn_trust_hosttcp)
4890 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4891 if (hn_trust_hostudp)
4892 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4893 if (hn_trust_hostip)
4894 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4895 rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4896 rxr->hn_ifp = sc->hn_ifp;
4897 if (i < sc->hn_tx_ring_cnt)
4898 rxr->hn_txr = &sc->hn_tx_ring[i];
4899 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4900 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4902 rxr->hn_rxbuf = sc->hn_rxbuf;
4907 #if defined(INET) || defined(INET6)
4908 #if __FreeBSD_version >= 1100095
4909 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4910 hn_lro_mbufq_depth);
4912 tcp_lro_init(&rxr->hn_lro);
4913 rxr->hn_lro.ifp = sc->hn_ifp;
4915 #if __FreeBSD_version >= 1100099
4916 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4917 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4919 #endif /* INET || INET6 */
4921 if (sc->hn_rx_sysctl_tree != NULL) {
4925 * Create per RX ring sysctl tree:
4926 * dev.hn.UNIT.rx.RINGID
4928 snprintf(name, sizeof(name), "%d", i);
4929 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
4930 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
4931 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4933 if (rxr->hn_rx_sysctl_tree != NULL) {
4934 SYSCTL_ADD_ULONG(ctx,
4935 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4936 OID_AUTO, "packets", CTLFLAG_RW,
4937 &rxr->hn_pkts, "# of packets received");
4938 SYSCTL_ADD_ULONG(ctx,
4939 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4940 OID_AUTO, "rss_pkts", CTLFLAG_RW,
4942 "# of packets w/ RSS info received");
4944 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4945 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
4946 &rxr->hn_pktbuf_len, 0,
4947 "Temporary channel packet buffer length");
4952 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
4953 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4954 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
4955 #if __FreeBSD_version < 1100095
4956 hn_rx_stat_int_sysctl,
4958 hn_rx_stat_u64_sysctl,
4960 "LU", "LRO queued");
4961 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
4962 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4963 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
4964 #if __FreeBSD_version < 1100095
4965 hn_rx_stat_int_sysctl,
4967 hn_rx_stat_u64_sysctl,
4969 "LU", "LRO flushed");
4970 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
4971 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4972 __offsetof(struct hn_rx_ring, hn_lro_tried),
4973 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
4974 #if __FreeBSD_version >= 1100099
4975 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
4976 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4977 hn_lro_lenlim_sysctl, "IU",
4978 "Max # of data bytes to be aggregated by LRO");
4979 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
4980 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4981 hn_lro_ackcnt_sysctl, "I",
4982 "Max # of ACKs to be aggregated by LRO");
4984 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
4985 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
4986 hn_trust_hcsum_sysctl, "I",
4987 "Trust tcp segement verification on host side, "
4988 "when csum info is missing");
4989 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
4990 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
4991 hn_trust_hcsum_sysctl, "I",
4992 "Trust udp datagram verification on host side, "
4993 "when csum info is missing");
4994 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
4995 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
4996 hn_trust_hcsum_sysctl, "I",
4997 "Trust ip packet verification on host side, "
4998 "when csum info is missing");
4999 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
5000 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5001 __offsetof(struct hn_rx_ring, hn_csum_ip),
5002 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
5003 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
5004 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5005 __offsetof(struct hn_rx_ring, hn_csum_tcp),
5006 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
5007 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
5008 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5009 __offsetof(struct hn_rx_ring, hn_csum_udp),
5010 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
5011 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
5012 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5013 __offsetof(struct hn_rx_ring, hn_csum_trusted),
5014 hn_rx_stat_ulong_sysctl, "LU",
5015 "# of packets that we trust host's csum verification");
5016 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
5017 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5018 __offsetof(struct hn_rx_ring, hn_small_pkts),
5019 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
5020 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
5021 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5022 __offsetof(struct hn_rx_ring, hn_ack_failed),
5023 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
5024 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
5025 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
5026 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
5027 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
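/*
 * The nodes created above land under the per-device sysctl tree, i.e.
 * dev.hn.UNIT and dev.hn.UNIT.rx.RINGID.  For unit 0, for example:
 *
 *   $ sysctl dev.hn.0.rx_ring_cnt       # total RX rings created
 *   $ sysctl dev.hn.0.rx.0.packets      # packets seen by RX ring 0
 *   $ sysctl dev.hn.0.trust_hosttcp=1   # toggle a RW knob
 */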
5033 hn_destroy_rx_data(struct hn_softc *sc)
5037 if (sc->hn_rxbuf != NULL) {
5038 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
5039 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
5041 device_printf(sc->hn_dev, "RXBUF is referenced\n");
5042 sc->hn_rxbuf = NULL;
5045 if (sc->hn_rx_ring_cnt == 0)
5048 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5049 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5051 if (rxr->hn_br == NULL)
5053 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
5054 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
5056 device_printf(sc->hn_dev,
5057 "%dth channel bufring is referenced", i);
5061 #if defined(INET) || defined(INET6)
5062 tcp_lro_free(&rxr->hn_lro);
5064 free(rxr->hn_pktbuf, M_DEVBUF);
5066 free(sc->hn_rx_ring, M_DEVBUF);
5067 sc->hn_rx_ring = NULL;
5069 sc->hn_rx_ring_cnt = 0;
5070 sc->hn_rx_ring_inuse = 0;
5074 hn_tx_ring_create(struct hn_softc *sc, int id)
5076 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5077 device_t dev = sc->hn_dev;
5078 bus_dma_tag_t parent_dtag;
5082 txr->hn_tx_idx = id;
5084 #ifndef HN_USE_TXDESC_BUFRING
5085 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5087 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5089 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5090 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5091 M_DEVBUF, M_WAITOK | M_ZERO);
5092 #ifndef HN_USE_TXDESC_BUFRING
5093 SLIST_INIT(&txr->hn_txlist);
5095 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5096 M_WAITOK, &txr->hn_tx_lock);
5099 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5100 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5101 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5103 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5106 #ifdef HN_IFSTART_SUPPORT
5107 if (hn_use_if_start) {
5108 txr->hn_txeof = hn_start_txeof;
5109 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5110 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5116 txr->hn_txeof = hn_xmit_txeof;
5117 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5118 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5120 br_depth = hn_get_txswq_depth(txr);
5121 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5122 M_WAITOK, &txr->hn_tx_lock);
5125 txr->hn_direct_tx_size = hn_direct_tx_size;
5128 * Always schedule transmission instead of trying to do direct
5129 * transmission. This gives the best performance so far.
5131 txr->hn_sched_tx = 1;
5133 parent_dtag = bus_get_dma_tag(dev);
5135 /* DMA tag for RNDIS packet messages. */
5136 error = bus_dma_tag_create(parent_dtag, /* parent */
5137 HN_RNDIS_PKT_ALIGN, /* alignment */
5138 HN_RNDIS_PKT_BOUNDARY, /* boundary */
5139 BUS_SPACE_MAXADDR, /* lowaddr */
5140 BUS_SPACE_MAXADDR, /* highaddr */
5141 NULL, NULL, /* filter, filterarg */
5142 HN_RNDIS_PKT_LEN, /* maxsize */
5144 HN_RNDIS_PKT_LEN, /* maxsegsize */
5146 NULL, /* lockfunc */
5147 NULL, /* lockfuncarg */
5148 &txr->hn_tx_rndis_dtag);
5150 device_printf(dev, "failed to create rndis dmatag\n");
5154 /* DMA tag for data. */
5155 error = bus_dma_tag_create(parent_dtag, /* parent */
5157 HN_TX_DATA_BOUNDARY, /* boundary */
5158 BUS_SPACE_MAXADDR, /* lowaddr */
5159 BUS_SPACE_MAXADDR, /* highaddr */
5160 NULL, NULL, /* filter, filterarg */
5161 HN_TX_DATA_MAXSIZE, /* maxsize */
5162 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
5163 HN_TX_DATA_SEGSIZE, /* maxsegsize */
5165 NULL, /* lockfunc */
5166 NULL, /* lockfuncarg */
5167 &txr->hn_tx_data_dtag);
5169 device_printf(dev, "failed to create data dmatag\n");
5173 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5174 struct hn_txdesc *txd = &txr->hn_txdesc[i];
5177 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5178 STAILQ_INIT(&txd->agg_list);
5181 * Allocate and load RNDIS packet message.
5183 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5184 (void **)&txd->rndis_pkt,
5185 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5186 &txd->rndis_pkt_dmap);
5189 "failed to allocate rndis_packet_msg, %d\n", i);
5193 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5194 txd->rndis_pkt_dmap,
5195 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5196 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5200 "failed to load rndis_packet_msg, %d\n", i);
5201 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5202 txd->rndis_pkt, txd->rndis_pkt_dmap);
5206 /* DMA map for TX data. */
5207 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5211 "failed to allocate tx data dmamap\n");
5212 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5213 txd->rndis_pkt_dmap);
5214 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5215 txd->rndis_pkt, txd->rndis_pkt_dmap);
5219 /* All set, put it to list */
5220 txd->flags |= HN_TXD_FLAG_ONLIST;
5221 #ifndef HN_USE_TXDESC_BUFRING
5222 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5224 buf_ring_enqueue(txr->hn_txdesc_br, txd);
5227 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
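/*
 * At this point every txdesc carries a pre-allocated, pre-mapped
 * RNDIS packet message plus a DMA map for its mbuf data, so no
 * per-packet busdma setup is needed on the hot path.  The lifecycle,
 * as sketched from the surrounding code:
 *
 *	txd = hn_txdesc_get(txr);      // off the free list/bufring
 *	hn_encap(ifp, txr, txd, &m);   // build RNDIS msg, map mbuf
 *	hn_txpkt(ifp, txr, txd);       // hand to the channel
 *	                               // send-done completion returns
 *	                               // it via hn_txdesc_put()
 */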
5229 if (sc->hn_tx_sysctl_tree != NULL) {
5230 struct sysctl_oid_list *child;
5231 struct sysctl_ctx_list *ctx;
5235 * Create per TX ring sysctl tree:
5236 * dev.hn.UNIT.tx.RINGID
5238 ctx = device_get_sysctl_ctx(dev);
5239 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5241 snprintf(name, sizeof(name), "%d", id);
5242 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5243 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5245 if (txr->hn_tx_sysctl_tree != NULL) {
5246 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5249 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5250 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5251 "# of available TX descs");
5253 #ifdef HN_IFSTART_SUPPORT
5254 if (!hn_use_if_start)
5257 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5258 CTLFLAG_RD, &txr->hn_oactive, 0,
5261 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5262 CTLFLAG_RW, &txr->hn_pkts,
5263 "# of packets transmitted");
5264 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5265 CTLFLAG_RW, &txr->hn_sends, "# of sends");
5273 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5275 struct hn_tx_ring *txr = txd->txr;
5277 KASSERT(txd->m == NULL, ("still has mbuf installed"));
5278 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5280 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5281 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5282 txd->rndis_pkt_dmap);
5283 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5287 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5290 KASSERT(txd->refs == 0 || txd->refs == 1,
5291 ("invalid txd refs %d", txd->refs));
5293 /* Aggregated txds will be freed by their aggregating txd. */
5294 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5297 freed = hn_txdesc_put(txr, txd);
5298 KASSERT(freed, ("can't free txdesc"));
5303 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5307 if (txr->hn_txdesc == NULL)
5312 * Because the freeing of aggregated txds will be deferred
5313 * to the aggregating txd, two passes are used here:
5314 * - The first pass GCes any pending txds. This GC is necessary,
5315 * since if the channels are revoked, hypervisor will not
5316 * deliver send-done for all pending txds.
5317 * - The second pass frees the busdma stuffs, i.e. after all txds
5320 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5321 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5322 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5323 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5325 if (txr->hn_tx_data_dtag != NULL)
5326 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5327 if (txr->hn_tx_rndis_dtag != NULL)
5328 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5330 #ifdef HN_USE_TXDESC_BUFRING
5331 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5334 free(txr->hn_txdesc, M_DEVBUF);
5335 txr->hn_txdesc = NULL;
5337 if (txr->hn_mbuf_br != NULL)
5338 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5340 #ifndef HN_USE_TXDESC_BUFRING
5341 mtx_destroy(&txr->hn_txlist_spin);
5343 mtx_destroy(&txr->hn_tx_lock);
5347 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5349 struct sysctl_oid_list *child;
5350 struct sysctl_ctx_list *ctx;
5354 * Create TXBUF for chimney sending.
5356 * NOTE: It is shared by all channels.
5358 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5359 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5360 BUS_DMA_WAITOK | BUS_DMA_ZERO);
5361 if (sc->hn_chim == NULL) {
5362 device_printf(sc->hn_dev, "allocate txbuf failed\n");
5366 sc->hn_tx_ring_cnt = ring_cnt;
5367 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5369 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5370 M_DEVBUF, M_WAITOK | M_ZERO);
5372 ctx = device_get_sysctl_ctx(sc->hn_dev);
5373 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5375 /* Create dev.hn.UNIT.tx sysctl tree */
5376 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5377 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5379 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5382 error = hn_tx_ring_create(sc, i);
5387 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5388 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5389 __offsetof(struct hn_tx_ring, hn_no_txdescs),
5390 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5391 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5392 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5393 __offsetof(struct hn_tx_ring, hn_send_failed),
5394 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5395 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5396 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5397 __offsetof(struct hn_tx_ring, hn_txdma_failed),
5398 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5399 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5400 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5401 __offsetof(struct hn_tx_ring, hn_flush_failed),
5402 hn_tx_stat_ulong_sysctl, "LU",
5403 "# of packet transmission aggregation flush failure");
5404 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5405 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5406 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5407 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5408 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5409 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5410 __offsetof(struct hn_tx_ring, hn_tx_chimney),
5411 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5412 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5413 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5414 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5415 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5416 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5417 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5418 "# of total TX descs");
5419 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5420 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5421 "Chimney send packet size upper boundary");
5422 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5423 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5424 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5425 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5426 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5427 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5428 hn_tx_conf_int_sysctl, "I",
5429 "Size of the packet for direct transmission");
5430 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5431 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5432 __offsetof(struct hn_tx_ring, hn_sched_tx),
5433 hn_tx_conf_int_sysctl, "I",
5434 "Always schedule transmission "
5435 "instead of doing direct transmission");
5436 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5437 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5438 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5439 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5440 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5441 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5442 "Applied packet transmission aggregation size");
5443 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5444 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5445 hn_txagg_pktmax_sysctl, "I",
5446 "Applied packet transmission aggregation packets");
5447 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5448 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5449 hn_txagg_align_sysctl, "I",
5450 "Applied packet transmission aggregation alignment");
5456 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5460 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5461 sc->hn_tx_ring[i].hn_chim_size = chim_size;
5465 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5467 struct ifnet *ifp = sc->hn_ifp;
5473 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5476 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5477 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5478 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5480 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5481 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5482 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5484 if (tso_maxlen < tso_minlen)
5485 tso_maxlen = tso_minlen;
5486 else if (tso_maxlen > IP_MAXPACKET)
5487 tso_maxlen = IP_MAXPACKET;
5488 if (tso_maxlen > sc->hn_ndis_tso_szmax)
5489 tso_maxlen = sc->hn_ndis_tso_szmax;
5490 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5492 if (hn_xpnt_vf_isready(sc)) {
5493 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5494 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5496 ifp->if_hw_tsomax = hw_tsomax;
5498 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5502 hn_fixup_tx_data(struct hn_softc *sc)
5504 uint64_t csum_assist;
5507 hn_set_chim_size(sc, sc->hn_chim_szmax);
5508 if (hn_tx_chimney_size > 0 &&
5509 hn_tx_chimney_size < sc->hn_chim_szmax)
5510 hn_set_chim_size(sc, hn_tx_chimney_size);
5513 if (sc->hn_caps & HN_CAP_IPCS)
5514 csum_assist |= CSUM_IP;
5515 if (sc->hn_caps & HN_CAP_TCP4CS)
5516 csum_assist |= CSUM_IP_TCP;
5517 if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs)
5518 csum_assist |= CSUM_IP_UDP;
5519 if (sc->hn_caps & HN_CAP_TCP6CS)
5520 csum_assist |= CSUM_IP6_TCP;
5521 if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs)
5522 csum_assist |= CSUM_IP6_UDP;
5523 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5524 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
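/*
 * The loop above mirrors the host-advertised offload capabilities
 * into each TX ring.  E.g. a host reporting HN_CAP_IPCS |
 * HN_CAP_TCP4CS yields csum_assist == CSUM_IP | CSUM_IP_TCP, which
 * is what the rings will request from the host for outgoing packets
 * (and what is typically exported to the stack via if_hwassist).
 */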
5526 if (sc->hn_caps & HN_CAP_HASHVAL) {
5528 * Support HASHVAL pktinfo on TX path.
5531 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5532 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5533 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5538 hn_destroy_tx_data(struct hn_softc *sc)
5542 if (sc->hn_chim != NULL) {
5543 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5544 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5546 device_printf(sc->hn_dev,
5547 "chimney sending buffer is referenced");
5552 if (sc->hn_tx_ring_cnt == 0)
5555 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5556 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5558 free(sc->hn_tx_ring, M_DEVBUF);
5559 sc->hn_tx_ring = NULL;
5561 sc->hn_tx_ring_cnt = 0;
5562 sc->hn_tx_ring_inuse = 0;
5565 #ifdef HN_IFSTART_SUPPORT
5568 hn_start_taskfunc(void *xtxr, int pending __unused)
5570 struct hn_tx_ring *txr = xtxr;
5572 mtx_lock(&txr->hn_tx_lock);
5573 hn_start_locked(txr, 0);
5574 mtx_unlock(&txr->hn_tx_lock);
5578 hn_start_locked(struct hn_tx_ring *txr, int len)
5580 struct hn_softc *sc = txr->hn_sc;
5581 struct ifnet *ifp = sc->hn_ifp;
5584 KASSERT(hn_use_if_start,
5585 ("hn_start_locked is called, when if_start is disabled"));
5586 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5587 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5588 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5590 if (__predict_false(txr->hn_suspended))
5593 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5597 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5598 struct hn_txdesc *txd;
5599 struct mbuf *m_head;
5602 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5606 if (len > 0 && m_head->m_pkthdr.len > len) {
5608 * This send could be time consuming; let callers
5609 * dispatch the sending of this packet (and of any
5610 * follow-up packets) to the tx taskqueue.
5612 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5617 #if defined(INET6) || defined(INET)
5618 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5619 m_head = hn_tso_fixup(m_head);
5620 if (__predict_false(m_head == NULL)) {
5621 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5624 } else if (m_head->m_pkthdr.csum_flags &
5625 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5626 m_head = hn_set_hlen(m_head);
5627 if (__predict_false(m_head == NULL)) {
5628 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5634 txd = hn_txdesc_get(txr);
5636 txr->hn_no_txdescs++;
5637 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5638 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5642 error = hn_encap(ifp, txr, txd, &m_head);
5644 /* Both txd and m_head are freed */
5645 KASSERT(txr->hn_agg_txd == NULL,
5646 ("encap failed w/ pending aggregating txdesc"));
5650 if (txr->hn_agg_pktleft == 0) {
5651 if (txr->hn_agg_txd != NULL) {
5652 KASSERT(m_head == NULL,
5653 ("pending mbuf for aggregating txdesc"));
5654 error = hn_flush_txagg(ifp, txr);
5655 if (__predict_false(error)) {
5656 atomic_set_int(&ifp->if_drv_flags,
5661 KASSERT(m_head != NULL, ("mbuf was freed"));
5662 error = hn_txpkt(ifp, txr, txd);
5663 if (__predict_false(error)) {
5664 /* txd is freed, but m_head is not */
5665 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5666 atomic_set_int(&ifp->if_drv_flags,
5674 KASSERT(txr->hn_agg_txd != NULL,
5675 ("no aggregating txdesc"));
5676 KASSERT(m_head == NULL,
5677 ("pending mbuf for aggregating txdesc"));
5682 /* Flush pending aggregated transmission. */
5683 if (txr->hn_agg_txd != NULL)
5684 hn_flush_txagg(ifp, txr);
5689 hn_start(struct ifnet *ifp)
5691 struct hn_softc *sc = ifp->if_softc;
5692 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5694 if (txr->hn_sched_tx)
5697 if (mtx_trylock(&txr->hn_tx_lock)) {
5700 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5701 mtx_unlock(&txr->hn_tx_lock);
5706 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5710 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5712 struct hn_tx_ring *txr = xtxr;
5714 mtx_lock(&txr->hn_tx_lock);
5715 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5716 hn_start_locked(txr, 0);
5717 mtx_unlock(&txr->hn_tx_lock);
5721 hn_start_txeof(struct hn_tx_ring *txr)
5723 struct hn_softc *sc = txr->hn_sc;
5724 struct ifnet *ifp = sc->hn_ifp;
5726 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5728 if (txr->hn_sched_tx)
5731 if (mtx_trylock(&txr->hn_tx_lock)) {
5734 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5735 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5736 mtx_unlock(&txr->hn_tx_lock);
5738 taskqueue_enqueue(txr->hn_tx_taskq,
5744 * Release the OACTIVE flag earlier, in the hope that
5745 * others can catch up. The task will clear the
5746 * flag again while holding the hn_tx_lock to avoid possible
5749 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5750 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5754 #endif /* HN_IFSTART_SUPPORT */
5757 hn_xmit(struct hn_tx_ring *txr, int len)
5759 struct hn_softc *sc = txr->hn_sc;
5760 struct ifnet *ifp = sc->hn_ifp;
5761 struct mbuf *m_head;
5764 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5765 #ifdef HN_IFSTART_SUPPORT
5766 KASSERT(hn_use_if_start == 0,
5767 ("hn_xmit is called, when if_start is enabled"));
5769 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5771 if (__predict_false(txr->hn_suspended))
5774 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5777 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5778 struct hn_txdesc *txd;
5781 if (len > 0 && m_head->m_pkthdr.len > len) {
5783 * This send could be time consuming; let callers
5784 * dispatch the sending of this packet (and of any
5785 * follow-up packets) to the tx taskqueue.
5787 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5792 txd = hn_txdesc_get(txr);
5794 txr->hn_no_txdescs++;
5795 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5796 txr->hn_oactive = 1;
5800 error = hn_encap(ifp, txr, txd, &m_head);
5802 /* Both txd and m_head are freed; discard */
5803 KASSERT(txr->hn_agg_txd == NULL,
5804 ("encap failed w/ pending aggregating txdesc"));
5805 drbr_advance(ifp, txr->hn_mbuf_br);
5809 if (txr->hn_agg_pktleft == 0) {
5810 if (txr->hn_agg_txd != NULL) {
5811 KASSERT(m_head == NULL,
5812 ("pending mbuf for aggregating txdesc"));
5813 error = hn_flush_txagg(ifp, txr);
5814 if (__predict_false(error)) {
5815 txr->hn_oactive = 1;
5819 KASSERT(m_head != NULL, ("mbuf was freed"));
5820 error = hn_txpkt(ifp, txr, txd);
5821 if (__predict_false(error)) {
5822 /* txd is freed, but m_head is not */
5823 drbr_putback(ifp, txr->hn_mbuf_br,
5825 txr->hn_oactive = 1;
5832 KASSERT(txr->hn_agg_txd != NULL,
5833 ("no aggregating txdesc"));
5834 KASSERT(m_head == NULL,
5835 ("pending mbuf for aggregating txdesc"));
5840 drbr_advance(ifp, txr->hn_mbuf_br);
5843 /* Flush pending aggregated transmission. */
5844 if (txr->hn_agg_txd != NULL)
5845 hn_flush_txagg(ifp, txr);
5850 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5852 struct hn_softc *sc = ifp->if_softc;
5853 struct hn_tx_ring *txr;
5856 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5857 struct rm_priotracker pt;
5859 rm_rlock(&sc->hn_vf_lock, &pt);
5860 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5861 struct mbuf *m_bpf = NULL;
5864 obytes = m->m_pkthdr.len;
5865 if (m->m_flags & M_MCAST)
5868 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5869 if (bpf_peers_present(ifp->if_bpf)) {
5870 m_bpf = m_copypacket(m, M_NOWAIT);
5871 if (m_bpf == NULL) {
5873 * Failed to grab a shallow
5876 ETHER_BPF_MTAP(ifp, m);
5880 ETHER_BPF_MTAP(ifp, m);
5883 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5884 rm_runlock(&sc->hn_vf_lock, &pt);
5886 if (m_bpf != NULL) {
5888 ETHER_BPF_MTAP(ifp, m_bpf);
5892 if (error == ENOBUFS) {
5893 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5895 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5897 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5898 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5900 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5906 rm_runlock(&sc->hn_vf_lock, &pt);
5909 #if defined(INET6) || defined(INET)
5911 * Perform TSO packet header fixup or get l2/l3 header length now,
5912 * since packet headers should be cache-hot.
5914 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5915 m = hn_tso_fixup(m);
5916 if (__predict_false(m == NULL)) {
5917 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5920 } else if (m->m_pkthdr.csum_flags &
5921 (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) {
5923 if (__predict_false(m == NULL)) {
5924 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5931 * Select the TX ring based on flowid
5933 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
5934 #if defined(INET6) || defined(INET)
5937 if (m->m_pkthdr.len < 128 &&
5938 (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
5939 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
5940 m = hn_check_tcpsyn(m, &tcpsyn);
5941 if (__predict_false(m == NULL)) {
5943 IFCOUNTER_OERRORS, 1);
5948 const int tcpsyn = 0;
5953 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
5955 txr = &sc->hn_tx_ring[idx];
5957 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
5959 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5963 if (txr->hn_oactive)
5966 if (txr->hn_sched_tx)
5969 if (mtx_trylock(&txr->hn_tx_lock)) {
5972 sched = hn_xmit(txr, txr->hn_direct_tx_size);
5973 mtx_unlock(&txr->hn_tx_lock);
5978 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
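/*
 * The TX ring selection above is a plain modulo over the rings in
 * use: with hn_tx_ring_inuse == 4, flowids 0..7 map to rings
 * 0,1,2,3,0,1,2,3.  Each flow therefore stays on one ring (keeping
 * its packets in order) while distinct flows spread across channels.
 */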
5983 hn_tx_ring_qflush(struct hn_tx_ring *txr)
5987 mtx_lock(&txr->hn_tx_lock);
5988 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
5990 mtx_unlock(&txr->hn_tx_lock);
5994 hn_xmit_qflush(struct ifnet *ifp)
5996 struct hn_softc *sc = ifp->if_softc;
5997 struct rm_priotracker pt;
6000 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
6001 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6004 rm_rlock(&sc->hn_vf_lock, &pt);
6005 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
6006 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
6007 rm_runlock(&sc->hn_vf_lock, &pt);
6011 hn_xmit_txeof(struct hn_tx_ring *txr)
6014 if (txr->hn_sched_tx)
6017 if (mtx_trylock(&txr->hn_tx_lock)) {
6020 txr->hn_oactive = 0;
6021 sched = hn_xmit(txr, txr->hn_direct_tx_size);
6022 mtx_unlock(&txr->hn_tx_lock);
6024 taskqueue_enqueue(txr->hn_tx_taskq,
6030 * Release the oactive flag earlier, in the hope that
6031 * others can catch up. The task will clear the
6032 * oactive flag again while holding the hn_tx_lock to avoid possible
6035 txr->hn_oactive = 0;
6036 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6041 hn_xmit_taskfunc(void *xtxr, int pending __unused)
6043 struct hn_tx_ring *txr = xtxr;
6045 mtx_lock(&txr->hn_tx_lock);
6047 mtx_unlock(&txr->hn_tx_lock);
6051 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
6053 struct hn_tx_ring *txr = xtxr;
6055 mtx_lock(&txr->hn_tx_lock);
6056 txr->hn_oactive = 0;
6058 mtx_unlock(&txr->hn_tx_lock);
6062 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
6064 struct vmbus_chan_br cbr;
6065 struct hn_rx_ring *rxr;
6066 struct hn_tx_ring *txr = NULL;
6069 idx = vmbus_chan_subidx(chan);
6072 * Link this channel to RX/TX ring.
6074 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6075 ("invalid channel index %d, should > 0 && < %d",
6076 idx, sc->hn_rx_ring_inuse));
6077 rxr = &sc->hn_rx_ring[idx];
6078 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
6079 ("RX ring %d already attached", idx));
6080 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
6081 rxr->hn_chan = chan;
6084 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6085 idx, vmbus_chan_id(chan));
6088 if (idx < sc->hn_tx_ring_inuse) {
6089 txr = &sc->hn_tx_ring[idx];
6090 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6091 ("TX ring %d already attached", idx));
6092 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6094 txr->hn_chan = chan;
6096 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6097 idx, vmbus_chan_id(chan));
6101 /* Bind this channel to a proper CPU. */
6102 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6107 cbr.cbr = rxr->hn_br;
6108 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6109 cbr.cbr_txsz = HN_TXBR_SIZE;
6110 cbr.cbr_rxsz = HN_RXBR_SIZE;
6111 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6113 if (error == EISCONN) {
6114 if_printf(sc->hn_ifp, "bufring is connected after "
6115 "chan%u open failure\n", vmbus_chan_id(chan));
6116 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6118 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6119 vmbus_chan_id(chan), error);
6126 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6128 struct hn_rx_ring *rxr;
6131 idx = vmbus_chan_subidx(chan);
6134 * Link this channel to RX/TX ring.
6136 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6137 ("invalid channel index %d, should > 0 && < %d",
6138 idx, sc->hn_rx_ring_inuse));
6139 rxr = &sc->hn_rx_ring[idx];
6140 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6141 ("RX ring %d is not attached", idx));
6142 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6144 if (idx < sc->hn_tx_ring_inuse) {
6145 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6147 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6148 ("TX ring %d is not attached attached", idx));
6149 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6153 * Close this channel.
6156 * Channel closing does _not_ destroy the target channel.
6158 error = vmbus_chan_close_direct(chan);
6159 if (error == EISCONN) {
6160 if_printf(sc->hn_ifp, "chan%u bufring is connected "
6161 "after being closed\n", vmbus_chan_id(chan));
6162 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6164 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6165 vmbus_chan_id(chan), error);
6170 hn_attach_subchans(struct hn_softc *sc)
6172 struct vmbus_channel **subchans;
6173 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6176 KASSERT(subchan_cnt > 0, ("no sub-channels"));
6178 /* Attach the sub-channels. */
6179 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6180 for (i = 0; i < subchan_cnt; ++i) {
6183 error1 = hn_chan_attach(sc, subchans[i]);
6186 /* Move on; all channels will be detached later. */
6189 vmbus_subchan_rel(subchans, subchan_cnt);
6192 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6195 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6203 hn_detach_allchans(struct hn_softc *sc)
6205 struct vmbus_channel **subchans;
6206 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6209 if (subchan_cnt == 0)
6212 /* Detach the sub-channels. */
6213 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6214 for (i = 0; i < subchan_cnt; ++i)
6215 hn_chan_detach(sc, subchans[i]);
6216 vmbus_subchan_rel(subchans, subchan_cnt);
6220 * Detach the primary channel, _after_ all sub-channels
6223 hn_chan_detach(sc, sc->hn_prichan);
6225 /* Wait for sub-channels to be destroyed, if any. */
6226 vmbus_subchan_drain(sc->hn_prichan);
6229 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6230 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6231 HN_RX_FLAG_ATTACHED) == 0,
6232 ("%dth RX ring is still attached", i));
6234 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6235 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6236 HN_TX_FLAG_ATTACHED) == 0,
6237 ("%dth TX ring is still attached", i));
6243 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6245 struct vmbus_channel **subchans;
6246 int nchan, rxr_cnt, error;
6248 nchan = *nsubch + 1;
6251 * Multiple RX/TX rings are not requested.
6258 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6261 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6263 /* No RSS; this is benign. */
6268 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6272 if (nchan > rxr_cnt)
6275 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6281 * Allocate sub-channels from NVS.
6283 *nsubch = nchan - 1;
6284 error = hn_nvs_alloc_subchans(sc, nsubch);
6285 if (error || *nsubch == 0) {
6286 /* Failed to allocate sub-channels. */
6292 * Wait for all sub-channels to become ready before moving on.
6294 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6295 vmbus_subchan_rel(subchans, *nsubch);
6300 hn_synth_attachable(const struct hn_softc *sc)
6304 if (sc->hn_flags & HN_FLAG_ERRORS)
6307 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6308 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6310 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6317 * Make sure that the RX filter is zero after the successful
6318 * RNDIS initialization.
6321 * Under certain conditions on certain versions of Hyper-V,
6322 * the RNDIS rxfilter is _not_ zero on the hypervisor side
6323 * after the successful RNDIS initialization, which breaks
6324 * the assumption of any following code (well, it breaks the
6325 * RNDIS API contract actually). Clear the RNDIS rxfilter
6326 * explicitly, drain packets sneaking through, and drain the
6327 * interrupt taskqueues scheduled due to the stealth packets.
6330 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
6334 hn_drain_rxtx(sc, nchan);
6338 hn_synth_attach(struct hn_softc *sc, int mtu)
6340 #define ATTACHED_NVS 0x0002
6341 #define ATTACHED_RNDIS 0x0004
6343 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6344 int error, nsubch, nchan = 1, i, rndis_inited;
6345 uint32_t old_caps, attached = 0;
6347 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6348 ("synthetic parts were attached"));
6350 if (!hn_synth_attachable(sc))
6353 /* Save capabilities for later verification. */
6354 old_caps = sc->hn_caps;
6357 /* Clear RSS state. */
6358 sc->hn_rss_ind_size = 0;
6359 sc->hn_rss_hash = 0;
6360 sc->hn_rss_hcap = 0;
6363 * Attach the primary channel _before_ attaching NVS and RNDIS.
6365 error = hn_chan_attach(sc, sc->hn_prichan);
6372 error = hn_nvs_attach(sc, mtu);
6375 attached |= ATTACHED_NVS;
6378 * Attach RNDIS _after_ NVS is attached.
6380 error = hn_rndis_attach(sc, mtu, &rndis_inited);
6382 attached |= ATTACHED_RNDIS;
6387 * Make sure capabilities are not changed.
6389 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6390 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6391 old_caps, sc->hn_caps);
6397 * Allocate sub-channels for multi-TX/RX rings.
6400 * The # of RX rings that can be used is equivalent to the # of
6401 * channels to be requested.
6403 nsubch = sc->hn_rx_ring_cnt - 1;
6404 error = hn_synth_alloc_subchans(sc, &nsubch);
6407 /* NOTE: _Full_ synthetic parts detach is required now. */
6408 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6411 * Set the # of TX/RX rings that could be used according to
6412 * the # of channels that NVS offered.
6415 hn_set_ring_inuse(sc, nchan);
6417 /* Only the primary channel can be used; done */
6422 * Attach the sub-channels.
6424 * NOTE: hn_set_ring_inuse() _must_ have been called.
6426 error = hn_attach_subchans(sc);
6431 * Configure RSS key and indirect table _after_ all sub-channels
6434 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6436 * RSS key is not set yet; set it to the default RSS key.
6439 if_printf(sc->hn_ifp, "setup default RSS key\n");
6440 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6441 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6444 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6446 * RSS indirect table is not set yet; set it up in round-
6450 if_printf(sc->hn_ifp, "setup default RSS indirect "
6453 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
6454 rss->rss_ind[i] = i % nchan;
6455 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6458 * # of usable channels may be changed, so we have to
6459 * make sure that all entries in RSS indirect table
6462 * NOTE: hn_set_ring_inuse() _must_ have been called.
6464 hn_rss_ind_fixup(sc);
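/*
 * With the round-robin fill above, nchan == 4 for instance gives an
 * indirect table of 0,1,2,3,0,1,2,3,... across all NDIS_HASH_INDCNT
 * entries, spreading RX traffic evenly over the opened channels;
 * hn_rss_ind_fixup() then rewrites any entry pointing at a channel
 * beyond the number actually in use.
 */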
6467 sc->hn_rss_hash = sc->hn_rss_hcap;
6468 if ((sc->hn_flags & HN_FLAG_RXVF) ||
6469 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6470 /* NOTE: Don't reconfigure RSS; will do immediately. */
6471 hn_vf_rss_fixup(sc, false);
6473 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6478 * Fixup transmission aggregation setup.
6481 hn_rndis_init_fixat(sc, nchan);
6485 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6486 hn_rndis_init_fixat(sc, nchan);
6487 hn_synth_detach(sc);
6489 if (attached & ATTACHED_RNDIS) {
6490 hn_rndis_init_fixat(sc, nchan);
6491 hn_rndis_detach(sc);
6493 if (attached & ATTACHED_NVS)
6495 hn_chan_detach(sc, sc->hn_prichan);
6496 /* Restore old capabilities. */
6497 sc->hn_caps = old_caps;
6501 #undef ATTACHED_RNDIS
6507 * The interface must have been suspended through hn_suspend(), before
6508 * this function gets called.
6511 hn_synth_detach(struct hn_softc *sc)
6514 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6515 ("synthetic parts were not attached"));
6517 /* Detach the RNDIS first. */
6518 hn_rndis_detach(sc);
6523 /* Detach all of the channels. */
6524 hn_detach_allchans(sc);
6526 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6530 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6532 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6533 ("invalid ring count %d", ring_cnt));
6535 if (sc->hn_tx_ring_cnt > ring_cnt)
6536 sc->hn_tx_ring_inuse = ring_cnt;
6538 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6539 sc->hn_rx_ring_inuse = ring_cnt;
6542 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6543 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6548 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6553 * The TX bufring will not be drained by the hypervisor,
6554 * if the primary channel is revoked.
6556 while (!vmbus_chan_rx_empty(chan) ||
6557 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6558 !vmbus_chan_tx_empty(chan)))
6560 vmbus_chan_intr_drain(chan);
6564 hn_disable_rx(struct hn_softc *sc)
6568 * Disable RX by clearing RX filter forcefully.
6570 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6571 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6574 * Give RNDIS enough time to flush all pending data packets.
6576 pause("waitrx", (200 * hz) / 1000);
6581 * RX/TX _must_ have been suspended/disabled, before this function
6585 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6587 struct vmbus_channel **subch = NULL;
6591 * Drain RX/TX bufrings and interrupts.
6595 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6597 if (subch != NULL) {
6600 for (i = 0; i < nsubch; ++i)
6601 hn_chan_drain(sc, subch[i]);
6603 hn_chan_drain(sc, sc->hn_prichan);
6606 vmbus_subchan_rel(subch, nsubch);
6610 hn_suspend_data(struct hn_softc *sc)
6612 struct hn_tx_ring *txr;
6620 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6621 txr = &sc->hn_tx_ring[i];
6623 mtx_lock(&txr->hn_tx_lock);
6624 txr->hn_suspended = 1;
6625 mtx_unlock(&txr->hn_tx_lock);
6626 /* No one is able send more packets now. */
6629 * Wait for all pending sends to finish.
6632 * We will _not_ receive all pending send-done, if the
6633 * primary channel is revoked.
6635 while (hn_tx_ring_pending(txr) &&
6636 !vmbus_chan_is_revoked(sc->hn_prichan))
6637 pause("hnwtx", 1 /* 1 tick */);
6648 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6651 * Drain any pending TX tasks.
6654 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6655 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6657 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6658 txr = &sc->hn_tx_ring[i];
6660 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6661 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6666 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6669 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6673 hn_suspend_mgmt(struct hn_softc *sc)
6680 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6681 * through hn_mgmt_taskq.
6683 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6684 vmbus_chan_run_task(sc->hn_prichan, &task);
6687 * Make sure that all pending management tasks are completed.
6689 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6690 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6691 taskqueue_drain_all(sc->hn_mgmt_taskq0);
6695 hn_suspend(struct hn_softc *sc)
6698 /* Disable polling. */
6702 * If the non-transparent mode VF is activated, the synthetic
6703 * device is receiving packets, so the data path of the
6704 * synthetic device must be suspended.
6706 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6707 (sc->hn_flags & HN_FLAG_RXVF))
6708 hn_suspend_data(sc);
6709 hn_suspend_mgmt(sc);
6713 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6717 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6718 ("invalid TX ring count %d", tx_ring_cnt));
6720 for (i = 0; i < tx_ring_cnt; ++i) {
6721 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6723 mtx_lock(&txr->hn_tx_lock);
6724 txr->hn_suspended = 0;
6725 mtx_unlock(&txr->hn_tx_lock);
6730 hn_resume_data(struct hn_softc *sc)
6739 hn_rxfilter_config(sc);
6742 * Make sure to clear suspend status on "all" TX rings,
6743 * since hn_tx_ring_inuse can be changed after
6744 * hn_suspend_data().
6746 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6748 #ifdef HN_IFSTART_SUPPORT
6749 if (!hn_use_if_start)
6753 * Flush unused drbrs, since hn_tx_ring_inuse may be
6756 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6757 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6763 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6764 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6767 * Use txeof task, so that any pending oactive can be
6770 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6775 hn_resume_mgmt(struct hn_softc *sc)
6778 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6781 * Kick off network change detection, if it was pending.
6782 * If no network change was pending, start link status
6783 * checks, which is more lightweight than network change
6786 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6787 hn_change_network(sc);
6789 hn_update_link_status(sc);
6793 hn_resume(struct hn_softc *sc)
6797 * If the non-transparent mode VF is activated, the synthetic
6798 * device has to receive packets, so the data path of the
6799 * synthetic device must be resumed.
6801 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6802 (sc->hn_flags & HN_FLAG_RXVF))
6806 * Don't resume link status change if VF is attached/activated.
6807 * - In the non-transparent VF mode, the synthetic device marks
6808 * link down until the VF is deactivated; i.e. VF is down.
6809 * - In transparent VF mode, VF's media status is used until
6810 * the VF is detached.
6812 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6813 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6817 * Re-enable polling if this interface is running and
6818 * the polling is requested.
6820 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6821 hn_polling(sc, sc->hn_pollhz);
6825 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6827 const struct rndis_status_msg *msg;
6830 if (dlen < sizeof(*msg)) {
6831 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6836 switch (msg->rm_status) {
6837 case RNDIS_STATUS_MEDIA_CONNECT:
6838 case RNDIS_STATUS_MEDIA_DISCONNECT:
6839 hn_update_link_status(sc);
6842 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6843 case RNDIS_STATUS_LINK_SPEED_CHANGE:
6844 /* Not really useful; ignore. */
6847 case RNDIS_STATUS_NETWORK_CHANGE:
6848 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6849 if (dlen < ofs + msg->rm_stbuflen ||
6850 msg->rm_stbuflen < sizeof(uint32_t)) {
6851 if_printf(sc->hn_ifp, "network changed\n");
6855 memcpy(&change, ((const uint8_t *)msg) + ofs,
6857 if_printf(sc->hn_ifp, "network changed, change %u\n",
6860 hn_change_network(sc);
6864 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6871 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6873 const struct rndis_pktinfo *pi = info_data;
6876 while (info_dlen != 0) {
6880 if (__predict_false(info_dlen < sizeof(*pi)))
6882 if (__predict_false(info_dlen < pi->rm_size))
6884 info_dlen -= pi->rm_size;
6886 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
6888 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
6890 dlen = pi->rm_size - pi->rm_pktinfooffset;
6893 switch (pi->rm_type) {
6894 case NDIS_PKTINFO_TYPE_VLAN:
6895 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
6897 info->vlan_info = *((const uint32_t *)data);
6898 mask |= HN_RXINFO_VLAN;
6901 case NDIS_PKTINFO_TYPE_CSUM:
6902 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
6904 info->csum_info = *((const uint32_t *)data);
6905 mask |= HN_RXINFO_CSUM;
6908 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
6909 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
6911 info->hash_value = *((const uint32_t *)data);
6912 mask |= HN_RXINFO_HASHVAL;
6915 case HN_NDIS_PKTINFO_TYPE_HASHINF:
6916 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
6918 info->hash_info = *((const uint32_t *)data);
6919 mask |= HN_RXINFO_HASHINF;
6926 if (mask == HN_RXINFO_ALL) {
6927 /* All found; done */
6931 pi = (const struct rndis_pktinfo *)
6932 ((const uint8_t *)pi + pi->rm_size);
6937 * - If there is no hash value, invalidate the hash info.
6939 if ((mask & HN_RXINFO_HASHVAL) == 0)
6940 info->hash_info = HN_NDIS_HASH_INFO_INVALID;
6944 static __inline bool
6945 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
6948 if (off < check_off) {
6949 if (__predict_true(off + len <= check_off))
6951 } else if (off > check_off) {
6952 if (__predict_true(check_off + check_len <= off))
6959 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
6961 const struct rndis_packet_msg *pkt;
6962 struct hn_rxinfo info;
6963 int data_off, pktinfo_off, data_len, pktinfo_len;
6968 if (__predict_false(dlen < sizeof(*pkt))) {
6969 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
6974 if (__predict_false(dlen < pkt->rm_len)) {
6975 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
6976 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
6979 if (__predict_false(pkt->rm_len <
6980 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
6981 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
6982 "msglen %u, data %u, oob %u, pktinfo %u\n",
6983 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
6984 pkt->rm_pktinfolen);
6987 if (__predict_false(pkt->rm_datalen == 0)) {
6988 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
6995 #define IS_OFFSET_INVALID(ofs) \
6996 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
6997 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
6999 /* XXX Hyper-V does not meet data offset alignment requirement */
7000 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
7001 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7002 "data offset %u\n", pkt->rm_dataoffset);
7005 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
7006 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
7007 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7008 "oob offset %u\n", pkt->rm_oobdataoffset);
7011 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
7012 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
7013 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7014 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
7018 #undef IS_OFFSET_INVALID
7020 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
7021 data_len = pkt->rm_datalen;
7022 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
7023 pktinfo_len = pkt->rm_pktinfolen;
7026 * Check OOB coverage.
7028 if (__predict_false(pkt->rm_oobdatalen != 0)) {
7029 int oob_off, oob_len;
7031 if_printf(rxr->hn_ifp, "got oobdata\n");
7032 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
7033 oob_len = pkt->rm_oobdatalen;
7035 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
7036 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7037 "oob overflow, msglen %u, oob abs %d len %d\n",
7038 pkt->rm_len, oob_off, oob_len);
7043 * Check against data.
7045 if (hn_rndis_check_overlap(oob_off, oob_len,
7046 data_off, data_len)) {
7047 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7048 "oob overlaps data, oob abs %d len %d, "
7049 "data abs %d len %d\n",
7050 oob_off, oob_len, data_off, data_len);
7055 * Check against pktinfo.
7057 if (pktinfo_len != 0 &&
7058 hn_rndis_check_overlap(oob_off, oob_len,
7059 pktinfo_off, pktinfo_len)) {
7060 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7061 "oob overlaps pktinfo, oob abs %d len %d, "
7062 "pktinfo abs %d len %d\n",
7063 oob_off, oob_len, pktinfo_off, pktinfo_len);
7069 * Check per-packet-info coverage and find useful per-packet-info.
7071 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
7072 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
7073 info.hash_info = HN_NDIS_HASH_INFO_INVALID;
7074 if (__predict_true(pktinfo_len != 0)) {
7078 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
7079 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7080 "pktinfo overflow, msglen %u, "
7081 "pktinfo abs %d len %d\n",
7082 pkt->rm_len, pktinfo_off, pktinfo_len);
7087 * Check packet info coverage.
7089 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7090 data_off, data_len);
7091 if (__predict_false(overlap)) {
7092 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7093 "pktinfo overlap data, pktinfo abs %d len %d, "
7094 "data abs %d len %d\n",
7095 pktinfo_off, pktinfo_len, data_off, data_len);
7100 * Find useful per-packet-info.
7102 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7103 pktinfo_len, &info);
7104 if (__predict_false(error)) {
7105 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7111 if (__predict_false(data_off + data_len > pkt->rm_len)) {
7112 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7113 "data overflow, msglen %u, data abs %d len %d\n",
7114 pkt->rm_len, data_off, data_len);
7117 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
7120 static __inline void
7121 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7123 const struct rndis_msghdr *hdr;
7125 if (__predict_false(dlen < sizeof(*hdr))) {
7126 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7131 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7132 /* Hot data path. */
7133 hn_rndis_rx_data(rxr, data, dlen);
7138 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7139 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7141 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7145 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7147 const struct hn_nvs_hdr *hdr;
7149 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7150 if_printf(sc->hn_ifp, "invalid nvs notify\n");
7153 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7155 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7156 /* Useless; ignore */
7159 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7163 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7164 const struct vmbus_chanpkt_hdr *pkt)
7166 struct hn_nvs_sendctx *sndc;
7168 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7169 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7170 VMBUS_CHANPKT_DATALEN(pkt));
7173 * 'sndc' CAN NOT be accessed anymore, since it can be freed by

static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

	/* Make sure that this is a RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}

	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}

	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}
		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
	}

	/*
	 * Ack the consumed RXBUF associated w/ this channel packet,
	 * so that this RXBUF can be recycled by the hypervisor.
	 */
	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}
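
/*
 * Illustrative sketch, not used by the code above: the rxbuf_cnt check
 * bounds the header length with __offsetof() over the trailing cp_rxbuf[]
 * array, i.e. the channel packet header must be large enough to actually
 * contain all 'count' ranges it claims.  The helper name is hypothetical.
 */
static __unused bool
hn_example_rxbuf_cnt_ok(const struct vmbus_chanpkt_rxbuf *pkt, int hlen)
{
	int count = pkt->cp_rxbuf_cnt;

	/* Header must cover cp_rxbuf[0] .. cp_rxbuf[count - 1]. */
	return (hlen >= __offsetof(struct vmbus_chanpkt_rxbuf,
	    cp_rxbuf[count]));
}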

static void
hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    uint64_t tid)
{
	struct hn_nvs_rndis_ack ack;
	int retries, error;

	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
	ack.nvs_status = HN_NVS_STATUS_OK;

	retries = 0;
again:
	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
	if (__predict_false(error == EAGAIN)) {
		/*
		 * NOTE:
		 * This should _not_ happen in real world, since the
		 * consumption of the TX bufring from the TX path is
		 * controlled.
		 */
		if (rxr->hn_ack_failed == 0)
			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
		rxr->hn_ack_failed++;
		retries++;
		if (retries < 10) {
			DELAY(100);
			goto again;
		}
		/* RXBUF leaks! */
		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
	}
}
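
/*
 * NOTE (assumption, for context): the RXBUF ack is a completion packet
 * and thus travels on the same guest-to-host bufring as the TX path;
 * EAGAIN above therefore means that bufring was momentarily full, which
 * the bounded DELAY()/retry loop rides out.
 */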

static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
	struct hn_rx_ring *rxr = xrxr;
	struct hn_softc *sc = rxr->hn_ifp->if_softc;

	for (;;) {
		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
		int error, pktlen;

		pktlen = rxr->hn_pktbuf_len;
		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
		if (__predict_false(error == ENOBUFS)) {
			void *nbuf;
			int nlen;

			/*
			 * Expand channel packet buffer.
			 *
			 * XXX
			 * Use M_WAITOK here, since allocation failure
			 * is fatal.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);

			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;

		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}
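
/*
 * Illustrative sketch, hypothetical helper not used above: the ENOBUFS
 * handling in hn_chan_callback() grows the packet buffer geometrically,
 * doubling from the current size until the reported packet length fits,
 * which keeps the cost of repeated expansions amortized.
 */
static __unused int
hn_example_grow_len(int curlen, int need)
{
	int nlen = curlen * 2;

	while (nlen < need)
		nlen *= 2;
	return (nlen);
}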

static void
hn_sysinit(void *arg __unused)
{
	int i;

	hn_udpcs_fixup = counter_u64_alloc(M_WAITOK);

#ifdef HN_IFSTART_SUPPORT
	/*
	 * Don't use ifnet.if_start if transparent VF mode is requested;
	 * mainly due to the IFF_DRV_OACTIVE flag.
	 */
	if (hn_xpnt_vf && hn_use_if_start) {
		hn_use_if_start = 0;
		printf("hn: transparent VF mode, if_transmit will be used, "
		    "instead of if_start\n");
	}
#endif
	if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
		printf("hn: invalid transparent VF attach routing "
		    "wait timeout %d, reset to %d\n",
		    hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
		hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
	}

	/*
	 * Initialize VF map.
	 */
	rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
	hn_vfmap_size = HN_VFMAP_SIZE_DEF;
	hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
	    M_WAITOK | M_ZERO);

	/*
	 * Fix the # of TX taskqueues.
	 */
	if (hn_tx_taskq_cnt <= 0)
		hn_tx_taskq_cnt = 1;
	else if (hn_tx_taskq_cnt > mp_ncpus)
		hn_tx_taskq_cnt = mp_ncpus;

	/*
	 * Fix the TX taskqueue mode.
	 */
	switch (hn_tx_taskq_mode) {
	case HN_TX_TASKQ_M_INDEP:
	case HN_TX_TASKQ_M_GLOBAL:
	case HN_TX_TASKQ_M_EVTTQ:
		break;
	default:
		hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
		break;
	}

	if (vm_guest != VM_GUEST_HV)
		return;

	if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
		return;

	hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
	    M_DEVBUF, M_WAITOK);
	for (i = 0; i < hn_tx_taskq_cnt; ++i) {
		hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
		    taskqueue_thread_enqueue, &hn_tx_taskque[i]);
		taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
		    "hn tx%d", i);
	}
}
SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);

static void
hn_sysuninit(void *arg __unused)
{

	if (hn_tx_taskque != NULL) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(hn_tx_taskque[i]);
		free(hn_tx_taskque, M_DEVBUF);
	}

	if (hn_vfmap != NULL)
		free(hn_vfmap, M_DEVBUF);
	rm_destroy(&hn_vfmap_lock);

	counter_u64_free(hn_udpcs_fixup);
}
SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);