/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/ethernet.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"
#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

#define HN_VFMAP_SIZE_DEF		8

#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)					\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
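/*
 * Illustration (not compiled) of how the size macros above combine,
 * assuming CACHE_LINE_SIZE is 64: a 1500-byte frame would reserve
 * roundup2(1500 + HN_RNDIS_PKT_LEN, 64) bytes of chimney sending
 * buffer space, while HN_PKTSIZE_MIN() performs the same computation
 * for the smallest valid Ethernet frame, VLAN tag included:
 *
 *	uint32_t sz  = HN_PKTSIZE(m, HN_RNDIS_PKT_ALIGN);
 *	uint32_t min = HN_PKTSIZE_MIN(HN_RNDIS_PKT_ALIGN);
 */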
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004
struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};

struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static void			hn_ifnet_event(void *, struct ifnet *, int);
static void			hn_ifaddr_event(void *, struct ifnet *);
static void			hn_ifnet_attevent(void *, struct ifnet *);
static void			hn_ifnet_detevent(void *, struct ifnet *);
static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);

static bool			hn_ismyvf(const struct hn_softc *,
				    const struct ifnet *);
static void			hn_rxvf_change(struct hn_softc *,
				    struct ifnet *, bool);
static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void			hn_rxvf_set_task(void *, int);
static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
				    struct ifreq *);
static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
static bool			hn_xpnt_vf_isready(struct hn_softc *);
static void			hn_xpnt_vf_setready(struct hn_softc *);
static void			hn_xpnt_vf_init_taskfunc(void *, int);
static void			hn_xpnt_vf_init(struct hn_softc *);
static void			hn_xpnt_vf_setenable(struct hn_softc *);
static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);
static void			hn_rndis_init_fixat(struct hn_softc *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
static void			hn_stop(struct hn_softc *, bool);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_disable_rx(struct hn_softc *);
static void			hn_drain_rxtx(struct hn_softc *, int);
static void			hn_polling(struct hn_softc *, u_int);
static void			hn_chan_polling(struct vmbus_channel *, u_int);
static void			hn_mtu_change_fixup(struct hn_softc *);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
static int			hn_rxfilter_config(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on the host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust TCP segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on the host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust UDP datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on the host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust IP packet verification on host side, "
    "when csum info is missing (global setting)");
/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packets for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vflist_sysctl, "A", "VF list");

SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vfmap_sysctl, "A", "VF mapping");
/* Transparent VF */
static int hn_xpnt_vf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");

static u_int hn_cpu_index;	/* next CPU for channel */
static struct taskqueue	**hn_tx_taskque;	/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};
static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}
static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
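/*
 * Sketch of how the two send paths above are chosen (the actual
 * decision lives in hn_encap()/hn_txpkt(), not here): small packets
 * are copied into a pre-posted chimney sending buffer and described
 * to the host by index/size, while larger packets are DMA-mapped and
 * sent as a guest physical address (GPA) scatter-gather list:
 *
 *	if (pkt_size <= chimney_size_limit)
 *		error = hn_txpkt_chim(txr, txd);	// copy path
 *	else
 *		error = hn_txpkt_sglist(txr, txd);	// DMA/SG path
 */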
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}
static void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
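/*
 * Worked example of the bitmap math above (hypothetical values, on a
 * 64-bit system where LONG_BIT == 64): chimney index 70 lives in
 * bmap[70 / 64] == bmap[1], at bit 70 % 64 == 6.  hn_chim_alloc()
 * finds a free bit via ffsl(~bmap[i]) and claims it with an atomic
 * test-and-set, so allocators racing on the same word simply move on
 * to the next candidate; hn_chim_free() releases the slot with
 * atomic_clear_long(&bmap[1], 1UL << 6).  No lock is needed in
 * either direction.
 */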
#if defined(INET6) || defined(INET)

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)
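/*
 * Usage note (illustrative, mirroring hn_tso_fixup() below): callers
 * must be functions that return a mbuf pointer, since the macro
 * returns NULL on m_pullup() failure, and they must re-derive any
 * pointer into the mbuf data area after each invocation, because
 * m_pullup() may substitute a different mbuf:
 *
 *	PULLUP_HDR(m, sizeof(struct ether_vlan_header));
 *	evl = mtod(m, struct ether_vlan_header *);
 */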
/*
 * NOTE: If this function fails, the mbuf would be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#if defined(INET)
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
}
/*
 * NOTE: If this function fails, the mbuf would be freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct ether_vlan_header *evl;
	const struct tcphdr *th;
	int ehlen;

	*tcpsyn = 0;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#if defined(INET)
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TCP) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);
		if (th->th_flags & TH_SYN)
			*tcpsyn = 1;
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP)
			return (m_head);

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));
		if (th->th_flags & TH_SYN)
			*tcpsyn = 1;
	}
#endif
	return (m_head);
}

#endif	/* INET6 || INET */
static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * promiscuous mode.
	 */
	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}
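/*
 * Example (hypothetical configuration) of the NDIS filter composed
 * above for an up, non-promiscuous interface with IFF_BROADCAST set
 * and at least one multicast membership:
 *
 *	NDIS_PACKET_TYPE_DIRECTED |
 *	NDIS_PACKET_TYPE_BROADCAST |
 *	NDIS_PACKET_TYPE_ALL_MULTICAST
 *
 * ALL_MULTICAST is used because per-address multicast filtering is
 * not implemented (see the TODO above): the host delivers every
 * multicast frame and the upper layers discard unwanted ones.
 */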
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable aggregation */
		size = 0;
		pkts = 0;
		goto done;
	}
	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable aggregation */
		size = 0;
		pkts = 0;
		goto done;
	}
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable aggregation */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return (txr->hn_txdesc_cnt);
	return (hn_tx_swq_depth);
}
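/*
 * Example (hypothetical values): with the hw.hn.tx_swq_depth tunable
 * left at its default of 0 and HN_TX_DESC_CNT == 512 descriptors per
 * ring, the software queue depth becomes 512; setting the tunable to
 * 1024 would be honored since it exceeds the descriptor count, while
 * any value below 512 is clamped up to the descriptor count.
 */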
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}
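/*
 * Worked example (hypothetical numbers): if the indirect table was
 * built while 8 channels were active and the device later runs with
 * hn_rx_ring_inuse == 4, entries pointing at rings 4-7 would steer
 * packets to channels that are no longer usable.  The loop above
 * clamps each such entry to the last usable ring:
 *
 *	before: 0 1 2 3 4 5 6 7 0 1 ...
 *	after:  0 1 2 3 3 3 3 3 0 1 ...
 */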
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}
static void
hn_rxvf_set_task(void *xarg, int pending __unused)
{
	struct hn_rxvf_setarg *arg = xarg;

	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
}

static void
hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
{
	struct hn_rx_ring *rxr;
	struct hn_rxvf_setarg arg;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			arg.rxr = rxr;
			arg.vf_ifp = vf_ifp;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_rxvf_ifp = vf_ifp;
		}
	}
}
static bool
hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
{
	const struct ifnet *hn_ifp;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		return (false);

	if (ifp->if_alloctype != IFT_ETHER)
		return (false);

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		return (false);

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		return (false);

	return (true);
}
static void
hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;
	if (!hn_ismyvf(sc, ifp))
		goto out;
	hn_ifp = sc->hn_ifp;

	if (rxvf) {
		if (sc->hn_flags & HN_FLAG_RXVF)
			goto out;
		sc->hn_flags |= HN_FLAG_RXVF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_RXVF))
			goto out;
		sc->hn_flags &= ~HN_FLAG_RXVF;
		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);

	hn_rxvf_set(sc, rxvf ? ifp : NULL);

	if (rxvf) {
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
	    rxvf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "datapath is switched %s %s\n",
		    rxvf ? "to" : "from", ifp->if_xname);
	}
out:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
{

	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;
	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
}

static void
hn_ifaddr_event(void *arg, struct ifnet *ifp)
{

	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
}
static int
hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
{
	struct ifnet *ifp, *vf_ifp;
	uint64_t tmp;
	int error;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Fix up requested capabilities w/ supported capabilities,
	 * since the supported capabilities could have been changed.
	 */
	ifr->ifr_reqcap &= ifp->if_capabilities;
	/* Pass SIOCSIFCAP to VF. */
	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);

	/*
	 * NOTE:
	 * The error will be propagated to the caller; however, it
	 * is _not_ useful here.
	 */

	/*
	 * Merge the VF's enabled capabilities.
	 */
	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
	if (ifp->if_capenable & IFCAP_TSO4)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
	if (ifp->if_capenable & IFCAP_TSO6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	return (error);
}
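/*
 * Example of the merge above (hypothetical values): if the synthetic
 * device advertises IFCAP_TXCSUM | IFCAP_TSO4 and the VF enables only
 * IFCAP_TXCSUM, the resulting if_capenable is IFCAP_TXCSUM, and
 * if_hwassist keeps the CSUM_IP* bits both sides support while the
 * CSUM_IP_TSO bit is cleared.  The synthetic ifnet thus never claims
 * an offload the underlying VF cannot perform.
 */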
static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	struct ifnet *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
}
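/*
 * The 0xffff/>>16 split above mirrors how SIOCSIFFLAGS packs the
 * 32-bit if_flags into struct ifreq: the low 16 bits travel in
 * ifr_flags and the high 16 bits in ifr_flagshigh.  For example
 * (hypothetical value), if_flags 0x18843 is carried as
 * ifr_flags 0x8843 and ifr_flagshigh 0x1.
 */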
static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
}
static void
hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	struct ifnet *hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (vf_ifp->if_index < hn_vfmap_size)
		hn_ifp = hn_vfmap[vf_ifp->if_index];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/*
			 * Allow tapping on the VF.
			 */
			ETHER_BPF_MTAP(vf_ifp, mn);

			/*
			 * Update the VF's statistics.
			 */
			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
				    mn->m_pkthdr.len);
			}
			/*
			 * XXX IFCOUNTER_IMCAST
			 * This stat updating is kinda invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check.  As of this writing,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF in vain.
			 */

			/*
			 * Fix up rcvif and increase hn(4)'s ipackets.
			 */
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		/*
		 * Go through hn(4)'s if_input.
		 */
		hn_ifp->if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * mbuf chain.
		 */
		while (m != NULL) {
			mn = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
			m = mn;
		}
	}
}
static void
hn_mtu_change_fixup(struct hn_softc *sc)
{
	struct ifnet *ifp;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;

	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
#endif
}
static void
hn_xpnt_vf_setready(struct hn_softc *sc)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Mark the VF ready.
	 */
	sc->hn_vf_rdytick = 0;

	/*
	 * Save information for restoration.
	 */
	sc->hn_saved_caps = ifp->if_capabilities;
	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;

	/*
	 * Intersect supported/enabled capabilities.
	 *
	 * NOTE:
	 * if_hwassist is not changed here.
	 */
	ifp->if_capabilities &= vf_ifp->if_capabilities;
	ifp->if_capenable &= ifp->if_capabilities;

	/*
	 * Fix TSO settings.
	 */
	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;

	/*
	 * Change the VF's enabled capabilities.
	 */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_reqcap = ifp->if_capenable;
	hn_xpnt_vf_iocsetcaps(sc, &ifr);

	if (ifp->if_mtu != ETHERMTU) {
		int error;

		/*
		 * Change the VF's MTU.
		 */
		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
		ifr.ifr_mtu = ifp->if_mtu;
		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
		if (error) {
			if_printf(ifp, "%s SIOCSIFMTU %lu failed\n",
			    vf_ifp->if_xname, ifp->if_mtu);
			if (ifp->if_mtu > ETHERMTU) {
				if_printf(ifp, "change MTU to %d\n", ETHERMTU);

				/*
				 * XXX
				 * No need to adjust the synthetic parts' MTU;
				 * failure of the adjustment will cause us
				 * infinite headache.
				 */
				ifp->if_mtu = ETHERMTU;
				hn_mtu_change_fixup(sc);
			}
		}
	}
}
static bool
hn_xpnt_vf_isready(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);

	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
		return (false);

	if (sc->hn_vf_rdytick == 0)
		return (true);

	if (sc->hn_vf_rdytick > ticks)
		return (false);

	/* Mark VF as ready. */
	hn_xpnt_vf_setready(sc);
	return (true);
}
static void
hn_xpnt_vf_setenable(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
}

static void
hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
	if (clear_vf)
		sc->hn_vf_ifp = NULL;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
}
static void
hn_xpnt_vf_init(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));

	if (bootverbose) {
		if_printf(sc->hn_ifp, "try bringing up %s\n",
		    sc->hn_vf_ifp->if_xname);
	}

	/*
	 * Bring the VF up.
	 */
	hn_xpnt_vf_saveifflags(sc);
	sc->hn_vf_ifp->if_flags |= IFF_UP;
	error = hn_xpnt_vf_iocsetflags(sc);
	if (error) {
		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
		    sc->hn_vf_ifp->if_xname, error);
		return;
	}

	/*
	 * NOTE:
	 * Datapath setting must happen _after_ bringing the VF up.
	 */
	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);

	/* Mark transparent mode VF as enabled. */
	hn_xpnt_vf_setenable(sc);
}
static void
hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		goto done;
	if (sc->hn_vf_ifp == NULL)
		goto done;
	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
		goto done;

	if (sc->hn_vf_rdytick != 0) {
		/* Mark VF as ready. */
		hn_xpnt_vf_setready(sc);
	}

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
		/*
		 * Delayed VF initialization.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "delayed initialize %s\n",
			    sc->hn_vf_ifp->if_xname);
		}
		hn_xpnt_vf_init(sc);
	}
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    sc->hn_vf_ifp->if_xname);
		goto done;
	}

	if (hn_xpnt_vf && ifp->if_start != NULL) {
		/*
		 * ifnet.if_start is _not_ supported by transparent
		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
		 */
		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
		    "in transparent VF mode.\n", ifp->if_xname);
		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	if (ifp->if_index >= hn_vfmap_size) {
		struct ifnet **newmap;
		int newsize;

		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);

		memcpy(newmap, hn_vfmap,
		    sizeof(struct ifnet *) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);

		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}
	KASSERT(hn_vfmap[ifp->if_index] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
	hn_vfmap[ifp->if_index] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
	sc->hn_vf_ifp = ifp;
	rm_wunlock(&sc->hn_vf_lock);

	if (hn_xpnt_vf) {
		int wait_ticks;

		/*
		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
		 * Save vf_ifp's current if_input for later restoration.
		 */
		sc->hn_vf_input = ifp->if_input;
		ifp->if_input = hn_xpnt_vf_input;

		/*
		 * Stop link status management; use the VF's.
		 */
		hn_suspend_mgmt(sc);

		/*
		 * Give the VF some time to complete its attach routine.
		 */
		wait_ticks = hn_xpnt_vf_attwait * hz;
		sc->hn_vf_rdytick = ticks + wait_ticks;

		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
		    wait_ticks);
	}
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (sc->hn_vf_ifp == NULL)
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (hn_xpnt_vf) {
		/*
		 * Make sure that the delayed initialization is not running.
		 *
		 * NOTE:
		 * - This lock _must_ be released, since the hn_vf_init task
		 *   will try holding this lock.
		 * - It is safe to release this lock here, since the
		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
		 *
		 * XXX racy, if hn(4) ever detached.
		 */
		HN_UNLOCK(sc);
		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
		HN_LOCK(sc);

		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
		    sc->hn_ifp->if_xname));
		ifp->if_input = sc->hn_vf_input;
		sc->hn_vf_input = NULL;

		if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);

		if (sc->hn_vf_rdytick == 0) {
			/*
			 * The VF was ready; restore some settings.
			 */
			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
			/*
			 * NOTE:
			 * There is _no_ need to fixup if_capenable and
			 * if_hwassist, since the if_capabilities before
			 * restoration was an intersection of the VF's
			 * if_capabilities and the synthetic device's
			 * if_capabilities.
			 */
			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
			sc->hn_ifp->if_hw_tsomaxsegcount =
			    sc->hn_saved_tsosegcnt;
			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
		}

		/*
		 * Resume link status management, which was suspended
		 * by hn_ifnet_attevent().
		 */
		hn_resume_mgmt(sc);
	}

	/* Mark transparent mode VF as disabled. */
	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);

	rm_wlock(&hn_vfmap_lock);

	KASSERT(ifp->if_index < hn_vfmap_size,
	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
	if (hn_vfmap[ifp->if_index] != NULL) {
		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
		    ("%s: ifindex %d was mapped to %s",
		     ifp->if_xname, ifp->if_index,
		     hn_vfmap[ifp->if_index]->if_xname));
		hn_vfmap[ifp->if_index] = NULL;
	}

	rm_wunlock(&hn_vfmap_lock);
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_vf_ifp == ifp)
		if_link_state_change(sc->hn_ifp, link_state);
}

static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}

	return ENXIO;
}
static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);
	rm_init(&sc->hn_vf_lock, "hnvf");
	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	if (hn_xpnt_vf) {
		/*
		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
		 */
		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
		    device_get_nameunit(dev));
		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
		    hn_xpnt_vf_init_taskfunc, sc);
	}

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions, which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if error happened later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fixup TX stuffs after synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);
	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
	    CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
	    "max # of TSO segments");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
	    CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
	    "max size of TSO segment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disable polling");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_vf_sysctl, "A", "Virtual Function's name");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_xpnt_vf_enabled_sysctl, "I",
	    "Transparent VF enabled");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_xpnt_vf_accbpf_sysctl, "I",
	    "Accurate BPF for transparent VF");
	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */

#ifdef __LP64__
	ifp->if_baudrate = IF_Gbps(10);
#else
	/* if_baudrate is 32bits on 32bit system. */
	ifp->if_baudrate = IF_Gbps(1);
#endif
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
#ifdef foo
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default, they still can
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		/*
		 * Lock hn_set_tso_maxsize() to simplify its
		 * internal logic.
		 */
		HN_LOCK(sc);
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		HN_UNLOCK(sc);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	if (!hn_xpnt_vf) {
		sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
		    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
		sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
		    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
	} else {
		sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
		    hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
	}

	/*
	 * NOTE:
	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
	 * since interface's LLADDR is needed; interface LLADDR is not
	 * available when ifnet_arrival event is triggered.
	 */
	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);

	return (0);
failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}
static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		/*
		 * In case that the vmbus missed the orphan handler
		 * installation.
		 */
		vmbus_xact_ctx_orphan(sc->hn_xact);
	}

	if (sc->hn_ifaddr_evthand != NULL)
		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
	if (sc->hn_ifnet_evthand != NULL)
		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
	if (sc->hn_ifnet_atthand != NULL) {
		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
		    sc->hn_ifnet_atthand);
	}
	if (sc->hn_ifnet_dethand != NULL) {
		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
		    sc->hn_ifnet_dethand);
	}
	if (sc->hn_ifnet_lnkhand != NULL)
		EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);

	vf_ifp = sc->hn_vf_ifp;
	__compiler_membar();
	if (vf_ifp != NULL)
		hn_ifnet_detevent(sc, vf_ifp);

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc, true);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * tasks have to be suspended manually here.
			 */
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);
		}
		HN_UNLOCK(sc);
		ether_ifdetach(ifp);
	}

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(sc->hn_tx_taskqs[i]);
		free(sc->hn_tx_taskqs, M_DEVBUF);
	}
	taskqueue_free(sc->hn_mgmt_taskq0);
	if (sc->hn_vf_taskq != NULL)
		taskqueue_free(sc->hn_vf_taskq);

	if (sc->hn_xact != NULL) {
		/*
		 * Uninstall the orphan handler _before_ the xact is
		 * destructed.
		 */
		vmbus_chan_unset_orphan(sc->hn_prichan);
		vmbus_xact_ctx_destroy(sc->hn_xact);
	}

	if_free(ifp);

	HN_LOCK_DESTROY(sc);
	rm_destroy(&sc->hn_vf_lock);
	return (0);
}
2179 hn_shutdown(device_t dev)
2186 hn_link_status(struct hn_softc *sc)
2188 uint32_t link_status;
2191 error = hn_rndis_get_linkstatus(sc, &link_status);
2193 /* XXX what to do? */
2197 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2198 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2200 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2201 if_link_state_change(sc->hn_ifp,
2202 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2203 LINK_STATE_UP : LINK_STATE_DOWN);
2207 hn_link_taskfunc(void *xsc, int pending __unused)
2209 struct hn_softc *sc = xsc;
2211 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2217 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2219 struct hn_softc *sc = xsc;
2221 /* Prevent any link status checks from running. */
2222 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2225 * Fake up a [link down --> link up] state change; a 5 second
2226 * delay is used, which closely simulates the miibus reaction
2227 * to a link down event.
2229 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2230 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2231 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2232 &sc->hn_netchg_status, 5 * hz);
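/*
 * NOTE: taskqueue_enqueue_timeout(9) takes its timeout in ticks, so
 * "5 * hz" above schedules hn_netchg_status_taskfunc() roughly five
 * seconds from now, matching the delay described in the comment.
 */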
2236 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2238 struct hn_softc *sc = xsc;
2240 /* Re-allow link status checks. */
2241 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2246 hn_update_link_status(struct hn_softc *sc)
2249 if (sc->hn_mgmt_taskq != NULL)
2250 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2254 hn_change_network(struct hn_softc *sc)
2257 if (sc->hn_mgmt_taskq != NULL)
2258 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2262 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2263 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2265 struct mbuf *m = *m_head;
2268 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2270 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2271 m, segs, nsegs, BUS_DMA_NOWAIT);
2272 if (error == EFBIG) {
2275 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2279 *m_head = m = m_new;
2280 txr->hn_tx_collapsed++;
2282 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2283 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2286 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2287 BUS_DMASYNC_PREWRITE);
2288 txd->flags |= HN_TXD_FLAG_DMAMAP;
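/*
 * NOTE: the EFBIG handling above is the usual busdma retry pattern:
 * when the mbuf chain needs more than HN_TX_DATA_SEGCNT_MAX segments,
 * m_collapse() compacts the chain and the load is retried once before
 * the send is given up.
 */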
2294 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2297 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2298 ("put an onlist txd %#x", txd->flags));
2299 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2300 ("put an onagg txd %#x", txd->flags));
2302 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2303 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2306 if (!STAILQ_EMPTY(&txd->agg_list)) {
2307 struct hn_txdesc *tmp_txd;
2309 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2312 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2313 ("resursive aggregation on aggregated txdesc"));
2314 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2315 ("not aggregated txdesc"));
2316 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2317 ("aggregated txdesc uses dmamap"));
2318 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2319 ("aggregated txdesc consumes "
2320 "chimney sending buffer"));
2321 KASSERT(tmp_txd->chim_size == 0,
2322 ("aggregated txdesc has non-zero "
2323 "chimney sending size"));
2325 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2326 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2327 freed = hn_txdesc_put(txr, tmp_txd);
2328 KASSERT(freed, ("failed to free aggregated txdesc"));
2332 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2333 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2334 ("chim txd uses dmamap"));
2335 hn_chim_free(txr->hn_sc, txd->chim_index);
2336 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2338 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2339 bus_dmamap_sync(txr->hn_tx_data_dtag,
2340 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2341 bus_dmamap_unload(txr->hn_tx_data_dtag,
2343 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2346 if (txd->m != NULL) {
2351 txd->flags |= HN_TXD_FLAG_ONLIST;
2352 #ifndef HN_USE_TXDESC_BUFRING
2353 mtx_lock_spin(&txr->hn_txlist_spin);
2354 KASSERT(txr->hn_txdesc_avail >= 0 &&
2355 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2356 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2357 txr->hn_txdesc_avail++;
2358 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2359 mtx_unlock_spin(&txr->hn_txlist_spin);
2360 #else /* HN_USE_TXDESC_BUFRING */
2362 atomic_add_int(&txr->hn_txdesc_avail, 1);
2364 buf_ring_enqueue(txr->hn_txdesc_br, txd);
2365 #endif /* !HN_USE_TXDESC_BUFRING */
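/*
 * txdesc reference counting sketch (inferred from the hold/put pairs
 * in this file; illustrative, not compiled):
 *
 *	txd = hn_txdesc_get(txr);	// taken off the free list/bufring
 *	hn_txdesc_hold(txd);		// refs 1 -> 2, e.g. across BPF tap
 *	hn_txdesc_put(txr, txd);	// refs 2 -> 1; txd stays alive
 *	hn_txdesc_put(txr, txd);	// refs 1 -> 0; txd is recycled
 *
 * Only the final put (atomic_fetchadd_int() returning 1) releases the
 * aggregated txdescs, the chimney slot or the DMA map, and returns the
 * txd to the free list/bufring as done above.
 */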
2370 static __inline struct hn_txdesc *
2371 hn_txdesc_get(struct hn_tx_ring *txr)
2373 struct hn_txdesc *txd;
2375 #ifndef HN_USE_TXDESC_BUFRING
2376 mtx_lock_spin(&txr->hn_txlist_spin);
2377 txd = SLIST_FIRST(&txr->hn_txlist);
2379 KASSERT(txr->hn_txdesc_avail > 0,
2380 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2381 txr->hn_txdesc_avail--;
2382 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2384 mtx_unlock_spin(&txr->hn_txlist_spin);
2386 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2390 #ifdef HN_USE_TXDESC_BUFRING
2392 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2394 #endif /* HN_USE_TXDESC_BUFRING */
2395 KASSERT(txd->m == NULL && txd->refs == 0 &&
2396 STAILQ_EMPTY(&txd->agg_list) &&
2397 txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2398 txd->chim_size == 0 &&
2399 (txd->flags & HN_TXD_FLAG_ONLIST) &&
2400 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2401 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2402 txd->flags &= ~HN_TXD_FLAG_ONLIST;
2408 static __inline void
2409 hn_txdesc_hold(struct hn_txdesc *txd)
2412 /* 0->1 transition will never work */
2413 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2414 atomic_add_int(&txd->refs, 1);
2417 static __inline void
2418 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2421 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2422 ("recursive aggregation on aggregating txdesc"));
2424 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2425 ("already aggregated"));
2426 KASSERT(STAILQ_EMPTY(&txd->agg_list),
2427 ("recursive aggregation on to-be-aggregated txdesc"));
2429 txd->flags |= HN_TXD_FLAG_ONAGG;
2430 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2434 hn_tx_ring_pending(struct hn_tx_ring *txr)
2436 bool pending = false;
2438 #ifndef HN_USE_TXDESC_BUFRING
2439 mtx_lock_spin(&txr->hn_txlist_spin);
2440 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2442 mtx_unlock_spin(&txr->hn_txlist_spin);
2444 if (!buf_ring_full(txr->hn_txdesc_br))
2450 static __inline void
2451 hn_txeof(struct hn_tx_ring *txr)
2453 txr->hn_has_txeof = 0;
2458 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2459 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2461 struct hn_txdesc *txd = sndc->hn_cbarg;
2462 struct hn_tx_ring *txr;
2465 KASSERT(txr->hn_chan == chan,
2466 ("channel mismatch, on chan%u, should be chan%u",
2467 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2469 txr->hn_has_txeof = 1;
2470 hn_txdesc_put(txr, txd);
2472 ++txr->hn_txdone_cnt;
2473 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2474 txr->hn_txdone_cnt = 0;
2475 if (txr->hn_oactive)
2481 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2483 #if defined(INET) || defined(INET6)
2484 struct lro_ctrl *lro = &rxr->hn_lro;
2485 struct lro_entry *queued;
2487 while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
2488 SLIST_REMOVE_HEAD(&lro->lro_active, next);
2489 tcp_lro_flush(lro, queued);
2495 * 'txr' could be NULL, if multiple channels and the
2496 * ifnet.if_start method are enabled.
2498 if (txr == NULL || !txr->hn_has_txeof)
2501 txr->hn_txdone_cnt = 0;
2505 static __inline uint32_t
2506 hn_rndis_pktmsg_offset(uint32_t ofs)
2509 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2510 ("invalid RNDIS packet msg offset %u", ofs));
2511 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
2514 static __inline void *
2515 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2516 size_t pi_dlen, uint32_t pi_type)
2518 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2519 struct rndis_pktinfo *pi;
2521 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2522 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2525 * Per-packet-info does not move; it only grows.
2528 * rm_pktinfooffset in this phase counts from the beginning
2529 * of rndis_packet_msg.
2531 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2532 ("%u pktinfo overflows RNDIS packet msg", pi_type));
2533 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2534 pkt->rm_pktinfolen);
2535 pkt->rm_pktinfolen += pi_size;
2537 pi->rm_size = pi_size;
2538 pi->rm_type = pi_type;
2539 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2541 return (pi->rm_data);
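/*
 * Usage sketch, mirroring hn_encap() below (illustrative only):
 *
 *	uint32_t *pi_data;
 *
 *	pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
 *	    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
 *	*pi_data = txr->hn_tx_idx;
 *
 * The returned pointer addresses the pi_dlen bytes of rm_data, which
 * the caller then fills in.
 */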
2545 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2547 struct hn_txdesc *txd;
2551 txd = txr->hn_agg_txd;
2552 KASSERT(txd != NULL, ("no aggregate txdesc"));
2555 * Since hn_txpkt() will reset this temporary stat, save
2556 * it now, so that oerrors can be updated properly, if
2557 * hn_txpkt() ever fails.
2559 pkts = txr->hn_stat_pkts;
2562 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2563 * failure, save it for later freeing, if hn_txpkt() ever fails.
2567 error = hn_txpkt(ifp, txr, txd);
2568 if (__predict_false(error)) {
2569 /* txd is freed, but m is not. */
2572 txr->hn_flush_failed++;
2573 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2576 /* Reset all aggregation states. */
2577 txr->hn_agg_txd = NULL;
2578 txr->hn_agg_szleft = 0;
2579 txr->hn_agg_pktleft = 0;
2580 txr->hn_agg_prevpkt = NULL;
2586 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2591 if (txr->hn_agg_txd != NULL) {
2592 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2593 struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2594 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2598 * Update the previous RNDIS packet's total length;
2599 * it can be increased due to the mandatory alignment
2600 * padding for this RNDIS packet. Also update the
2601 * aggregating txdesc's chimney sending buffer size accordingly.
2605 * Zero-out the padding, as required by the RNDIS spec.
2608 pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2609 agg_txd->chim_size += pkt->rm_len - olen;
2611 /* Link this txdesc to the parent. */
2612 hn_txdesc_agg(agg_txd, txd);
2614 chim = (uint8_t *)pkt + pkt->rm_len;
2615 /* Save the current packet for later fixup. */
2616 txr->hn_agg_prevpkt = chim;
2618 txr->hn_agg_pktleft--;
2619 txr->hn_agg_szleft -= pktsize;
2620 if (txr->hn_agg_szleft <=
2621 HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2623 * Probably can't aggregate more packets;
2624 * flush this aggregating txdesc proactively.
2626 txr->hn_agg_pktleft = 0;
2631 hn_flush_txagg(ifp, txr);
2633 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
2635 txr->hn_tx_chimney_tried++;
2636 txd->chim_index = hn_chim_alloc(txr->hn_sc);
2637 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
2639 txr->hn_tx_chimney++;
2641 chim = txr->hn_sc->hn_chim +
2642 (txd->chim_index * txr->hn_sc->hn_chim_szmax);
2644 if (txr->hn_agg_pktmax > 1 &&
2645 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2646 txr->hn_agg_txd = txd;
2647 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
2648 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
2649 txr->hn_agg_prevpkt = chim;
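/*
 * hn_try_txagg() summary (descriptive; inferred from the code above):
 * - If an aggregating txdesc exists and has room, the new packet is
 *   linked onto it and a pointer just past the previous, now padded,
 *   RNDIS packet in the chimney buffer is returned.
 * - Otherwise any pending aggregate is flushed, a fresh chimney slot
 *   is allocated, and aggregation is re-armed when hn_agg_pktmax and
 *   hn_agg_szmax permit.
 * - hn_encap() falls back to the sglist path when no chimney space is
 *   available.
 */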
2656 * If this function fails, then both txd and m_head0 will be freed.
2659 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2660 struct mbuf **m_head0)
2662 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
2663 int error, nsegs, i;
2664 struct mbuf *m_head = *m_head0;
2665 struct rndis_packet_msg *pkt;
2668 int pkt_hlen, pkt_size;
2670 pkt = txd->rndis_pkt;
2671 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
2672 if (pkt_size < txr->hn_chim_size) {
2673 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
2677 if (txr->hn_agg_txd != NULL)
2678 hn_flush_txagg(ifp, txr);
2681 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
2682 pkt->rm_len = m_head->m_pkthdr.len;
2683 pkt->rm_dataoffset = 0;
2684 pkt->rm_datalen = m_head->m_pkthdr.len;
2685 pkt->rm_oobdataoffset = 0;
2686 pkt->rm_oobdatalen = 0;
2687 pkt->rm_oobdataelements = 0;
2688 pkt->rm_pktinfooffset = sizeof(*pkt);
2689 pkt->rm_pktinfolen = 0;
2690 pkt->rm_vchandle = 0;
2691 pkt->rm_reserved = 0;
2693 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
2695 * Set the hash value for this packet, so that the host could
2696 * dispatch the TX done event for this packet back to this TX ring's channel.
2699 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2700 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
2701 *pi_data = txr->hn_tx_idx;
2704 if (m_head->m_flags & M_VLANTAG) {
2705 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2706 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
2707 *pi_data = NDIS_VLAN_INFO_MAKE(
2708 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
2709 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
2710 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
2713 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
2714 #if defined(INET6) || defined(INET)
2715 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2716 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
2718 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
2719 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
2720 m_head->m_pkthdr.tso_segsz);
2723 #if defined(INET6) && defined(INET)
2728 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
2729 m_head->m_pkthdr.tso_segsz);
2732 #endif /* INET6 || INET */
2733 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
2734 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2735 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
2736 if (m_head->m_pkthdr.csum_flags &
2737 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
2738 *pi_data = NDIS_TXCSUM_INFO_IPV6;
2740 *pi_data = NDIS_TXCSUM_INFO_IPV4;
2741 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
2742 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
2745 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
2746 *pi_data |= NDIS_TXCSUM_INFO_TCPCS;
2747 else if (m_head->m_pkthdr.csum_flags &
2748 (CSUM_IP_UDP | CSUM_IP6_UDP))
2749 *pi_data |= NDIS_TXCSUM_INFO_UDPCS;
2752 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
2753 /* Fixup RNDIS packet message total length */
2754 pkt->rm_len += pkt_hlen;
2755 /* Convert RNDIS packet message offsets */
2756 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
2757 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
2760 * Fast path: Chimney sending.
2763 struct hn_txdesc *tgt_txd = txd;
2765 if (txr->hn_agg_txd != NULL) {
2766 tgt_txd = txr->hn_agg_txd;
2772 KASSERT(pkt == chim,
2773 ("RNDIS pkt not in chimney sending buffer"));
2774 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
2775 ("chimney sending buffer is not used"));
2776 tgt_txd->chim_size += pkt->rm_len;
2778 m_copydata(m_head, 0, m_head->m_pkthdr.len,
2779 ((uint8_t *)chim) + pkt_hlen);
2781 txr->hn_gpa_cnt = 0;
2782 txr->hn_sendpkt = hn_txpkt_chim;
2786 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
2787 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2788 ("chimney buffer is used"));
2789 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
2791 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
2792 if (__predict_false(error)) {
2796 * This mbuf is not linked w/ the txd yet, so free it now.
2801 freed = hn_txdesc_put(txr, txd);
2803 ("fail to free txd upon txdma error"));
2805 txr->hn_txdma_failed++;
2806 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2811 /* +1 RNDIS packet message */
2812 txr->hn_gpa_cnt = nsegs + 1;
2814 /* send packet with page buffer */
2815 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
2816 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
2817 txr->hn_gpa[0].gpa_len = pkt_hlen;
2820 * Fill the page buffers with mbuf info, following the page
2821 * buffer for the RNDIS packet message.
2823 for (i = 0; i < nsegs; ++i) {
2824 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
2826 gpa->gpa_page = atop(segs[i].ds_addr);
2827 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
2828 gpa->gpa_len = segs[i].ds_len;
2831 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2833 txr->hn_sendpkt = hn_txpkt_sglist;
2837 /* Set the completion routine */
2838 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
2840 /* Update temporary stats for later use. */
2841 txr->hn_stat_pkts++;
2842 txr->hn_stat_size += m_head->m_pkthdr.len;
2843 if (m_head->m_flags & M_MCAST)
2844 txr->hn_stat_mcasts++;
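/*
 * hn_encap() therefore selects one of two send methods for hn_txpkt():
 * hn_txpkt_chim, when the whole RNDIS message has been copied into a
 * chimney sending buffer slot, or hn_txpkt_sglist with the hn_gpa
 * array (one page entry for the RNDIS header plus one per DMA segment)
 * otherwise.
 */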
2851 * If this function fails, then txd will be freed, but the mbuf
2852 * associated w/ the txd will _not_ be freed.
2855 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
2857 int error, send_failed = 0, has_bpf;
2860 has_bpf = bpf_peers_present(ifp->if_bpf);
2863 * Make sure that this txd and any aggregated txds are not
2864 * freed before ETHER_BPF_MTAP.
2866 hn_txdesc_hold(txd);
2868 error = txr->hn_sendpkt(txr, txd);
2871 const struct hn_txdesc *tmp_txd;
2873 ETHER_BPF_MTAP(ifp, txd->m);
2874 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
2875 ETHER_BPF_MTAP(ifp, tmp_txd->m);
2878 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
2879 #ifdef HN_IFSTART_SUPPORT
2880 if (!hn_use_if_start)
2883 if_inc_counter(ifp, IFCOUNTER_OBYTES,
2885 if (txr->hn_stat_mcasts != 0) {
2886 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
2887 txr->hn_stat_mcasts);
2890 txr->hn_pkts += txr->hn_stat_pkts;
2894 hn_txdesc_put(txr, txd);
2896 if (__predict_false(error)) {
2900 * This should happen only very rarely.
2902 * XXX Too many RX to be acked or too many sideband
2903 * commands to run? Ask netvsc_channel_rollup()
2904 * to kick start later.
2906 txr->hn_has_txeof = 1;
2908 txr->hn_send_failed++;
2911 * Try sending again after setting hn_has_txeof,
2912 * in case we missed the last
2913 * netvsc_channel_rollup().
2917 if_printf(ifp, "send failed\n");
2920 * Caller will perform further processing on the
2921 * associated mbuf, so don't free it in hn_txdesc_put();
2922 * only unload it from the DMA map in hn_txdesc_put(), if it's loaded.
2926 freed = hn_txdesc_put(txr, txd);
2928 ("fail to free txd upon send error"));
2930 txr->hn_send_failed++;
2933 /* Reset temporary stats, after this sending is done. */
2934 txr->hn_stat_size = 0;
2935 txr->hn_stat_pkts = 0;
2936 txr->hn_stat_mcasts = 0;
2942 * Append the specified data to the indicated mbuf chain,
2943 * extending the mbuf chain if the new data does not fit in the existing space.
2946 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2947 * There should be an equivalent in the kernel mbuf code,
2948 * but there does not appear to be one yet.
2950 * Differs from m_append() in that additional mbufs are
2951 * allocated with cluster size MJUMPAGESIZE, and filled accordingly.
2954 * Return 1 if able to complete the job; otherwise 0.
2957 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2960 int remainder, space;
2962 for (m = m0; m->m_next != NULL; m = m->m_next)
2965 space = M_TRAILINGSPACE(m);
2968 * Copy into available space.
2970 if (space > remainder)
2972 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2977 while (remainder > 0) {
2979 * Allocate a new mbuf; could check space
2980 * and allocate a cluster instead.
2982 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
2985 n->m_len = min(MJUMPAGESIZE, remainder);
2986 bcopy(cp, mtod(n, caddr_t), n->m_len);
2988 remainder -= n->m_len;
2992 if (m0->m_flags & M_PKTHDR)
2993 m0->m_pkthdr.len += len - remainder;
2995 return (remainder == 0);
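/*
 * Minimal usage sketch (illustrative; note that hn_rxpkt() below
 * currently ignores the return value):
 *
 *	m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
 *	if (m_new != NULL && !hv_m_append(m_new, dlen, data)) {
 *		m_freem(m_new);		// ran out of jumbo clusters
 *		m_new = NULL;
 *	}
 */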
2998 #if defined(INET) || defined(INET6)
3000 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3002 #if __FreeBSD_version >= 1100095
3003 if (hn_lro_mbufq_depth) {
3004 tcp_lro_queue_mbuf(lc, m);
3008 return (tcp_lro_rx(lc, m, 0));
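/*
 * With a non-zero hn_lro_mbufq_depth the mbuf is queued for deferred
 * LRO processing (tcp_lro_queue_mbuf(), FreeBSD >= 1100095); otherwise
 * it goes through the aggregation engine immediately via tcp_lro_rx(),
 * whose non-zero return means the packet was not consumed and must be
 * input normally.
 */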
3013 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3014 const struct hn_rxinfo *info)
3016 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3018 int size, do_lro = 0, do_csum = 1;
3019 int hash_type = M_HASHTYPE_OPAQUE;
3022 * If the non-transparent mode VF is active, inject this packet
3025 ifp = rxr->hn_rxvf_ifp ? rxr->hn_rxvf_ifp : hn_ifp;
3027 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3030 * See the NOTE of hn_rndis_init_fixat(). This
3031 * function can be reached immediately after the
3032 * RNDIS is initialized but before the ifnet is
3033 * set up on the hn_attach() path; drop the unexpected packets.
3039 if (__predict_false(dlen < ETHER_HDR_LEN)) {
3040 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3044 if (dlen <= MHLEN) {
3045 m_new = m_gethdr(M_NOWAIT, MT_DATA);
3046 if (m_new == NULL) {
3047 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3050 memcpy(mtod(m_new, void *), data, dlen);
3051 m_new->m_pkthdr.len = m_new->m_len = dlen;
3052 rxr->hn_small_pkts++;
3055 * Get an mbuf with a cluster. For packets 2K or less,
3056 * get a standard 2K cluster. For anything larger, get a
3057 * 4K cluster. Any buffers larger than 4K can cause problems
3058 * if looped around to the Hyper-V TX channel, so avoid them.
3061 if (dlen > MCLBYTES) {
3063 size = MJUMPAGESIZE;
3066 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3067 if (m_new == NULL) {
3068 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3072 hv_m_append(m_new, dlen, data);
3074 m_new->m_pkthdr.rcvif = ifp;
3076 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3079 /* receive side checksum offload */
3080 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3081 /* IP csum offload */
3082 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3083 m_new->m_pkthdr.csum_flags |=
3084 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3088 /* TCP/UDP csum offload */
3089 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3090 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3091 m_new->m_pkthdr.csum_flags |=
3092 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3093 m_new->m_pkthdr.csum_data = 0xffff;
3094 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3102 * As of this writing (Oct 28th, 2016), the host side will turn
3103 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3104 * the do_lro setting here is actually _not_ accurate. We
3105 * depend on the RSS hash type check to reset do_lro.
3107 if ((info->csum_info &
3108 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3109 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3112 const struct ether_header *eh;
3117 /* Checked at the beginning of this function. */
3118 KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
3120 eh = mtod(m_new, struct ether_header *);
3121 etype = ntohs(eh->ether_type);
3122 if (etype == ETHERTYPE_VLAN) {
3123 const struct ether_vlan_header *evl;
3125 hoff = sizeof(*evl);
3126 if (m_new->m_len < hoff)
3128 evl = mtod(m_new, struct ether_vlan_header *);
3129 etype = ntohs(evl->evl_proto);
3132 if (etype == ETHERTYPE_IP) {
3135 pr = hn_check_iplen(m_new, hoff);
3136 if (pr == IPPROTO_TCP) {
3138 (rxr->hn_trust_hcsum &
3139 HN_TRUST_HCSUM_TCP)) {
3140 rxr->hn_csum_trusted++;
3141 m_new->m_pkthdr.csum_flags |=
3142 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3143 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3144 m_new->m_pkthdr.csum_data = 0xffff;
3147 } else if (pr == IPPROTO_UDP) {
3149 (rxr->hn_trust_hcsum &
3150 HN_TRUST_HCSUM_UDP)) {
3151 rxr->hn_csum_trusted++;
3152 m_new->m_pkthdr.csum_flags |=
3153 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3154 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3155 m_new->m_pkthdr.csum_data = 0xffff;
3157 } else if (pr != IPPROTO_DONE && do_csum &&
3158 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3159 rxr->hn_csum_trusted++;
3160 m_new->m_pkthdr.csum_flags |=
3161 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3166 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3167 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3168 NDIS_VLAN_INFO_ID(info->vlan_info),
3169 NDIS_VLAN_INFO_PRI(info->vlan_info),
3170 NDIS_VLAN_INFO_CFI(info->vlan_info));
3171 m_new->m_flags |= M_VLANTAG;
3175 * If the VF is activated (transparent/non-transparent mode does not matter here):
3178 * - Don't setup mbuf hash, if 'options RSS' is set.
3180 * In Azure, when VF is activated, TCP SYN and SYN|ACK go
3181 * through hn(4) while the rest of the segments and ACKs belonging
3182 * to the same TCP 4-tuple go through the VF. So don't setup
3183 * mbuf hash, if a VF is activated and 'options RSS' is not
3184 * enabled. hn(4) and the VF may use neither the same RSS
3185 * hash key nor the same RSS hash function, so the hash value
3186 * for packets belonging to the same flow could be different!
3190 * hn(4) will only receive broadcast packets, multicast packets,
3191 * TCP SYN and SYN|ACK (in Azure); LRO is useless for these packet types.
3194 * For non-transparent, we definitely _cannot_ enable LRO at
3195 * all, since the LRO flush will use hn(4) as the receiving
3196 * interface; i.e. hn_ifp->if_input(hn_ifp, m).
3198 if (hn_ifp != ifp || (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF)) {
3199 do_lro = 0; /* disable LRO. */
3201 goto skip_hash; /* skip mbuf hash setup */
3205 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3207 m_new->m_pkthdr.flowid = info->hash_value;
3208 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3209 NDIS_HASH_FUNCTION_TOEPLITZ) {
3210 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
3214 * do_lro is reset, if the hash types are not TCP
3215 * related. See the comment in the csum_flags setup section above.
3219 case NDIS_HASH_IPV4:
3220 hash_type = M_HASHTYPE_RSS_IPV4;
3224 case NDIS_HASH_TCP_IPV4:
3225 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3228 case NDIS_HASH_IPV6:
3229 hash_type = M_HASHTYPE_RSS_IPV6;
3233 case NDIS_HASH_IPV6_EX:
3234 hash_type = M_HASHTYPE_RSS_IPV6_EX;
3238 case NDIS_HASH_TCP_IPV6:
3239 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3242 case NDIS_HASH_TCP_IPV6_EX:
3243 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3248 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3250 M_HASHTYPE_SET(m_new, hash_type);
3255 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3256 if (hn_ifp != ifp) {
3257 const struct ether_header *eh;
3260 * Non-transparent mode VF is activated.
3264 * Allow tapping on hn(4).
3266 ETHER_BPF_MTAP(hn_ifp, m_new);
3269 * Update hn(4)'s stats.
3271 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3272 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3273 /* Checked at the beginning of this function. */
3274 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3275 eh = mtod(m_new, struct ether_header *);
3276 if (ETHER_IS_MULTICAST(eh->ether_dhost))
3277 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3281 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3282 #if defined(INET) || defined(INET6)
3283 struct lro_ctrl *lro = &rxr->hn_lro;
3286 rxr->hn_lro_tried++;
3287 if (hn_lro_rx(lro, m_new) == 0) {
3294 ifp->if_input(ifp, m_new);
3300 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3302 struct hn_softc *sc = ifp->if_softc;
3303 struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3304 struct ifnet *vf_ifp;
3305 int mask, error = 0;
3306 struct ifrsskey *ifrk;
3307 struct ifrsshash *ifrh;
3311 if (ifr->ifr_mtu > HN_MTU_MAX) {
3318 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3323 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3324 /* Can't change MTU */
3330 if (ifp->if_mtu == ifr->ifr_mtu) {
3335 if (hn_xpnt_vf_isready(sc)) {
3336 vf_ifp = sc->hn_vf_ifp;
3338 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3339 sizeof(ifr_vf.ifr_name));
3340 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3344 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3345 vf_ifp->if_xname, ifr->ifr_mtu, error);
3351 * Suspend this interface before the synthetic parts are detached.
3357 * Detach the synthetic parts, i.e. NVS and RNDIS.
3359 hn_synth_detach(sc);
3362 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3363 * with the new MTU setting.
3365 error = hn_synth_attach(sc, ifr->ifr_mtu);
3372 * Commit the requested MTU, after the synthetic parts
3373 * have been successfully attached.
3375 ifp->if_mtu = ifr->ifr_mtu;
3378 * Synthetic parts' reattach may change the chimney
3379 * sending size; update it.
3381 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3382 hn_set_chim_size(sc, sc->hn_chim_szmax);
3385 * Make sure that various parameters based on MTU are
3386 * still valid, after the MTU change.
3388 hn_mtu_change_fixup(sc);
3391 * All done! Resume the interface now.
3395 if ((sc->hn_flags & HN_FLAG_RXVF) ||
3396 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3398 * Since we have reattached the NVS part,
3399 * change the datapath to VF again, in case
3400 * it was lost when the NVS was detached.
3402 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
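/*
 * MTU-change sequence summary (as implemented above): suspend the
 * interface -> hn_synth_detach() -> hn_synth_attach() with the new
 * MTU -> fix up the chimney size and other MTU-derived parameters ->
 * resume -> restore the VF datapath if a VF was active.
 */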
3411 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3416 if (hn_xpnt_vf_isready(sc))
3417 hn_xpnt_vf_saveifflags(sc);
3419 if (ifp->if_flags & IFF_UP) {
3420 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3422 * Caller might hold a mutex, e.g.
3423 * bpf; use busy-wait for the RNDIS filter update.
3427 hn_rxfilter_config(sc);
3430 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3431 error = hn_xpnt_vf_iocsetflags(sc);
3436 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3439 sc->hn_if_flags = ifp->if_flags;
3447 if (hn_xpnt_vf_isready(sc)) {
3449 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3450 sizeof(ifr_vf.ifr_name));
3451 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3457 * Fix up requested capabilities w/ supported capabilities,
3458 * since the supported capabilities could have been changed.
3460 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3463 if (mask & IFCAP_TXCSUM) {
3464 ifp->if_capenable ^= IFCAP_TXCSUM;
3465 if (ifp->if_capenable & IFCAP_TXCSUM)
3466 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3468 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3470 if (mask & IFCAP_TXCSUM_IPV6) {
3471 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3472 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3473 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3475 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3478 /* TODO: flip RNDIS offload parameters for RXCSUM. */
3479 if (mask & IFCAP_RXCSUM)
3480 ifp->if_capenable ^= IFCAP_RXCSUM;
3482 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
3483 if (mask & IFCAP_RXCSUM_IPV6)
3484 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3487 if (mask & IFCAP_LRO)
3488 ifp->if_capenable ^= IFCAP_LRO;
3490 if (mask & IFCAP_TSO4) {
3491 ifp->if_capenable ^= IFCAP_TSO4;
3492 if (ifp->if_capenable & IFCAP_TSO4)
3493 ifp->if_hwassist |= CSUM_IP_TSO;
3495 ifp->if_hwassist &= ~CSUM_IP_TSO;
3497 if (mask & IFCAP_TSO6) {
3498 ifp->if_capenable ^= IFCAP_TSO6;
3499 if (ifp->if_capenable & IFCAP_TSO6)
3500 ifp->if_hwassist |= CSUM_IP6_TSO;
3502 ifp->if_hwassist &= ~CSUM_IP6_TSO;
3512 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3516 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3518 * Multicast uses mutex; use busy-wait for the RNDIS filter update.
3522 hn_rxfilter_config(sc);
3526 /* XXX vlan(4) style mcast addr maintenance */
3527 if (hn_xpnt_vf_isready(sc)) {
3530 old_if_flags = sc->hn_vf_ifp->if_flags;
3531 hn_xpnt_vf_saveifflags(sc);
3533 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3534 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3536 error = hn_xpnt_vf_iocsetflags(sc);
3545 if (hn_xpnt_vf_isready(sc)) {
3547 * SIOCGIFMEDIA expects ifmediareq, so don't
3548 * create and pass ifr_vf to the VF here; just
3549 * replace the ifr_name.
3551 vf_ifp = sc->hn_vf_ifp;
3552 strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3553 sizeof(ifr->ifr_name));
3554 error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3555 /* Restore the ifr_name. */
3556 strlcpy(ifr->ifr_name, ifp->if_xname,
3557 sizeof(ifr->ifr_name));
3562 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3565 case SIOCGIFRSSHASH:
3566 ifrh = (struct ifrsshash *)data;
3568 if (sc->hn_rx_ring_inuse == 1) {
3570 ifrh->ifrh_func = RSS_FUNC_NONE;
3571 ifrh->ifrh_types = 0;
3575 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3576 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3578 ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3580 ifrh->ifrh_types = 0;
3581 if (sc->hn_rss_hash & NDIS_HASH_IPV4)
3582 ifrh->ifrh_types |= RSS_TYPE_IPV4;
3583 if (sc->hn_rss_hash & NDIS_HASH_TCP_IPV4)
3584 ifrh->ifrh_types |= RSS_TYPE_TCP_IPV4;
3585 if (sc->hn_rss_hash & NDIS_HASH_IPV6)
3586 ifrh->ifrh_types |= RSS_TYPE_IPV6;
3587 if (sc->hn_rss_hash & NDIS_HASH_IPV6_EX)
3588 ifrh->ifrh_types |= RSS_TYPE_IPV6_EX;
3589 if (sc->hn_rss_hash & NDIS_HASH_TCP_IPV6)
3590 ifrh->ifrh_types |= RSS_TYPE_TCP_IPV6;
3591 if (sc->hn_rss_hash & NDIS_HASH_TCP_IPV6_EX)
3592 ifrh->ifrh_types |= RSS_TYPE_TCP_IPV6_EX;
3597 ifrk = (struct ifrsskey *)data;
3599 if (sc->hn_rx_ring_inuse == 1) {
3601 ifrk->ifrk_func = RSS_FUNC_NONE;
3602 ifrk->ifrk_keylen = 0;
3605 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3606 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3608 ifrk->ifrk_func = RSS_FUNC_PRIVATE;
3609 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
3610 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
3611 NDIS_HASH_KEYSIZE_TOEPLITZ);
3616 error = ether_ioctl(ifp, cmd, data);
3623 hn_stop(struct hn_softc *sc, bool detaching)
3625 struct ifnet *ifp = sc->hn_ifp;
3630 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
3631 ("synthetic parts were not attached"));
3633 /* Clear RUNNING bit ASAP. */
3634 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3636 /* Disable polling. */
3639 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
3640 KASSERT(sc->hn_vf_ifp != NULL,
3641 ("%s: VF is not attached", ifp->if_xname));
3643 /* Mark transparent mode VF as disabled. */
3644 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
3648 * Datapath setting must happen _before_ bringing
3651 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
3654 * Bring the VF down.
3656 hn_xpnt_vf_saveifflags(sc);
3657 sc->hn_vf_ifp->if_flags &= ~IFF_UP;
3658 hn_xpnt_vf_iocsetflags(sc);
3661 /* Suspend data transfers. */
3662 hn_suspend_data(sc);
3664 /* Clear OACTIVE bit. */
3665 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3666 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3667 sc->hn_tx_ring[i].hn_oactive = 0;
3670 * If the non-transparent mode VF is active, make sure
3671 * that the RX filter still allows packet reception.
3673 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
3674 hn_rxfilter_config(sc);
3678 hn_init_locked(struct hn_softc *sc)
3680 struct ifnet *ifp = sc->hn_ifp;
3685 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
3688 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3691 /* Configure RX filter */
3692 hn_rxfilter_config(sc);
3694 /* Clear OACTIVE bit. */
3695 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3696 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3697 sc->hn_tx_ring[i].hn_oactive = 0;
3699 /* Clear TX 'suspended' bit. */
3700 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
3702 if (hn_xpnt_vf_isready(sc)) {
3703 /* Initialize transparent VF. */
3704 hn_xpnt_vf_init(sc);
3707 /* Everything is ready; unleash! */
3708 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3710 /* Re-enable polling if requested. */
3711 if (sc->hn_pollhz > 0)
3712 hn_polling(sc, sc->hn_pollhz);
3718 struct hn_softc *sc = xsc;
3725 #if __FreeBSD_version >= 1100099
3728 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
3730 struct hn_softc *sc = arg1;
3731 unsigned int lenlim;
3734 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
3735 error = sysctl_handle_int(oidp, &lenlim, 0, req);
3736 if (error || req->newptr == NULL)
3740 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
3741 lenlim > TCP_LRO_LENGTH_MAX) {
3745 hn_set_lro_lenlim(sc, lenlim);
3752 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
3754 struct hn_softc *sc = arg1;
3755 int ackcnt, error, i;
3758 * lro_ackcnt_lim is the append count limit;
3759 * +1 turns it into the aggregation limit.
3761 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
3762 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
3763 if (error || req->newptr == NULL)
3766 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
3770 * Convert the aggregation limit back to the append count limit.
3775 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
3776 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
3784 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
3786 struct hn_softc *sc = arg1;
3791 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
3794 error = sysctl_handle_int(oidp, &on, 0, req);
3795 if (error || req->newptr == NULL)
3799 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3800 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3803 rxr->hn_trust_hcsum |= hcsum;
3805 rxr->hn_trust_hcsum &= ~hcsum;
3812 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
3814 struct hn_softc *sc = arg1;
3815 int chim_size, error;
3817 chim_size = sc->hn_tx_ring[0].hn_chim_size;
3818 error = sysctl_handle_int(oidp, &chim_size, 0, req);
3819 if (error || req->newptr == NULL)
3822 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
3826 hn_set_chim_size(sc, chim_size);
3831 #if __FreeBSD_version < 1100095
3833 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
3835 struct hn_softc *sc = arg1;
3836 int ofs = arg2, i, error;
3837 struct hn_rx_ring *rxr;
3841 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
3842 rxr = &sc->hn_rx_ring[i];
3843 stat += *((int *)((uint8_t *)rxr + ofs));
3846 error = sysctl_handle_64(oidp, &stat, 0, req);
3847 if (error || req->newptr == NULL)
3850 /* Zero out this stat. */
3851 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
3852 rxr = &sc->hn_rx_ring[i];
3853 *((int *)((uint8_t *)rxr + ofs)) = 0;
3859 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
3861 struct hn_softc *sc = arg1;
3862 int ofs = arg2, i, error;
3863 struct hn_rx_ring *rxr;
3867 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3868 rxr = &sc->hn_rx_ring[i];
3869 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
3872 error = sysctl_handle_64(oidp, &stat, 0, req);
3873 if (error || req->newptr == NULL)
3876 /* Zero out this stat. */
3877 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3878 rxr = &sc->hn_rx_ring[i];
3879 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
3887 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3889 struct hn_softc *sc = arg1;
3890 int ofs = arg2, i, error;
3891 struct hn_rx_ring *rxr;
3895 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3896 rxr = &sc->hn_rx_ring[i];
3897 stat += *((u_long *)((uint8_t *)rxr + ofs));
3900 error = sysctl_handle_long(oidp, &stat, 0, req);
3901 if (error || req->newptr == NULL)
3904 /* Zero out this stat. */
3905 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3906 rxr = &sc->hn_rx_ring[i];
3907 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
3913 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3915 struct hn_softc *sc = arg1;
3916 int ofs = arg2, i, error;
3917 struct hn_tx_ring *txr;
3921 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3922 txr = &sc->hn_tx_ring[i];
3923 stat += *((u_long *)((uint8_t *)txr + ofs));
3926 error = sysctl_handle_long(oidp, &stat, 0, req);
3927 if (error || req->newptr == NULL)
3930 /* Zero out this stat. */
3931 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3932 txr = &sc->hn_tx_ring[i];
3933 *((u_long *)((uint8_t *)txr + ofs)) = 0;
3939 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
3941 struct hn_softc *sc = arg1;
3942 int ofs = arg2, i, error, conf;
3943 struct hn_tx_ring *txr;
3945 txr = &sc->hn_tx_ring[0];
3946 conf = *((int *)((uint8_t *)txr + ofs));
3948 error = sysctl_handle_int(oidp, &conf, 0, req);
3949 if (error || req->newptr == NULL)
3953 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3954 txr = &sc->hn_tx_ring[i];
3955 *((int *)((uint8_t *)txr + ofs)) = conf;
3963 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
3965 struct hn_softc *sc = arg1;
3968 size = sc->hn_agg_size;
3969 error = sysctl_handle_int(oidp, &size, 0, req);
3970 if (error || req->newptr == NULL)
3974 sc->hn_agg_size = size;
3982 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
3984 struct hn_softc *sc = arg1;
3987 pkts = sc->hn_agg_pkts;
3988 error = sysctl_handle_int(oidp, &pkts, 0, req);
3989 if (error || req->newptr == NULL)
3993 sc->hn_agg_pkts = pkts;
4001 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4003 struct hn_softc *sc = arg1;
4006 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4007 return (sysctl_handle_int(oidp, &pkts, 0, req));
4011 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4013 struct hn_softc *sc = arg1;
4016 align = sc->hn_tx_ring[0].hn_agg_align;
4017 return (sysctl_handle_int(oidp, &align, 0, req));
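/*
 * The read/write handlers above all follow the same sysctl(9) shape;
 * a minimal sketch (names illustrative):
 *
 *	v = <current value>;
 *	error = sysctl_handle_int(oidp, &v, 0, req);
 *	if (error || req->newptr == NULL)
 *		return (error);		// error, or read-only access
 *	if (<v out of range>)
 *		return (EINVAL);
 *	<commit v, under HN_LOCK where needed>
 *	return (0);
 */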
4021 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4024 vmbus_chan_poll_disable(chan);
4026 vmbus_chan_poll_enable(chan, pollhz);
4030 hn_polling(struct hn_softc *sc, u_int pollhz)
4032 int nsubch = sc->hn_rx_ring_inuse - 1;
4037 struct vmbus_channel **subch;
4040 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4041 for (i = 0; i < nsubch; ++i)
4042 hn_chan_polling(subch[i], pollhz);
4043 vmbus_subchan_rel(subch, nsubch);
4045 hn_chan_polling(sc->hn_prichan, pollhz);
4049 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4051 struct hn_softc *sc = arg1;
4054 pollhz = sc->hn_pollhz;
4055 error = sysctl_handle_int(oidp, &pollhz, 0, req);
4056 if (error || req->newptr == NULL)
4060 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4064 if (sc->hn_pollhz != pollhz) {
4065 sc->hn_pollhz = pollhz;
4066 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4067 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4068 hn_polling(sc, sc->hn_pollhz);
4076 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4078 struct hn_softc *sc = arg1;
4081 snprintf(verstr, sizeof(verstr), "%u.%u",
4082 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4083 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4084 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4088 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4090 struct hn_softc *sc = arg1;
4097 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4098 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
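/*
 * NOTE: the "%b" conversion used here and below is the kernel
 * printf(9) bit-field format; it decodes the integer against a bit
 * description string (e.g. HN_CAP_BITS) and prints the names of the
 * bits that are set.
 */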
4102 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4104 struct hn_softc *sc = arg1;
4105 char assist_str[128];
4109 hwassist = sc->hn_ifp->if_hwassist;
4111 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4112 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4116 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4118 struct hn_softc *sc = arg1;
4119 char filter_str[128];
4123 filter = sc->hn_rx_filter;
4125 snprintf(filter_str, sizeof(filter_str), "%b", filter,
4127 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4131 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4133 struct hn_softc *sc = arg1;
4138 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4139 if (error || req->newptr == NULL)
4142 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4145 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4147 if (sc->hn_rx_ring_inuse > 1) {
4148 error = hn_rss_reconfig(sc);
4150 /* Not RSS capable, at least for now; just save the RSS key. */
4159 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4161 struct hn_softc *sc = arg1;
4166 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4167 if (error || req->newptr == NULL)
4171 * Don't allow RSS indirect table changes, if this interface is not
4172 * currently RSS capable.
4174 if (sc->hn_rx_ring_inuse == 1) {
4179 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4182 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4184 hn_rss_ind_fixup(sc);
4185 error = hn_rss_reconfig(sc);
4192 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4194 struct hn_softc *sc = arg1;
4199 hash = sc->hn_rss_hash;
4201 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4202 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4206 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4208 struct hn_softc *sc = arg1;
4209 char vf_name[IFNAMSIZ + 1];
4210 struct ifnet *vf_ifp;
4214 vf_ifp = sc->hn_vf_ifp;
4216 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4218 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4222 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4224 struct hn_softc *sc = arg1;
4225 char vf_name[IFNAMSIZ + 1];
4226 struct ifnet *vf_ifp;
4230 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4232 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4234 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4238 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4240 struct rm_priotracker pt;
4245 error = sysctl_wire_old_buffer(req, 0);
4249 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4253 rm_rlock(&hn_vfmap_lock, &pt);
4256 for (i = 0; i < hn_vfmap_size; ++i) {
4259 if (hn_vfmap[i] == NULL)
4262 ifp = ifnet_byindex(i);
4265 sbuf_printf(sb, "%s", ifp->if_xname);
4267 sbuf_printf(sb, " %s", ifp->if_xname);
4272 rm_runlock(&hn_vfmap_lock, &pt);
4274 error = sbuf_finish(sb);
4280 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4282 struct rm_priotracker pt;
4287 error = sysctl_wire_old_buffer(req, 0);
4291 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4295 rm_rlock(&hn_vfmap_lock, &pt);
4298 for (i = 0; i < hn_vfmap_size; ++i) {
4299 struct ifnet *ifp, *hn_ifp;
4301 hn_ifp = hn_vfmap[i];
4305 ifp = ifnet_byindex(i);
4308 sbuf_printf(sb, "%s:%s", ifp->if_xname,
4311 sbuf_printf(sb, " %s:%s", ifp->if_xname,
4318 rm_runlock(&hn_vfmap_lock, &pt);
4320 error = sbuf_finish(sb);
4326 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4328 struct hn_softc *sc = arg1;
4329 int error, onoff = 0;
4331 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4333 error = sysctl_handle_int(oidp, &onoff, 0, req);
4334 if (error || req->newptr == NULL)
4338 /* NOTE: hn_vf_lock for hn_transmit() */
4339 rm_wlock(&sc->hn_vf_lock);
4341 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4343 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4344 rm_wunlock(&sc->hn_vf_lock);
4351 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4353 struct hn_softc *sc = arg1;
4356 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4358 return (sysctl_handle_int(oidp, &enabled, 0, req));
4362 hn_check_iplen(const struct mbuf *m, int hoff)
4364 const struct ip *ip;
4365 int len, iphlen, iplen;
4366 const struct tcphdr *th;
4367 int thoff; /* TCP data offset */
4369 len = hoff + sizeof(struct ip);
4371 /* The packet must be at least the size of an IP header. */
4372 if (m->m_pkthdr.len < len)
4373 return IPPROTO_DONE;
4375 /* The fixed IP header must reside completely in the first mbuf. */
4377 return IPPROTO_DONE;
4379 ip = mtodo(m, hoff);
4381 /* Bound check the packet's stated IP header length. */
4382 iphlen = ip->ip_hl << 2;
4383 if (iphlen < sizeof(struct ip)) /* minimum header length */
4384 return IPPROTO_DONE;
4386 /* The full IP header must reside completely in the one mbuf. */
4387 if (m->m_len < hoff + iphlen)
4388 return IPPROTO_DONE;
4390 iplen = ntohs(ip->ip_len);
4393 * Check that the amount of data in the buffers is
4394 * at least as much as the IP header would have us expect.
4396 if (m->m_pkthdr.len < hoff + iplen)
4397 return IPPROTO_DONE;
4400 * Ignore IP fragments.
4402 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4403 return IPPROTO_DONE;
4406 * The TCP/IP or UDP/IP header must be entirely contained within
4407 * the first fragment of a packet.
4411 if (iplen < iphlen + sizeof(struct tcphdr))
4412 return IPPROTO_DONE;
4413 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4414 return IPPROTO_DONE;
4415 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4416 thoff = th->th_off << 2;
4417 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4418 return IPPROTO_DONE;
4419 if (m->m_len < hoff + iphlen + thoff)
4420 return IPPROTO_DONE;
4423 if (iplen < iphlen + sizeof(struct udphdr))
4424 return IPPROTO_DONE;
4425 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4426 return IPPROTO_DONE;
4430 return IPPROTO_DONE;
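/*
 * Usage sketch, as in hn_rxpkt() above: hn_check_iplen() returns the
 * IP protocol (e.g. IPPROTO_TCP, IPPROTO_UDP) when the headers are
 * complete and contained in the first mbuf, or IPPROTO_DONE when the
 * packet should not be trusted for checksum/LRO purposes:
 *
 *	pr = hn_check_iplen(m_new, hoff);
 *	if (pr == IPPROTO_TCP) {
 *		// optionally trust the host's TCP checksum verification
 *	}
 */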
4437 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4439 struct sysctl_oid_list *child;
4440 struct sysctl_ctx_list *ctx;
4441 device_t dev = sc->hn_dev;
4442 #if defined(INET) || defined(INET6)
4443 #if __FreeBSD_version >= 1100095
4450 * Create RXBUF for reception.
4453 * - It is shared by all channels.
4454 * - A large enough buffer is allocated; certain versions of the NVS
4455 * may further limit the usable space.
4457 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4458 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4459 BUS_DMA_WAITOK | BUS_DMA_ZERO);
4460 if (sc->hn_rxbuf == NULL) {
4461 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4465 sc->hn_rx_ring_cnt = ring_cnt;
4466 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4468 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4469 M_DEVBUF, M_WAITOK | M_ZERO);
4471 #if defined(INET) || defined(INET6)
4472 #if __FreeBSD_version >= 1100095
4473 lroent_cnt = hn_lro_entry_count;
4474 if (lroent_cnt < TCP_LRO_ENTRIES)
4475 lroent_cnt = TCP_LRO_ENTRIES;
4477 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4479 #endif /* INET || INET6 */
4481 ctx = device_get_sysctl_ctx(dev);
4482 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4484 /* Create dev.hn.UNIT.rx sysctl tree */
4485 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4486 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4488 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4489 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4491 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4492 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4493 &rxr->hn_br_dma, BUS_DMA_WAITOK);
4494 if (rxr->hn_br == NULL) {
4495 device_printf(dev, "allocate bufring failed\n");
4499 if (hn_trust_hosttcp)
4500 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4501 if (hn_trust_hostudp)
4502 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4503 if (hn_trust_hostip)
4504 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4505 rxr->hn_ifp = sc->hn_ifp;
4506 if (i < sc->hn_tx_ring_cnt)
4507 rxr->hn_txr = &sc->hn_tx_ring[i];
4508 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4509 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4511 rxr->hn_rxbuf = sc->hn_rxbuf;
4516 #if defined(INET) || defined(INET6)
4517 #if __FreeBSD_version >= 1100095
4518 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4519 hn_lro_mbufq_depth);
4521 tcp_lro_init(&rxr->hn_lro);
4522 rxr->hn_lro.ifp = sc->hn_ifp;
4524 #if __FreeBSD_version >= 1100099
4525 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4526 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4528 #endif /* INET || INET6 */
4530 if (sc->hn_rx_sysctl_tree != NULL) {
4534 * Create per RX ring sysctl tree:
4535 * dev.hn.UNIT.rx.RINGID
4537 snprintf(name, sizeof(name), "%d", i);
4538 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
4539 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
4540 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4542 if (rxr->hn_rx_sysctl_tree != NULL) {
4543 SYSCTL_ADD_ULONG(ctx,
4544 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4545 OID_AUTO, "packets", CTLFLAG_RW,
4546 &rxr->hn_pkts, "# of packets received");
4547 SYSCTL_ADD_ULONG(ctx,
4548 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4549 OID_AUTO, "rss_pkts", CTLFLAG_RW,
4551 "# of packets w/ RSS info received");
4553 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4554 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
4555 &rxr->hn_pktbuf_len, 0,
4556 "Temporary channel packet buffer length");
4561 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
4562 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4563 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
4564 #if __FreeBSD_version < 1100095
4565 hn_rx_stat_int_sysctl,
4567 hn_rx_stat_u64_sysctl,
4569 "LU", "LRO queued");
4570 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
4571 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4572 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
4573 #if __FreeBSD_version < 1100095
4574 hn_rx_stat_int_sysctl,
4576 hn_rx_stat_u64_sysctl,
4578 "LU", "LRO flushed");
4579 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
4580 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4581 __offsetof(struct hn_rx_ring, hn_lro_tried),
4582 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
4583 #if __FreeBSD_version >= 1100099
4584 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
4585 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4586 hn_lro_lenlim_sysctl, "IU",
4587 "Max # of data bytes to be aggregated by LRO");
4588 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
4589 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4590 hn_lro_ackcnt_sysctl, "I",
4591 "Max # of ACKs to be aggregated by LRO");
4593 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
4594 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
4595 hn_trust_hcsum_sysctl, "I",
4596 "Trust tcp segement verification on host side, "
4597 "when csum info is missing");
4598 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
4599 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
4600 hn_trust_hcsum_sysctl, "I",
4601 "Trust udp datagram verification on host side, "
4602 "when csum info is missing");
4603 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
4604 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
4605 hn_trust_hcsum_sysctl, "I",
4606 "Trust ip packet verification on host side, "
4607 "when csum info is missing");
4608 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
4609 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4610 __offsetof(struct hn_rx_ring, hn_csum_ip),
4611 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
4612 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
4613 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4614 __offsetof(struct hn_rx_ring, hn_csum_tcp),
4615 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
4616 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
4617 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4618 __offsetof(struct hn_rx_ring, hn_csum_udp),
4619 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
4620 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
4621 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4622 __offsetof(struct hn_rx_ring, hn_csum_trusted),
4623 hn_rx_stat_ulong_sysctl, "LU",
4624 "# of packets that we trust host's csum verification");
4625 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
4626 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4627 __offsetof(struct hn_rx_ring, hn_small_pkts),
4628 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
4629 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
4630 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4631 __offsetof(struct hn_rx_ring, hn_ack_failed),
4632 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
4633 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
4634 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
4635 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
4636 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
4642 hn_destroy_rx_data(struct hn_softc *sc)
4646 if (sc->hn_rxbuf != NULL) {
4647 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
4648 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
4650 device_printf(sc->hn_dev, "RXBUF is referenced\n");
4651 sc->hn_rxbuf = NULL;
4654 if (sc->hn_rx_ring_cnt == 0)
4657 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4658 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4660 if (rxr->hn_br == NULL)
4662 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
4663 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
4665 device_printf(sc->hn_dev,
4666 "%dth channel bufring is referenced", i);
4670 #if defined(INET) || defined(INET6)
4671 tcp_lro_free(&rxr->hn_lro);
4673 free(rxr->hn_pktbuf, M_DEVBUF);
4675 free(sc->hn_rx_ring, M_DEVBUF);
4676 sc->hn_rx_ring = NULL;
4678 sc->hn_rx_ring_cnt = 0;
4679 sc->hn_rx_ring_inuse = 0;
4683 hn_tx_ring_create(struct hn_softc *sc, int id)
4685 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
4686 device_t dev = sc->hn_dev;
4687 bus_dma_tag_t parent_dtag;
4691 txr->hn_tx_idx = id;
4693 #ifndef HN_USE_TXDESC_BUFRING
4694 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
4696 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
4698 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
4699 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
4700 M_DEVBUF, M_WAITOK | M_ZERO);
4701 #ifndef HN_USE_TXDESC_BUFRING
4702 SLIST_INIT(&txr->hn_txlist);
4704 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
4705 M_WAITOK, &txr->hn_tx_lock);
4708 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
4709 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
4710 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
4712 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
4715 #ifdef HN_IFSTART_SUPPORT
4716 if (hn_use_if_start) {
4717 txr->hn_txeof = hn_start_txeof;
4718 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
4719 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
4725 txr->hn_txeof = hn_xmit_txeof;
4726 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
4727 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
4729 br_depth = hn_get_txswq_depth(txr);
4730 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
4731 M_WAITOK, &txr->hn_tx_lock);
4734 txr->hn_direct_tx_size = hn_direct_tx_size;
4737 * Always schedule transmission instead of trying to do direct
4738 * transmission. This gives the best performance so far.
4740 txr->hn_sched_tx = 1;
4742 parent_dtag = bus_get_dma_tag(dev);
4744 /* DMA tag for RNDIS packet messages. */
4745 error = bus_dma_tag_create(parent_dtag, /* parent */
4746 HN_RNDIS_PKT_ALIGN, /* alignment */
4747 HN_RNDIS_PKT_BOUNDARY, /* boundary */
4748 BUS_SPACE_MAXADDR, /* lowaddr */
4749 BUS_SPACE_MAXADDR, /* highaddr */
4750 NULL, NULL, /* filter, filterarg */
4751 HN_RNDIS_PKT_LEN, /* maxsize */
4753 HN_RNDIS_PKT_LEN, /* maxsegsize */
4755 NULL, /* lockfunc */
4756 NULL, /* lockfuncarg */
4757 &txr->hn_tx_rndis_dtag);
4759 device_printf(dev, "failed to create rndis dmatag\n");
4763 /* DMA tag for data. */
4764 error = bus_dma_tag_create(parent_dtag, /* parent */
4766 HN_TX_DATA_BOUNDARY, /* boundary */
4767 BUS_SPACE_MAXADDR, /* lowaddr */
4768 BUS_SPACE_MAXADDR, /* highaddr */
4769 NULL, NULL, /* filter, filterarg */
4770 HN_TX_DATA_MAXSIZE, /* maxsize */
4771 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
4772 HN_TX_DATA_SEGSIZE, /* maxsegsize */
4774 NULL, /* lockfunc */
4775 NULL, /* lockfuncarg */
4776 &txr->hn_tx_data_dtag);
4778 device_printf(dev, "failed to create data dmatag\n");
4782 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
4783 struct hn_txdesc *txd = &txr->hn_txdesc[i];
4786 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
4787 STAILQ_INIT(&txd->agg_list);
4790 * Allocate and load RNDIS packet message.
4792 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
4793 (void **)&txd->rndis_pkt,
4794 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
4795 &txd->rndis_pkt_dmap);
4798 "failed to allocate rndis_packet_msg, %d\n", i);
4802 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
4803 txd->rndis_pkt_dmap,
4804 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
4805 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
4809 "failed to load rndis_packet_msg, %d\n", i);
4810 bus_dmamem_free(txr->hn_tx_rndis_dtag,
4811 txd->rndis_pkt, txd->rndis_pkt_dmap);
4815 /* DMA map for TX data. */
4816 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
4820 "failed to allocate tx data dmamap\n");
4821 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
4822 txd->rndis_pkt_dmap);
4823 bus_dmamem_free(txr->hn_tx_rndis_dtag,
4824 txd->rndis_pkt, txd->rndis_pkt_dmap);
4828 /* All set; put it on the list. */
4829 txd->flags |= HN_TXD_FLAG_ONLIST;
4830 #ifndef HN_USE_TXDESC_BUFRING
4831 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
4833 buf_ring_enqueue(txr->hn_txdesc_br, txd);
4836 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
4838 if (sc->hn_tx_sysctl_tree != NULL) {
4839 struct sysctl_oid_list *child;
4840 struct sysctl_ctx_list *ctx;
4844 * Create per TX ring sysctl tree:
4845 * dev.hn.UNIT.tx.RINGID
4847 ctx = device_get_sysctl_ctx(dev);
4848 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
4850 snprintf(name, sizeof(name), "%d", id);
4851 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
4852 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4854 if (txr->hn_tx_sysctl_tree != NULL) {
4855 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
4858 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
4859 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
4860 "# of available TX descs");
4862 #ifdef HN_IFSTART_SUPPORT
4863 if (!hn_use_if_start)
4866 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
4867 CTLFLAG_RD, &txr->hn_oactive, 0,
4870 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
4871 CTLFLAG_RW, &txr->hn_pkts,
4872 "# of packets transmitted");
4873 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
4874 CTLFLAG_RW, &txr->hn_sends, "# of sends");
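/*
 * Release the busdma resources held by a single TX descriptor:
 * unload and free the RNDIS packet message, and destroy the data
 * DMA map.
 */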
4882 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
4884 struct hn_tx_ring *txr = txd->txr;
4886 KASSERT(txd->m == NULL, ("still has mbuf installed"));
4887 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
4889 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
4890 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
4891 txd->rndis_pkt_dmap);
4892 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
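/*
 * Garbage-collect a TX descriptor whose send-done will never be
 * delivered, e.g. because the channel has been revoked.  Aggregated
 * txds are skipped here; they are freed by their aggregating txd.
 */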
4896 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
4899 KASSERT(txd->refs == 0 || txd->refs == 1,
4900 ("invalid txd refs %d", txd->refs));
4902 /* Aggregated txds will be freed by their aggregating txd. */
4903 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
4906 freed = hn_txdesc_put(txr, txd);
4907 KASSERT(freed, ("can't free txdesc"));
4912 hn_tx_ring_destroy(struct hn_tx_ring *txr)
4916 if (txr->hn_txdesc == NULL)
4921 * Because the freeing of aggregated txds will be deferred
4922 * to the aggregating txd, two passes are used here:
4923 * - The first pass GCes any pending txds. This GC is necessary,
4924 * since if the channels are revoked, hypervisor will not
4925 * deliver send-done for all pending txds.
4926 * - The second pass frees the busdma stuffs, i.e. after all txds
4929 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
4930 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
4931 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
4932 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
4934 if (txr->hn_tx_data_dtag != NULL)
4935 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
4936 if (txr->hn_tx_rndis_dtag != NULL)
4937 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
4939 #ifdef HN_USE_TXDESC_BUFRING
4940 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
4943 free(txr->hn_txdesc, M_DEVBUF);
4944 txr->hn_txdesc = NULL;
4946 if (txr->hn_mbuf_br != NULL)
4947 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
4949 #ifndef HN_USE_TXDESC_BUFRING
4950 mtx_destroy(&txr->hn_txlist_spin);
4952 mtx_destroy(&txr->hn_tx_lock);
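/*
 * Allocate the chimney sending buffer (TXBUF), which is shared by
 * all channels, create ring_cnt TX rings, and attach the global TX
 * statistics and tuning sysctls under dev.hn.UNIT.
 */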
4956 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
4958 struct sysctl_oid_list *child;
4959 struct sysctl_ctx_list *ctx;
4963 * Create TXBUF for chimney sending.
4965 * NOTE: It is shared by all channels.
4967 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
4968 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
4969 BUS_DMA_WAITOK | BUS_DMA_ZERO);
4970 if (sc->hn_chim == NULL) {
4971 device_printf(sc->hn_dev, "allocate txbuf failed\n");
4975 sc->hn_tx_ring_cnt = ring_cnt;
4976 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4978 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
4979 M_DEVBUF, M_WAITOK | M_ZERO);
4981 ctx = device_get_sysctl_ctx(sc->hn_dev);
4982 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
4984 /* Create dev.hn.UNIT.tx sysctl tree */
4985 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
4986 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4988 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4991 error = hn_tx_ring_create(sc, i);
4996 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
4997 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4998 __offsetof(struct hn_tx_ring, hn_no_txdescs),
4999 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5000 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5001 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5002 __offsetof(struct hn_tx_ring, hn_send_failed),
5003 hn_tx_stat_ulong_sysctl, "LU", "# of Hyper-V send failures");
5004 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5005 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5006 __offsetof(struct hn_tx_ring, hn_txdma_failed),
5007 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
5008 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5009 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5010 __offsetof(struct hn_tx_ring, hn_flush_failed),
5011 hn_tx_stat_ulong_sysctl, "LU",
5012 "# of packet transmission aggregation flush failure");
5013 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5014 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5015 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5016 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbufs collapsed");
5017 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5018 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5019 __offsetof(struct hn_tx_ring, hn_tx_chimney),
5020 hn_tx_stat_ulong_sysctl, "LU", "# of chimney sends");
5021 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5022 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5023 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5024 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5025 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5026 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5027 "# of total TX descs");
5028 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5029 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5030 "Chimney send packet size upper boundary");
5031 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5032 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5033 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5034 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5035 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5036 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5037 hn_tx_conf_int_sysctl, "I",
5038 "Size of the packet for direct transmission");
5039 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5040 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5041 __offsetof(struct hn_tx_ring, hn_sched_tx),
5042 hn_tx_conf_int_sysctl, "I",
5043 "Always schedule transmission "
5044 "instead of doing direct transmission");
5045 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5046 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5047 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5048 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5049 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5050 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5051 "Applied packet transmission aggregation size");
5052 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5053 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5054 hn_txagg_pktmax_sysctl, "I",
5055 "Applied packet transmission aggregation packets");
5056 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5057 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5058 hn_txagg_align_sysctl, "I",
5059 "Applied packet transmission aggregation alignment");
5065 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5069 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5070 sc->hn_tx_ring[i].hn_chim_size = chim_size;
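/*
 * Recompute the advertised TSO size limit: at least hn_ndis_tso_sgmin
 * full-MTU segments, at most IP_MAXPACKET and the NDIS maximum, minus
 * the Ethernet and VLAN header bytes.  E.g. with sgmin 2 and a 1500
 * byte MTU, the lower bound works out to 3000 bytes.
 */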
5074 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5076 struct ifnet *ifp = sc->hn_ifp;
5082 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5085 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5086 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5087 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5089 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5090 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5091 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5093 if (tso_maxlen < tso_minlen)
5094 tso_maxlen = tso_minlen;
5095 else if (tso_maxlen > IP_MAXPACKET)
5096 tso_maxlen = IP_MAXPACKET;
5097 if (tso_maxlen > sc->hn_ndis_tso_szmax)
5098 tso_maxlen = sc->hn_ndis_tso_szmax;
5099 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
5101 if (hn_xpnt_vf_isready(sc)) {
5102 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5103 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5105 ifp->if_hw_tsomax = hw_tsomax;
5107 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
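/*
 * Fix up TX settings after the synthetic parts are attached: apply
 * the chimney size limit, and translate the negotiated HN_CAP_*
 * checksum capabilities into per-ring CSUM_* assist flags.
 */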
5111 hn_fixup_tx_data(struct hn_softc *sc)
5113 uint64_t csum_assist;
5116 hn_set_chim_size(sc, sc->hn_chim_szmax);
5117 if (hn_tx_chimney_size > 0 &&
5118 hn_tx_chimney_size < sc->hn_chim_szmax)
5119 hn_set_chim_size(sc, hn_tx_chimney_size);
5122 if (sc->hn_caps & HN_CAP_IPCS)
5123 csum_assist |= CSUM_IP;
5124 if (sc->hn_caps & HN_CAP_TCP4CS)
5125 csum_assist |= CSUM_IP_TCP;
5126 if (sc->hn_caps & HN_CAP_UDP4CS)
5127 csum_assist |= CSUM_IP_UDP;
5128 if (sc->hn_caps & HN_CAP_TCP6CS)
5129 csum_assist |= CSUM_IP6_TCP;
5130 if (sc->hn_caps & HN_CAP_UDP6CS)
5131 csum_assist |= CSUM_IP6_UDP;
5132 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5133 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
5135 if (sc->hn_caps & HN_CAP_HASHVAL) {
5137 * Support HASHVAL pktinfo on TX path.
5140 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5141 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5142 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5147 hn_destroy_tx_data(struct hn_softc *sc)
5151 if (sc->hn_chim != NULL) {
5152 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5153 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5155 device_printf(sc->hn_dev,
5156 "chimney sending buffer is referenced");
5161 if (sc->hn_tx_ring_cnt == 0)
5164 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5165 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5167 free(sc->hn_tx_ring, M_DEVBUF);
5168 sc->hn_tx_ring = NULL;
5170 sc->hn_tx_ring_cnt = 0;
5171 sc->hn_tx_ring_inuse = 0;
5174 #ifdef HN_IFSTART_SUPPORT
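/*
 * Legacy if_start support.  Only the first TX ring is used in this
 * mode; see the KASSERTs in hn_start_locked() and hn_start_txeof().
 */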
5177 hn_start_taskfunc(void *xtxr, int pending __unused)
5179 struct hn_tx_ring *txr = xtxr;
5181 mtx_lock(&txr->hn_tx_lock);
5182 hn_start_locked(txr, 0);
5183 mtx_unlock(&txr->hn_tx_lock);
5187 hn_start_locked(struct hn_tx_ring *txr, int len)
5189 struct hn_softc *sc = txr->hn_sc;
5190 struct ifnet *ifp = sc->hn_ifp;
5193 KASSERT(hn_use_if_start,
5194 ("hn_start_locked is called, when if_start is disabled"));
5195 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5196 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5197 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5199 if (__predict_false(txr->hn_suspended))
5202 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5206 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5207 struct hn_txdesc *txd;
5208 struct mbuf *m_head;
5211 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5215 if (len > 0 && m_head->m_pkthdr.len > len) {
5217 * This send could be time consuming; let the caller
5218 * dispatch this packet (and any follow-up packets)
5219 * to the TX taskqueue.
5221 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5226 #if defined(INET6) || defined(INET)
5227 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5228 m_head = hn_tso_fixup(m_head);
5229 if (__predict_false(m_head == NULL)) {
5230 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5236 txd = hn_txdesc_get(txr);
5238 txr->hn_no_txdescs++;
5239 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5240 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5244 error = hn_encap(ifp, txr, txd, &m_head);
5246 /* Both txd and m_head are freed */
5247 KASSERT(txr->hn_agg_txd == NULL,
5248 ("encap failed w/ pending aggregating txdesc"));
5252 if (txr->hn_agg_pktleft == 0) {
5253 if (txr->hn_agg_txd != NULL) {
5254 KASSERT(m_head == NULL,
5255 ("pending mbuf for aggregating txdesc"));
5256 error = hn_flush_txagg(ifp, txr);
5257 if (__predict_false(error)) {
5258 atomic_set_int(&ifp->if_drv_flags,
5263 KASSERT(m_head != NULL, ("mbuf was freed"));
5264 error = hn_txpkt(ifp, txr, txd);
5265 if (__predict_false(error)) {
5266 /* txd is freed, but m_head is not */
5267 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5268 atomic_set_int(&ifp->if_drv_flags,
5276 KASSERT(txr->hn_agg_txd != NULL,
5277 ("no aggregating txdesc"));
5278 KASSERT(m_head == NULL,
5279 ("pending mbuf for aggregating txdesc"));
5284 /* Flush pending aggregated transmission. */
5285 if (txr->hn_agg_txd != NULL)
5286 hn_flush_txagg(ifp, txr);
5291 hn_start(struct ifnet *ifp)
5293 struct hn_softc *sc = ifp->if_softc;
5294 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5296 if (txr->hn_sched_tx)
5299 if (mtx_trylock(&txr->hn_tx_lock)) {
5302 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5303 mtx_unlock(&txr->hn_tx_lock);
5308 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5312 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5314 struct hn_tx_ring *txr = xtxr;
5316 mtx_lock(&txr->hn_tx_lock);
5317 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5318 hn_start_locked(txr, 0);
5319 mtx_unlock(&txr->hn_tx_lock);
5323 hn_start_txeof(struct hn_tx_ring *txr)
5325 struct hn_softc *sc = txr->hn_sc;
5326 struct ifnet *ifp = sc->hn_ifp;
5328 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5330 if (txr->hn_sched_tx)
5333 if (mtx_trylock(&txr->hn_tx_lock)) {
5336 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5337 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5338 mtx_unlock(&txr->hn_tx_lock);
5340 taskqueue_enqueue(txr->hn_tx_taskq,
5346 * Release the OACTIVE earlier, with the hope, that
5347 * others could catch up. The task will clear the
5348 * flag again with the hn_tx_lock to avoid possible
5351 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5352 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5356 #endif /* HN_IFSTART_SUPPORT */
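/*
 * Multiqueue (if_transmit) TX path.  Drain the per-ring mbuf
 * buf_ring; drbr_peek()/drbr_advance()/drbr_putback() are used so
 * that a packet which could not be sent stays at the ring head for
 * a later retry.
 */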
5359 hn_xmit(struct hn_tx_ring *txr, int len)
5361 struct hn_softc *sc = txr->hn_sc;
5362 struct ifnet *ifp = sc->hn_ifp;
5363 struct mbuf *m_head;
5366 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5367 #ifdef HN_IFSTART_SUPPORT
5368 KASSERT(hn_use_if_start == 0,
5369 ("hn_xmit is called, when if_start is enabled"));
5371 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5373 if (__predict_false(txr->hn_suspended))
5376 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5379 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5380 struct hn_txdesc *txd;
5383 if (len > 0 && m_head->m_pkthdr.len > len) {
5385 * This send could be time consuming; let the caller
5386 * dispatch this packet (and any follow-up packets)
5387 * to the TX taskqueue.
5389 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5394 txd = hn_txdesc_get(txr);
5396 txr->hn_no_txdescs++;
5397 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5398 txr->hn_oactive = 1;
5402 error = hn_encap(ifp, txr, txd, &m_head);
5404 /* Both txd and m_head are freed; discard */
5405 KASSERT(txr->hn_agg_txd == NULL,
5406 ("encap failed w/ pending aggregating txdesc"));
5407 drbr_advance(ifp, txr->hn_mbuf_br);
5411 if (txr->hn_agg_pktleft == 0) {
5412 if (txr->hn_agg_txd != NULL) {
5413 KASSERT(m_head == NULL,
5414 ("pending mbuf for aggregating txdesc"));
5415 error = hn_flush_txagg(ifp, txr);
5416 if (__predict_false(error)) {
5417 txr->hn_oactive = 1;
5421 KASSERT(m_head != NULL, ("mbuf was freed"));
5422 error = hn_txpkt(ifp, txr, txd);
5423 if (__predict_false(error)) {
5424 /* txd is freed, but m_head is not */
5425 drbr_putback(ifp, txr->hn_mbuf_br,
5427 txr->hn_oactive = 1;
5434 KASSERT(txr->hn_agg_txd != NULL,
5435 ("no aggregating txdesc"));
5436 KASSERT(m_head == NULL,
5437 ("pending mbuf for aggregating txdesc"));
5442 drbr_advance(ifp, txr->hn_mbuf_br);
5445 /* Flush pending aggregated transmission. */
5446 if (txr->hn_agg_txd != NULL)
5447 hn_flush_txagg(ifp, txr);
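/*
 * if_transmit method.  When the transparent VF is active, packets
 * are handed directly to the VF's if_transmit (with optional BPF
 * taps on this interface); otherwise a TX ring is selected from the
 * mbuf flowid and the packet is enqueued to that ring's buf_ring.
 */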
5452 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5454 struct hn_softc *sc = ifp->if_softc;
5455 struct hn_tx_ring *txr;
5458 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5459 struct rm_priotracker pt;
5461 rm_rlock(&sc->hn_vf_lock, &pt);
5462 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5463 struct mbuf *m_bpf = NULL;
5466 obytes = m->m_pkthdr.len;
5467 if (m->m_flags & M_MCAST)
5470 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5471 if (bpf_peers_present(ifp->if_bpf)) {
5472 m_bpf = m_copypacket(m, M_NOWAIT);
5473 if (m_bpf == NULL) {
5475 * Failed to grab a shallow
5478 ETHER_BPF_MTAP(ifp, m);
5482 ETHER_BPF_MTAP(ifp, m);
5485 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5486 rm_runlock(&sc->hn_vf_lock, &pt);
5488 if (m_bpf != NULL) {
5490 ETHER_BPF_MTAP(ifp, m_bpf);
5494 if (error == ENOBUFS) {
5495 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5497 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5499 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5500 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5502 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5508 rm_runlock(&sc->hn_vf_lock, &pt);
5511 #if defined(INET6) || defined(INET)
5513 * Perform TSO packet header fixup now, since the TSO
5514 * packet header should be cache-hot.
5516 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5517 m = hn_tso_fixup(m);
5518 if (__predict_false(m == NULL)) {
5519 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5526 * Select the TX ring based on flowid
5528 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
5529 #if defined(INET6) || defined(INET)
5532 if (m->m_pkthdr.len < 128 &&
5533 (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
5534 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
5535 m = hn_check_tcpsyn(m, &tcpsyn);
5536 if (__predict_false(m == NULL)) {
5538 IFCOUNTER_OERRORS, 1);
5543 const int tcpsyn = 0;
5548 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
5550 txr = &sc->hn_tx_ring[idx];
5552 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
5554 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5558 if (txr->hn_oactive)
5561 if (txr->hn_sched_tx)
5564 if (mtx_trylock(&txr->hn_tx_lock)) {
5567 sched = hn_xmit(txr, txr->hn_direct_tx_size);
5568 mtx_unlock(&txr->hn_tx_lock);
5573 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5578 hn_tx_ring_qflush(struct hn_tx_ring *txr)
5582 mtx_lock(&txr->hn_tx_lock);
5583 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
5585 mtx_unlock(&txr->hn_tx_lock);
5589 hn_xmit_qflush(struct ifnet *ifp)
5591 struct hn_softc *sc = ifp->if_softc;
5592 struct rm_priotracker pt;
5595 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
5596 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5599 rm_rlock(&sc->hn_vf_lock, &pt);
5600 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
5601 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
5602 rm_runlock(&sc->hn_vf_lock, &pt);
5606 hn_xmit_txeof(struct hn_tx_ring *txr)
5609 if (txr->hn_sched_tx)
5612 if (mtx_trylock(&txr->hn_tx_lock)) {
5615 txr->hn_oactive = 0;
5616 sched = hn_xmit(txr, txr->hn_direct_tx_size);
5617 mtx_unlock(&txr->hn_tx_lock);
5619 taskqueue_enqueue(txr->hn_tx_taskq,
5625 * Release the oactive earlier, with the hope, that
5626 * others could catch up. The task will clear the
5627 * oactive again with the hn_tx_lock to avoid possible
5630 txr->hn_oactive = 0;
5631 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5636 hn_xmit_taskfunc(void *xtxr, int pending __unused)
5638 struct hn_tx_ring *txr = xtxr;
5640 mtx_lock(&txr->hn_tx_lock);
5642 mtx_unlock(&txr->hn_tx_lock);
5646 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
5648 struct hn_tx_ring *txr = xtxr;
5650 mtx_lock(&txr->hn_tx_lock);
5651 txr->hn_oactive = 0;
5653 mtx_unlock(&txr->hn_tx_lock);
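/*
 * Link a VMBus channel to its RX ring (and TX ring, if the
 * sub-channel index is within hn_tx_ring_inuse), bind the channel
 * to a CPU, and open it with the ring's bufring.
 */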
5657 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
5659 struct vmbus_chan_br cbr;
5660 struct hn_rx_ring *rxr;
5661 struct hn_tx_ring *txr = NULL;
5664 idx = vmbus_chan_subidx(chan);
5667 * Link this channel to RX/TX ring.
5669 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
5670 ("invalid channel index %d, should > 0 && < %d",
5671 idx, sc->hn_rx_ring_inuse));
5672 rxr = &sc->hn_rx_ring[idx];
5673 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
5674 ("RX ring %d already attached", idx));
5675 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
5676 rxr->hn_chan = chan;
5679 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
5680 idx, vmbus_chan_id(chan));
5683 if (idx < sc->hn_tx_ring_inuse) {
5684 txr = &sc->hn_tx_ring[idx];
5685 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
5686 ("TX ring %d already attached", idx));
5687 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
5689 txr->hn_chan = chan;
5691 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
5692 idx, vmbus_chan_id(chan));
5696 /* Bind this channel to a proper CPU. */
5697 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
5702 cbr.cbr = rxr->hn_br;
5703 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
5704 cbr.cbr_txsz = HN_TXBR_SIZE;
5705 cbr.cbr_rxsz = HN_RXBR_SIZE;
5706 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
5708 if (error == EISCONN) {
5709 if_printf(sc->hn_ifp, "bufring is connected after "
5710 "chan%u open failure\n", vmbus_chan_id(chan));
5711 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
5713 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
5714 vmbus_chan_id(chan), error);
5721 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
5723 struct hn_rx_ring *rxr;
5726 idx = vmbus_chan_subidx(chan);
5729 * Link this channel to RX/TX ring.
5731 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
5732 ("invalid channel index %d, should > 0 && < %d",
5733 idx, sc->hn_rx_ring_inuse));
5734 rxr = &sc->hn_rx_ring[idx];
5735 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
5736 ("RX ring %d is not attached", idx));
5737 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
5739 if (idx < sc->hn_tx_ring_inuse) {
5740 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
5742 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
5743 ("TX ring %d is not attached attached", idx));
5744 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
5748 * Close this channel.
5751 * Channel closing does _not_ destroy the target channel.
5753 error = vmbus_chan_close_direct(chan);
5754 if (error == EISCONN) {
5755 if_printf(sc->hn_ifp, "chan%u bufring is connected "
5756 "after being closed\n", vmbus_chan_id(chan));
5757 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
5759 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
5760 vmbus_chan_id(chan), error);
5765 hn_attach_subchans(struct hn_softc *sc)
5767 struct vmbus_channel **subchans;
5768 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
5771 KASSERT(subchan_cnt > 0, ("no sub-channels"));
5773 /* Attach the sub-channels. */
5774 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
5775 for (i = 0; i < subchan_cnt; ++i) {
5778 error1 = hn_chan_attach(sc, subchans[i]);
5781 /* Move on; all channels will be detached later. */
5784 vmbus_subchan_rel(subchans, subchan_cnt);
5787 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
5790 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
5798 hn_detach_allchans(struct hn_softc *sc)
5800 struct vmbus_channel **subchans;
5801 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
5804 if (subchan_cnt == 0)
5807 /* Detach the sub-channels. */
5808 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
5809 for (i = 0; i < subchan_cnt; ++i)
5810 hn_chan_detach(sc, subchans[i]);
5811 vmbus_subchan_rel(subchans, subchan_cnt);
5815 * Detach the primary channel, _after_ all sub-channels
5818 hn_chan_detach(sc, sc->hn_prichan);
5820 /* Wait for sub-channels to be destroyed, if any. */
5821 vmbus_subchan_drain(sc->hn_prichan);
5824 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5825 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
5826 HN_RX_FLAG_ATTACHED) == 0,
5827 ("%dth RX ring is still attached", i));
5829 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5830 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
5831 HN_TX_FLAG_ATTACHED) == 0,
5832 ("%dth TX ring is still attached", i));
5838 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
5840 struct vmbus_channel **subchans;
5841 int nchan, rxr_cnt, error;
5843 nchan = *nsubch + 1;
5846 * Multiple RX/TX rings are not requested.
5853 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
5856 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
5858 /* No RSS; this is benign. */
5863 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
5867 if (nchan > rxr_cnt)
5870 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
5876 * Allocate sub-channels from NVS.
5878 *nsubch = nchan - 1;
5879 error = hn_nvs_alloc_subchans(sc, nsubch);
5880 if (error || *nsubch == 0) {
5881 /* Failed to allocate sub-channels. */
5887 * Wait for all sub-channels to become ready before moving on.
5889 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
5890 vmbus_subchan_rel(subchans, *nsubch);
5895 hn_synth_attachable(const struct hn_softc *sc)
5899 if (sc->hn_flags & HN_FLAG_ERRORS)
5902 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5903 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5905 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
5912 * Make sure that the RX filter is zero after the successful
5913 * RNDIS initialization.
5916 * Under certain conditions on certain versions of Hyper-V,
5917 * the RNDIS rxfilter is _not_ zero on the hypervisor side
5918 * after the successful RNDIS initialization, which breaks
5919 * the assumption of any following code (well, it breaks the
5920 * RNDIS API contract actually). Clear the RNDIS rxfilter
5921 * explicitly, drain packets sneaking through, and drain the
5922 * interrupt taskqueues scheduled due to the stealth packets.
5925 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
5929 hn_drain_rxtx(sc, nchan);
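/*
 * Attach the synthetic parts in order: primary channel, NVS, RNDIS,
 * then the sub-channels for multi-TX/RX rings, and finally the RSS
 * key and indirect table.  Any failure rolls back whatever has been
 * attached so far.
 */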
5933 hn_synth_attach(struct hn_softc *sc, int mtu)
5935 #define ATTACHED_NVS 0x0002
5936 #define ATTACHED_RNDIS 0x0004
5938 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
5939 int error, nsubch, nchan = 1, i, rndis_inited;
5940 uint32_t old_caps, attached = 0;
5942 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
5943 ("synthetic parts were attached"));
5945 if (!hn_synth_attachable(sc))
5948 /* Save capabilities for later verification. */
5949 old_caps = sc->hn_caps;
5952 /* Clear RSS stuffs. */
5953 sc->hn_rss_ind_size = 0;
5954 sc->hn_rss_hash = 0;
5957 * Attach the primary channel _before_ attaching NVS and RNDIS.
5959 error = hn_chan_attach(sc, sc->hn_prichan);
5966 error = hn_nvs_attach(sc, mtu);
5969 attached |= ATTACHED_NVS;
5972 * Attach RNDIS _after_ NVS is attached.
5974 error = hn_rndis_attach(sc, mtu, &rndis_inited);
5976 attached |= ATTACHED_RNDIS;
5981 * Make sure capabilities are not changed.
5983 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
5984 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
5985 old_caps, sc->hn_caps);
5991 * Allocate sub-channels for multi-TX/RX rings.
5994 * The # of RX rings that can be used is equivalent to the # of
5995 * channels to be requested.
5997 nsubch = sc->hn_rx_ring_cnt - 1;
5998 error = hn_synth_alloc_subchans(sc, &nsubch);
6001 /* NOTE: _Full_ synthetic parts detach is required now. */
6002 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6005 * Set the # of TX/RX rings that could be used according to
6006 * the # of channels that NVS offered.
6009 hn_set_ring_inuse(sc, nchan);
6011 /* Only the primary channel can be used; done */
6016 * Attach the sub-channels.
6018 * NOTE: hn_set_ring_inuse() _must_ have been called.
6020 error = hn_attach_subchans(sc);
6025 * Configure RSS key and indirect table _after_ all sub-channels
6028 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6030 * RSS key is not set yet; set it to the default RSS key.
6033 if_printf(sc->hn_ifp, "setup default RSS key\n");
6034 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6035 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6038 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6040 * RSS indirect table is not set yet; set it up in round-
6044 if_printf(sc->hn_ifp, "setup default RSS indirect "
6047 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
6048 rss->rss_ind[i] = i % nchan;
6049 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
6052 * # of usable channels may be changed, so we have to
6053 * make sure that all entries in RSS indirect table
6056 * NOTE: hn_set_ring_inuse() _must_ have been called.
6058 hn_rss_ind_fixup(sc);
6061 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6066 * Fixup transmission aggregation setup.
6069 hn_rndis_init_fixat(sc, nchan);
6073 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6074 hn_rndis_init_fixat(sc, nchan);
6075 hn_synth_detach(sc);
6077 if (attached & ATTACHED_RNDIS) {
6078 hn_rndis_init_fixat(sc, nchan);
6079 hn_rndis_detach(sc);
6081 if (attached & ATTACHED_NVS)
6083 hn_chan_detach(sc, sc->hn_prichan);
6084 /* Restore old capabilities. */
6085 sc->hn_caps = old_caps;
6089 #undef ATTACHED_RNDIS
6095 * The interface must have been suspended through hn_suspend() before
6096 * this function gets called.
6099 hn_synth_detach(struct hn_softc *sc)
6102 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6103 ("synthetic parts were not attached"));
6105 /* Detach the RNDIS first. */
6106 hn_rndis_detach(sc);
6111 /* Detach all of the channels. */
6112 hn_detach_allchans(sc);
6114 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6118 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6120 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6121 ("invalid ring count %d", ring_cnt));
6123 if (sc->hn_tx_ring_cnt > ring_cnt)
6124 sc->hn_tx_ring_inuse = ring_cnt;
6126 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6127 sc->hn_rx_ring_inuse = ring_cnt;
6130 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6131 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6136 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6141 * The TX bufring will not be drained by the hypervisor,
6142 * if the primary channel is revoked.
6144 while (!vmbus_chan_rx_empty(chan) ||
6145 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6146 !vmbus_chan_tx_empty(chan)))
6148 vmbus_chan_intr_drain(chan);
6152 hn_disable_rx(struct hn_softc *sc)
6156 * Disable RX by clearing RX filter forcefully.
6158 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6159 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6162 * Give RNDIS enough time to flush all pending data packets.
6164 pause("waitrx", (200 * hz) / 1000);
6169 * RX/TX _must_ have been suspended/disabled, before this function
6173 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6175 struct vmbus_channel **subch = NULL;
6179 * Drain RX/TX bufrings and interrupts.
6183 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6185 if (subch != NULL) {
6188 for (i = 0; i < nsubch; ++i)
6189 hn_chan_drain(sc, subch[i]);
6191 hn_chan_drain(sc, sc->hn_prichan);
6194 vmbus_subchan_rel(subch, nsubch);
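/*
 * Suspend the data path: mark all in-use TX rings suspended, wait
 * for pending sends to finish (unless the primary channel has been
 * revoked), drain the RX/TX bufrings and interrupts, then drain any
 * TX tasks dispatched in the process.
 */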
6198 hn_suspend_data(struct hn_softc *sc)
6200 struct hn_tx_ring *txr;
6208 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6209 txr = &sc->hn_tx_ring[i];
6211 mtx_lock(&txr->hn_tx_lock);
6212 txr->hn_suspended = 1;
6213 mtx_unlock(&txr->hn_tx_lock);
6214 /* No one is able to send more packets now. */
6217 * Wait for all pending sends to finish.
6220 * We will _not_ receive all pending send-done, if the
6221 * primary channel is revoked.
6223 while (hn_tx_ring_pending(txr) &&
6224 !vmbus_chan_is_revoked(sc->hn_prichan))
6225 pause("hnwtx", 1 /* 1 tick */);
6236 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6239 * Drain any pending TX tasks.
6242 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6243 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6245 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6246 txr = &sc->hn_tx_ring[i];
6248 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6249 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6254 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6257 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6261 hn_suspend_mgmt(struct hn_softc *sc)
6268 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6269 * through hn_mgmt_taskq.
6271 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6272 vmbus_chan_run_task(sc->hn_prichan, &task);
6275 * Make sure that all pending management tasks are completed.
6277 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6278 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6279 taskqueue_drain_all(sc->hn_mgmt_taskq0);
6283 hn_suspend(struct hn_softc *sc)
6286 /* Disable polling. */
6290 * If the non-transparent mode VF is activated, the synthetic
6291 * device is receiving packets, so the data path of the
6292 * synthetic device must be suspended.
6294 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6295 (sc->hn_flags & HN_FLAG_RXVF))
6296 hn_suspend_data(sc);
6297 hn_suspend_mgmt(sc);
6301 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6305 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6306 ("invalid TX ring count %d", tx_ring_cnt));
6308 for (i = 0; i < tx_ring_cnt; ++i) {
6309 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6311 mtx_lock(&txr->hn_tx_lock);
6312 txr->hn_suspended = 0;
6313 mtx_unlock(&txr->hn_tx_lock);
6318 hn_resume_data(struct hn_softc *sc)
6327 hn_rxfilter_config(sc);
6330 * Make sure to clear suspend status on "all" TX rings,
6331 * since hn_tx_ring_inuse can be changed after
6332 * hn_suspend_data().
6334 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6336 #ifdef HN_IFSTART_SUPPORT
6337 if (!hn_use_if_start)
6341 * Flush unused drbrs, since hn_tx_ring_inuse may be
6344 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6345 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6351 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6352 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6355 * Use txeof task, so that any pending oactive can be
6358 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6363 hn_resume_mgmt(struct hn_softc *sc)
6366 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6369 * Kick off network change detection, if it was pending.
6370 * If no network change was pending, start link status
6371 * checks, which is more lightweight than network change
6374 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6375 hn_change_network(sc);
6377 hn_update_link_status(sc);
6381 hn_resume(struct hn_softc *sc)
6385 * If the non-transparent mode VF is activated, the synthetic
6386 * device has to receive packets, so the data path of the
6387 * synthetic device must be resumed.
6389 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6390 (sc->hn_flags & HN_FLAG_RXVF))
6394 * Don't resume link status change if VF is attached/activated.
6395 * - In the non-transparent VF mode, the synthetic device marks
6396 * link down until the VF is deactivated; i.e. VF is down.
6397 * - In transparent VF mode, VF's media status is used until
6398 * the VF is detached.
6400 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6401 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6405 * Re-enable polling if this interface is running and
6406 * the polling is requested.
6408 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6409 hn_polling(sc, sc->hn_pollhz);
6413 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6415 const struct rndis_status_msg *msg;
6418 if (dlen < sizeof(*msg)) {
6419 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6424 switch (msg->rm_status) {
6425 case RNDIS_STATUS_MEDIA_CONNECT:
6426 case RNDIS_STATUS_MEDIA_DISCONNECT:
6427 hn_update_link_status(sc);
6430 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6431 case RNDIS_STATUS_LINK_SPEED_CHANGE:
6432 /* Not really useful; ignore. */
6435 case RNDIS_STATUS_NETWORK_CHANGE:
6436 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6437 if (dlen < ofs + msg->rm_stbuflen ||
6438 msg->rm_stbuflen < sizeof(uint32_t)) {
6439 if_printf(sc->hn_ifp, "network changed\n");
6443 memcpy(&change, ((const uint8_t *)msg) + ofs,
6445 if_printf(sc->hn_ifp, "network changed, change %u\n",
6448 hn_change_network(sc);
6452 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6459 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6461 const struct rndis_pktinfo *pi = info_data;
6464 while (info_dlen != 0) {
6468 if (__predict_false(info_dlen < sizeof(*pi)))
6470 if (__predict_false(info_dlen < pi->rm_size))
6472 info_dlen -= pi->rm_size;
6474 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
6476 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
6478 dlen = pi->rm_size - pi->rm_pktinfooffset;
6481 switch (pi->rm_type) {
6482 case NDIS_PKTINFO_TYPE_VLAN:
6483 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
6485 info->vlan_info = *((const uint32_t *)data);
6486 mask |= HN_RXINFO_VLAN;
6489 case NDIS_PKTINFO_TYPE_CSUM:
6490 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
6492 info->csum_info = *((const uint32_t *)data);
6493 mask |= HN_RXINFO_CSUM;
6496 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
6497 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
6499 info->hash_value = *((const uint32_t *)data);
6500 mask |= HN_RXINFO_HASHVAL;
6503 case HN_NDIS_PKTINFO_TYPE_HASHINF:
6504 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
6506 info->hash_info = *((const uint32_t *)data);
6507 mask |= HN_RXINFO_HASHINF;
6514 if (mask == HN_RXINFO_ALL) {
6515 /* All found; done */
6519 pi = (const struct rndis_pktinfo *)
6520 ((const uint8_t *)pi + pi->rm_size);
6525 * - If there is no hash value, invalidate the hash info.
6527 if ((mask & HN_RXINFO_HASHVAL) == 0)
6528 info->hash_info = HN_NDIS_HASH_INFO_INVALID;
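/*
 * Check whether [off, off + len) overlaps
 * [check_off, check_off + check_len); returns false only when the
 * two ranges are disjoint.
 */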
6532 static __inline bool
6533 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
6536 if (off < check_off) {
6537 if (__predict_true(off + len <= check_off))
6539 } else if (off > check_off) {
6540 if (__predict_true(check_off + check_len <= off))
6547 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
6549 const struct rndis_packet_msg *pkt;
6550 struct hn_rxinfo info;
6551 int data_off, pktinfo_off, data_len, pktinfo_len;
6556 if (__predict_false(dlen < sizeof(*pkt))) {
6557 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
6562 if (__predict_false(dlen < pkt->rm_len)) {
6563 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
6564 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
6567 if (__predict_false(pkt->rm_len <
6568 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
6569 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
6570 "msglen %u, data %u, oob %u, pktinfo %u\n",
6571 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
6572 pkt->rm_pktinfolen);
6575 if (__predict_false(pkt->rm_datalen == 0)) {
6576 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
6583 #define IS_OFFSET_INVALID(ofs) \
6584 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
6585 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
6587 /* XXX Hyper-V does not meet data offset alignment requirement */
6588 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
6589 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6590 "data offset %u\n", pkt->rm_dataoffset);
6593 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
6594 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
6595 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6596 "oob offset %u\n", pkt->rm_oobdataoffset);
6599 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
6600 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
6601 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6602 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
6606 #undef IS_OFFSET_INVALID
6608 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
6609 data_len = pkt->rm_datalen;
6610 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
6611 pktinfo_len = pkt->rm_pktinfolen;
6614 * Check OOB coverage.
6616 if (__predict_false(pkt->rm_oobdatalen != 0)) {
6617 int oob_off, oob_len;
6619 if_printf(rxr->hn_ifp, "got oobdata\n");
6620 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
6621 oob_len = pkt->rm_oobdatalen;
6623 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
6624 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6625 "oob overflow, msglen %u, oob abs %d len %d\n",
6626 pkt->rm_len, oob_off, oob_len);
6631 * Check against data.
6633 if (hn_rndis_check_overlap(oob_off, oob_len,
6634 data_off, data_len)) {
6635 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6636 "oob overlaps data, oob abs %d len %d, "
6637 "data abs %d len %d\n",
6638 oob_off, oob_len, data_off, data_len);
6643 * Check against pktinfo.
6645 if (pktinfo_len != 0 &&
6646 hn_rndis_check_overlap(oob_off, oob_len,
6647 pktinfo_off, pktinfo_len)) {
6648 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6649 "oob overlaps pktinfo, oob abs %d len %d, "
6650 "pktinfo abs %d len %d\n",
6651 oob_off, oob_len, pktinfo_off, pktinfo_len);
6657 * Check per-packet-info coverage and find useful per-packet-info.
6659 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
6660 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
6661 info.hash_info = HN_NDIS_HASH_INFO_INVALID;
6662 if (__predict_true(pktinfo_len != 0)) {
6666 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
6667 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6668 "pktinfo overflow, msglen %u, "
6669 "pktinfo abs %d len %d\n",
6670 pkt->rm_len, pktinfo_off, pktinfo_len);
6675 * Check packet info coverage.
6677 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
6678 data_off, data_len);
6679 if (__predict_false(overlap)) {
6680 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6681 "pktinfo overlap data, pktinfo abs %d len %d, "
6682 "data abs %d len %d\n",
6683 pktinfo_off, pktinfo_len, data_off, data_len);
6688 * Find useful per-packet-info.
6690 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
6691 pktinfo_len, &info);
6692 if (__predict_false(error)) {
6693 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
6699 if (__predict_false(data_off + data_len > pkt->rm_len)) {
6700 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6701 "data overflow, msglen %u, data abs %d len %d\n",
6702 pkt->rm_len, data_off, data_len);
6705 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
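/*
 * Dispatch one inbound RNDIS message: data packets take the hot
 * path into hn_rndis_rx_data(); status indications and control
 * completions go to their respective handlers.
 */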
6708 static __inline void
6709 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
6711 const struct rndis_msghdr *hdr;
6713 if (__predict_false(dlen < sizeof(*hdr))) {
6714 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
6719 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
6720 /* Hot data path. */
6721 hn_rndis_rx_data(rxr, data, dlen);
6726 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
6727 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
6729 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
6733 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
6735 const struct hn_nvs_hdr *hdr;
6737 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
6738 if_printf(sc->hn_ifp, "invalid nvs notify\n");
6741 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
6743 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
6744 /* Useless; ignore */
6747 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
6751 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
6752 const struct vmbus_chanpkt_hdr *pkt)
6754 struct hn_nvs_sendctx *sndc;
6756 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
6757 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
6758 VMBUS_CHANPKT_DATALEN(pkt));
6761 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
6767 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
6768 const struct vmbus_chanpkt_hdr *pkthdr)
6770 const struct vmbus_chanpkt_rxbuf *pkt;
6771 const struct hn_nvs_hdr *nvs_hdr;
6774 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
6775 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
6778 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
6780 /* Make sure that this is a RNDIS message. */
6781 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
6782 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
6787 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
6788 if (__predict_false(hlen < sizeof(*pkt))) {
6789 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
6792 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
6794 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
6795 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
6800 count = pkt->cp_rxbuf_cnt;
6801 if (__predict_false(hlen <
6802 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
6803 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
6807 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
6808 for (i = 0; i < count; ++i) {
6811 ofs = pkt->cp_rxbuf[i].rb_ofs;
6812 len = pkt->cp_rxbuf[i].rb_len;
6813 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
6814 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
6815 "ofs %d, len %d\n", i, ofs, len);
6818 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
6822 * Ack the consumed RXBUF associated w/ this channel packet,
6823 * so that this RXBUF can be recycled by the hypervisor.
6825 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
6829 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
6832 struct hn_nvs_rndis_ack ack;
6835 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
6836 ack.nvs_status = HN_NVS_STATUS_OK;
6840 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
6841 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
6842 if (__predict_false(error == EAGAIN)) {
6845 * This should _not_ happen in real world, since the
6846 * consumption of the TX bufring from the TX path is
6849 if (rxr->hn_ack_failed == 0)
6850 if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
6851 rxr->hn_ack_failed++;
6858 if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
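/*
 * Per-channel interrupt callback: keep receiving channel packets
 * until the channel is drained, growing hn_pktbuf on ENOBUFS, and
 * dispatch each packet by its type.
 */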
6863 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
6865 struct hn_rx_ring *rxr = xrxr;
6866 struct hn_softc *sc = rxr->hn_ifp->if_softc;
6869 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
6872 pktlen = rxr->hn_pktbuf_len;
6873 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
6874 if (__predict_false(error == ENOBUFS)) {
6879 * Expand channel packet buffer.
6882 * Use M_WAITOK here, since allocation failure
6885 nlen = rxr->hn_pktbuf_len * 2;
6886 while (nlen < pktlen)
6888 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
6890 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
6891 rxr->hn_pktbuf_len, nlen);
6893 free(rxr->hn_pktbuf, M_DEVBUF);
6894 rxr->hn_pktbuf = nbuf;
6895 rxr->hn_pktbuf_len = nlen;
6898 } else if (__predict_false(error == EAGAIN)) {
6899 /* No more channel packets; done! */
6902 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
6904 switch (pkt->cph_type) {
6905 case VMBUS_CHANPKT_TYPE_COMP:
6906 hn_nvs_handle_comp(sc, chan, pkt);
6909 case VMBUS_CHANPKT_TYPE_RXBUF:
6910 hn_nvs_handle_rxbuf(rxr, chan, pkt);
6913 case VMBUS_CHANPKT_TYPE_INBAND:
6914 hn_nvs_handle_notify(sc, pkt);
6918 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
6923 hn_chan_rollup(rxr, rxr->hn_txr);
6927 hn_sysinit(void *arg __unused)
6931 #ifdef HN_IFSTART_SUPPORT
6933 * Don't use ifnet.if_start if transparent VF mode is requested;
6934 * mainly due to the IFF_DRV_OACTIVE flag.
6936 if (hn_xpnt_vf && hn_use_if_start) {
6937 hn_use_if_start = 0;
6938 printf("hn: tranparent VF mode, if_transmit will be used, "
6939 "instead of if_start\n");
6942 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
6943 printf("hn: invalid transparent VF attach routing "
6944 "wait timeout %d, reset to %d\n",
6945 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
6946 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
6950 * Initialize VF map.
6952 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
6953 hn_vfmap_size = HN_VFMAP_SIZE_DEF;
6954 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
6958 * Fix the # of TX taskqueues.
6960 if (hn_tx_taskq_cnt <= 0)
6961 hn_tx_taskq_cnt = 1;
6962 else if (hn_tx_taskq_cnt > mp_ncpus)
6963 hn_tx_taskq_cnt = mp_ncpus;
6966 * Fix the TX taskqueue mode.
6968 switch (hn_tx_taskq_mode) {
6969 case HN_TX_TASKQ_M_INDEP:
6970 case HN_TX_TASKQ_M_GLOBAL:
6971 case HN_TX_TASKQ_M_EVTTQ:
6974 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
6978 if (vm_guest != VM_GUEST_HV)
6981 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
6984 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
6985 M_DEVBUF, M_WAITOK);
6986 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
6987 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
6988 taskqueue_thread_enqueue, &hn_tx_taskque[i]);
6989 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
6993 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
6996 hn_sysuninit(void *arg __unused)
6999 if (hn_tx_taskque != NULL) {
7002 for (i = 0; i < hn_tx_taskq_cnt; ++i)
7003 taskqueue_free(hn_tx_taskque[i]);
7004 free(hn_tx_taskque, M_DEVBUF);
7007 if (hn_vfmap != NULL)
7008 free(hn_vfmap, M_DEVBUF);
7009 rm_destroy(&hn_vfmap_lock);
7011 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);