/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016-2017 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/ethernet.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"
#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

#define HN_VFMAP_SIZE_DEF		8

#define HN_XPNT_VF_ATTWAIT_MIN		2	/* seconds */

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
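/*
 * NOTE: HN_RNDIS_PKT_LEN is the worst-case amount of per-packet RNDIS
 * metadata: the rndis_packet_msg header plus one pktinfo each for the
 * hash value, the VLAN tag, LSOv2 and TX checksum offload.  No single
 * packet carries all four at once, but reserving the sum up-front keeps
 * the TX path free of per-packet size checks.
 */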
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)					\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
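/*
 * Worked example (illustrative numbers only): for a full 1514-byte
 * Ethernet frame and a 32-byte aggregation alignment,
 * HN_PKTSIZE(m, 32) == roundup2(1514 + HN_RNDIS_PKT_LEN, 32), i.e. the
 * RNDIS metadata always travels with the payload when packets are
 * packed into the chimney sending buffer.
 */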
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)

struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004
struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static void			hn_ifnet_event(void *, struct ifnet *, int);
static void			hn_ifaddr_event(void *, struct ifnet *);
static void			hn_ifnet_attevent(void *, struct ifnet *);
static void			hn_ifnet_detevent(void *, struct ifnet *);
static void			hn_ifnet_lnkevent(void *, struct ifnet *, int);

static bool			hn_ismyvf(const struct hn_softc *,
				    const struct ifnet *);
static void			hn_rxvf_change(struct hn_softc *,
				    struct ifnet *, bool);
static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void			hn_rxvf_set_task(void *, int);
static void			hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
static int			hn_xpnt_vf_iocsetflags(struct hn_softc *);
static int			hn_xpnt_vf_iocsetcaps(struct hn_softc *,
				    struct ifreq *);
static void			hn_xpnt_vf_saveifflags(struct hn_softc *);
static bool			hn_xpnt_vf_isready(struct hn_softc *);
static void			hn_xpnt_vf_setready(struct hn_softc *);
static void			hn_xpnt_vf_init_taskfunc(void *, int);
static void			hn_xpnt_vf_init(struct hn_softc *);
static void			hn_xpnt_vf_setenable(struct hn_softc *);
static void			hn_xpnt_vf_setdisable(struct hn_softc *, bool);
static void			hn_vf_rss_fixup(struct hn_softc *, bool);
static void			hn_vf_rss_restore(struct hn_softc *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);
static void			hn_rndis_init_fixat(struct hn_softc *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);

static void			hn_stop(struct hn_softc *, bool);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_disable_rx(struct hn_softc *);
static void			hn_drain_rxtx(struct hn_softc *, int);
static void			hn_polling(struct hn_softc *, u_int);
static void			hn_chan_polling(struct vmbus_channel *, u_int);
static void			hn_mtu_change_fixup(struct hn_softc *);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
static int			hn_rxfilter_config(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static void			hn_rss_mbuf_hash(struct hn_softc *, uint32_t);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);
static uint32_t			hn_rss_type_fromndis(uint32_t);
static uint32_t			hn_rss_type_tondis(uint32_t);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust tcp segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust udp datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vflist_sysctl, "A", "VF list");

SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vfmap_sysctl, "A", "VF mapping");

static int hn_xpnt_vf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
    &hn_xpnt_vf, 0, "Transparent VF mode");

/* Accurate BPF support for Transparent VF */
static int hn_xpnt_vf_accbpf = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
    &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");

/* Extra wait for transparent VF attach routine; unit: seconds. */
static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
    &hn_xpnt_vf_attwait, 0,
    "Extra wait for transparent VF attach routine; unit: seconds");

static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;

static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static const struct hyperv_guid hn_guid = {
	.hv_guid = {
	    0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
	    0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}
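/*
 * Illustration of the lock-free scan above: if bmap[0] == 0x7 (slots
 * 0-2 taken), ffsl(~0x7) returns 4, so idx becomes 3 after the 1-based
 * adjustment.  atomic_testandset_long() then claims bit 3; if another
 * CPU set that bit first, the test returns non-zero and the scan simply
 * moves on to the next bitmap word.
 */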
static void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
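/*
 * E.g. chim_idx 70 with LONG_BIT == 64 maps to bitmap word 70 / 64 == 1
 * and bit 70 % 64 == 6; the atomic_clear_long() above releases exactly
 * that slot, pairing with the atomic_testandset_long() in
 * hn_chim_alloc().
 */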
#if defined(INET6) || defined(INET)

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)
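/*
 * NOTE: PULLUP_HDR() returns NULL from the _enclosing_ function when
 * m_pullup() fails (which also frees the mbuf chain), so it may only be
 * used inside functions that return a pointer, e.g. hn_tso_fixup() and
 * hn_check_tcpsyn() below.
 */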
/*
 * NOTE: If this function fails, the m_head will have been freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
}
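/*
 * Usage sketch (illustrative only, not a fixed API contract): TX-path
 * callers are expected to do something like
 *
 *	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
 *		m_head = hn_tso_fixup(m_head);
 *		if (m_head == NULL)
 *			return (ENOBUFS);
 *	}
 *
 * i.e. the mbuf must be considered gone whenever NULL comes back.
 */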
/*
 * NOTE: If this function fails, the m_head will have been freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct ether_vlan_header *evl;
	const struct tcphdr *th;
	int ehlen;

	*tcpsyn = 0;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TCP) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);
		if (th->th_flags & TH_SYN)
			*tcpsyn = 1;
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP)
			return (m_head);

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));
		if (th->th_flags & TH_SYN)
			*tcpsyn = 1;
	}
#endif
	return (m_head);
}

#endif	/* INET6 || INET */
static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	/*
	 * If the non-transparent mode VF is activated, we don't know how
	 * its RX filter is configured, so stick the synthetic device in
	 * promiscuous mode.
	 */
	if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}
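/*
 * For a typical up-and-running interface the computed filter is
 * NDIS_PACKET_TYPE_DIRECTED | NDIS_PACKET_TYPE_BROADCAST, with
 * NDIS_PACKET_TYPE_ALL_MULTICAST added once allmulti is set or any
 * multicast group is joined.
 */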
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}
	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;
done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
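/*
 * The "size <= 2 * HN_PKTSIZE_MIN(align)" cutoff above turns aggregation
 * off whenever the chimney buffer cannot hold at least two minimum-sized
 * packets; aggregating a single packet would add copy overhead without
 * batching anything.
 */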
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}

static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}
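/*
 * E.g. after shrinking from 8 RX rings to 2 (nchan == 2), every indirect
 * table entry in the range 2..7 is rewritten to 1, so the host never
 * hashes traffic onto a channel that is no longer in use.
 */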
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

static void
hn_rxvf_set_task(void *xarg, int pending __unused)
{
	struct hn_rxvf_setarg *arg = xarg;

	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
}
static void
hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
{
	struct hn_rx_ring *rxr;
	struct hn_rxvf_setarg arg;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			arg.rxr = rxr;
			arg.vf_ifp = vf_ifp;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_rxvf_ifp = vf_ifp;
		}
	}
}

static bool
hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
{
	const struct ifnet *hn_ifp;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		return (false);

	if (ifp->if_alloctype != IFT_ETHER)
		return (false);

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		return (false);

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		return (false);

	return (true);
}
static void
hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	if (!hn_ismyvf(sc, ifp))
		goto out;
	hn_ifp = sc->hn_ifp;

	if (rxvf) {
		if (sc->hn_flags & HN_FLAG_RXVF)
			goto out;

		sc->hn_flags |= HN_FLAG_RXVF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_RXVF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_RXVF;
		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);

	hn_rxvf_set(sc, rxvf ? ifp : NULL);

	if (rxvf) {
		hn_vf_rss_fixup(sc, true);
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_vf_rss_restore(sc);
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
	    rxvf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "datapath is switched %s %s\n",
		    rxvf ? "to" : "from", ifp->if_xname);
	}
out:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
{

	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;
	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
}

static void
hn_ifaddr_event(void *arg, struct ifnet *ifp)
{

	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
}
static int
hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
{
	struct ifnet *ifp, *vf_ifp;
	uint64_t tmp;
	int error;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Fix up requested capabilities w/ supported capabilities,
	 * since the supported capabilities could have been changed.
	 */
	ifr->ifr_reqcap &= ifp->if_capabilities;
	/* Pass SIOCSIFCAP to VF. */
	error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);

	/*
	 * NOTE:
	 * The error will be propagated to the callers, however, it
	 * is _not_ useful here.
	 */

	/*
	 * Merge VF's enabled capabilities.
	 */
	ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
	if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
	if (ifp->if_capenable & IFCAP_TSO4)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
	if (ifp->if_capenable & IFCAP_TSO6)
		ifp->if_hwassist |= tmp;
	else
		ifp->if_hwassist &= ~tmp;

	return (error);
}

static int
hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
{
	struct ifnet *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	vf_ifp = sc->hn_vf_ifp;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
	ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
	return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
}
static void
hn_xpnt_vf_saveifflags(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	int allmulti = 0;

	HN_LOCK_ASSERT(sc);

	/* XXX vlan(4) style mcast addr maintenance */
	if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
		allmulti = IFF_ALLMULTI;

	/* Always set the VF's if_flags */
	sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
}
static void
hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
{
	struct rm_priotracker pt;
	struct ifnet *hn_ifp = NULL;
	struct mbuf *mn;

	/*
	 * XXX racy, if hn(4) ever detached.
	 */
	rm_rlock(&hn_vfmap_lock, &pt);
	if (vf_ifp->if_index < hn_vfmap_size)
		hn_ifp = hn_vfmap[vf_ifp->if_index];
	rm_runlock(&hn_vfmap_lock, &pt);

	if (hn_ifp != NULL) {
		for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
			/*
			 * Allow tapping on the VF.
			 */
			ETHER_BPF_MTAP(vf_ifp, mn);

			/*
			 * Update VF stats.
			 */
			if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
				if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
				    mn->m_pkthdr.len);
			}
			/*
			 * XXX IFCOUNTER_IMCAST
			 * This stat updating is kinda invasive, since it
			 * requires two checks on the mbuf: the length check
			 * and the ethernet header check.  As of this writing,
			 * all multicast packets go directly to hn(4), which
			 * makes imcast stat updating in the VF a try in vain.
			 */

			/*
			 * Fix up rcvif and increase hn(4)'s ipackets.
			 */
			mn->m_pkthdr.rcvif = hn_ifp;
			if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
		}
		/*
		 * Go through hn(4)'s if_input.
		 */
		hn_ifp->if_input(hn_ifp, m);
	} else {
		/*
		 * In the middle of the transition; free this
		 * mbuf chain.
		 */
		while (m != NULL) {
			mn = m->m_nextpkt;
			m->m_nextpkt = NULL;
			m_freem(m);
			m = mn;
		}
	}
}
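/*
 * The net effect: once hn_ifnet_attevent() points the VF's if_input
 * here, every packet the VF receives is re-attributed to hn(4).  BPF
 * taps on the VF still see the traffic (ETHER_BPF_MTAP above), while
 * the network stack only ever sees the synthetic interface.
 */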
static void
hn_mtu_change_fixup(struct hn_softc *sc)
{
	struct ifnet *ifp;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;

	hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
#endif
}
static uint32_t
hn_rss_type_fromndis(uint32_t rss_hash)
{
	uint32_t types = 0;

	if (rss_hash & NDIS_HASH_IPV4)
		types |= RSS_TYPE_IPV4;
	if (rss_hash & NDIS_HASH_TCP_IPV4)
		types |= RSS_TYPE_TCP_IPV4;
	if (rss_hash & NDIS_HASH_IPV6)
		types |= RSS_TYPE_IPV6;
	if (rss_hash & NDIS_HASH_IPV6_EX)
		types |= RSS_TYPE_IPV6_EX;
	if (rss_hash & NDIS_HASH_TCP_IPV6)
		types |= RSS_TYPE_TCP_IPV6;
	if (rss_hash & NDIS_HASH_TCP_IPV6_EX)
		types |= RSS_TYPE_TCP_IPV6_EX;
	return (types);
}

static uint32_t
hn_rss_type_tondis(uint32_t types)
{
	uint32_t rss_hash = 0;

	KASSERT((types &
	    (RSS_TYPE_UDP_IPV4 | RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0,
	    ("UDP4, UDP6 and UDP6EX are not supported"));

	if (types & RSS_TYPE_IPV4)
		rss_hash |= NDIS_HASH_IPV4;
	if (types & RSS_TYPE_TCP_IPV4)
		rss_hash |= NDIS_HASH_TCP_IPV4;
	if (types & RSS_TYPE_IPV6)
		rss_hash |= NDIS_HASH_IPV6;
	if (types & RSS_TYPE_IPV6_EX)
		rss_hash |= NDIS_HASH_IPV6_EX;
	if (types & RSS_TYPE_TCP_IPV6)
		rss_hash |= NDIS_HASH_TCP_IPV6;
	if (types & RSS_TYPE_TCP_IPV6_EX)
		rss_hash |= NDIS_HASH_TCP_IPV6_EX;
	return (rss_hash);
}

static void
hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash)
{
	int i;

	HN_LOCK_ASSERT(sc);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash;
}
static void
hn_vf_rss_fixup(struct hn_softc *sc, bool reconf)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifrsshash ifrh;
	struct ifrsskey ifrk;
	int error;
	uint32_t my_types, diff_types, mbuf_types = 0;

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1) {
		/* No RSS on synthetic parts; done. */
		return;
	}
	if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) {
		/* Synthetic parts do not support Toeplitz; done. */
		return;
	}

	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Extract VF's RSS key.  Only 40 bytes key for Toeplitz is
	 * supported.
	 */
	memset(&ifrk, 0, sizeof(ifrk));
	strlcpy(ifrk.ifrk_name, vf_ifp->if_xname, sizeof(ifrk.ifrk_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSKEY, (caddr_t)&ifrk);
	if (error) {
		if_printf(ifp, "%s SIOCGRSSKEY failed: %d\n",
		    vf_ifp->if_xname, error);
		return;
	}
	if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrk.ifrk_func);
		return;
	}
	if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) {
		if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n",
		    vf_ifp->if_xname, ifrk.ifrk_keylen);
		return;
	}

	/*
	 * Extract VF's RSS hash.  Only Toeplitz is supported.
	 */
	memset(&ifrh, 0, sizeof(ifrh));
	strlcpy(ifrh.ifrh_name, vf_ifp->if_xname, sizeof(ifrh.ifrh_name));
	error = vf_ifp->if_ioctl(vf_ifp, SIOCGIFRSSHASH, (caddr_t)&ifrh);
	if (error) {
		if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n",
		    vf_ifp->if_xname, error);
		return;
	}
	if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) {
		if_printf(ifp, "%s RSS function %u is not Toeplitz\n",
		    vf_ifp->if_xname, ifrh.ifrh_func);
		return;
	}

	my_types = hn_rss_type_fromndis(sc->hn_rss_hcap);
	if ((ifrh.ifrh_types & my_types) == 0) {
		/* This disables RSS; ignore it then */
		if_printf(ifp, "%s intersection of RSS types failed.  "
		    "VF %#x, mine %#x\n", vf_ifp->if_xname,
		    ifrh.ifrh_types, my_types);
		return;
	}

	diff_types = my_types ^ ifrh.ifrh_types;
	my_types &= ifrh.ifrh_types;
	mbuf_types = my_types;

	/*
	 * Detect RSS hash value/type conflicts.
	 *
	 * NOTE:
	 * We don't disable the hash type, but stop delivering the hash
	 * value/type through mbufs on RX path.
	 */
	if ((my_types & RSS_TYPE_IPV4) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) {
		/* Conflict; disable IPV4 hash type/value delivery. */
		if_printf(ifp, "disable IPV4 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV4;
	}
	if ((my_types & RSS_TYPE_IPV6) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6_EX))) {
		/* Conflict; disable IPV6 hash type/value delivery. */
		if_printf(ifp, "disable IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6;
	}
	if ((my_types & RSS_TYPE_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types &
	     (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 |
	      RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX |
	      RSS_TYPE_IPV6))) {
		/* Conflict; disable IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) {
		/* Conflict; disable TCP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6;
	}
	if ((my_types & RSS_TYPE_TCP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) {
		/* Conflict; disable TCP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) {
		/* Conflict; disable UDP_IPV6 hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6;
	}
	if ((my_types & RSS_TYPE_UDP_IPV6_EX) &&
	    (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) {
		/* Conflict; disable UDP_IPV6_EX hash type/value delivery. */
		if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n");
		mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX;
	}

	/*
	 * Indirect table does not matter.
	 */
	sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) |
	    hn_rss_type_tondis(my_types);
	memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key));
	sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
	if (reconf) {
		error = hn_rss_reconfig(sc);
		if (error) {
			/* XXX roll-back? */
			if_printf(ifp, "hn_rss_reconfig failed: %d\n", error);
			/* XXX keep going. */
		}
	}

	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types));
}
static void
hn_vf_rss_restore(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);
	KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
	    ("%s: synthetic parts are not attached", sc->hn_ifp->if_xname));

	if (sc->hn_rx_ring_inuse == 1)
		goto done;

	/*
	 * Restore hash types.  Key does _not_ matter.
	 */
	if (sc->hn_rss_hash != sc->hn_rss_hcap) {
		int error;

		sc->hn_rss_hash = sc->hn_rss_hcap;
		error = hn_rss_reconfig(sc);
		if (error) {
			if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n",
			    error);
			/* XXX keep going. */
		}
	}
done:
	/* Hash deliverability for mbufs. */
	hn_rss_mbuf_hash(sc, NDIS_HASH_ALL);
}
static void
hn_xpnt_vf_setready(struct hn_softc *sc)
{
	struct ifnet *ifp, *vf_ifp;
	struct ifreq ifr;

	HN_LOCK_ASSERT(sc);
	ifp = sc->hn_ifp;
	vf_ifp = sc->hn_vf_ifp;

	/*
	 * Mark the VF ready.
	 */
	sc->hn_vf_rdytick = 0;

	/*
	 * Save information for restoration.
	 */
	sc->hn_saved_caps = ifp->if_capabilities;
	sc->hn_saved_tsomax = ifp->if_hw_tsomax;
	sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
	sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;

	/*
	 * Intersect supported/enabled capabilities.
	 *
	 * NOTE:
	 * if_hwassist is not changed here.
	 */
	ifp->if_capabilities &= vf_ifp->if_capabilities;
	ifp->if_capenable &= ifp->if_capabilities;

	/*
	 * Fix TSO settings.
	 */
	if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
		ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
	if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
		ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
	if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
		ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;

	/*
	 * Change VF's enabled capabilities.
	 */
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
	ifr.ifr_reqcap = ifp->if_capenable;
	hn_xpnt_vf_iocsetcaps(sc, &ifr);

	if (ifp->if_mtu != ETHERMTU) {
		int error;

		/*
		 * Change VF's MTU.
		 */
		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
		ifr.ifr_mtu = ifp->if_mtu;
		error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
		if (error) {
			if_printf(ifp, "%s SIOCSIFMTU %lu failed\n",
			    vf_ifp->if_xname, ifp->if_mtu);
			if (ifp->if_mtu > ETHERMTU) {
				if_printf(ifp, "change MTU to %d\n", ETHERMTU);

				/*
				 * XXX
				 * No need to adjust the synthetic parts' MTU;
				 * failure of the adjustment will cause us
				 * infinite headache.
				 */
				ifp->if_mtu = ETHERMTU;
				hn_mtu_change_fixup(sc);
			}
		}
	}
}
static bool
hn_xpnt_vf_isready(struct hn_softc *sc)
{

	HN_LOCK_ASSERT(sc);

	if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
		return (false);

	if (sc->hn_vf_rdytick == 0)
		return (true);

	if (sc->hn_vf_rdytick > ticks)
		return (false);

	/* Mark VF as ready. */
	hn_xpnt_vf_setready(sc);
	return (true);
}
static void
hn_xpnt_vf_setenable(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
}

static void
hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
	if (clear_vf)
		sc->hn_vf_ifp = NULL;
	rm_wunlock(&sc->hn_vf_lock);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
}
static void
hn_xpnt_vf_init(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));

	if (bootverbose) {
		if_printf(sc->hn_ifp, "try bringing up %s\n",
		    sc->hn_vf_ifp->if_xname);
	}

	/*
	 * Bring the VF up.
	 */
	hn_xpnt_vf_saveifflags(sc);
	sc->hn_vf_ifp->if_flags |= IFF_UP;
	error = hn_xpnt_vf_iocsetflags(sc);
	if (error) {
		if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
		    sc->hn_vf_ifp->if_xname, error);
		return;
	}

	/*
	 * NOTE:
	 * Datapath setting must happen _after_ bringing the VF up.
	 */
	hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);

	/*
	 * NOTE:
	 * Fixup RSS related bits _after_ the VF is brought up, since
	 * many VFs generate their RSS keys during initialization.
	 */
	hn_vf_rss_fixup(sc, true);

	/* Mark transparent mode VF as enabled. */
	hn_xpnt_vf_setenable(sc);
}
static void
hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		goto done;
	if (sc->hn_vf_ifp == NULL)
		goto done;
	if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
		goto done;

	if (sc->hn_vf_rdytick != 0) {
		/* Mark VF as ready. */
		hn_xpnt_vf_setready(sc);
	}

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
		/*
		 * Delayed VF initialization.
		 */
		if (bootverbose) {
			if_printf(sc->hn_ifp, "delayed initialize %s\n",
			    sc->hn_vf_ifp->if_xname);
		}
		hn_xpnt_vf_init(sc);
	}
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    sc->hn_vf_ifp->if_xname);
		goto done;
	}

	if (hn_xpnt_vf && ifp->if_start != NULL) {
		/*
		 * ifnet.if_start is _not_ supported by transparent
		 * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
		 */
		if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
		    "in transparent VF mode.\n", ifp->if_xname);
		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	if (ifp->if_index >= hn_vfmap_size) {
		struct ifnet **newmap;
		int newsize;

		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);

		memcpy(newmap, hn_vfmap,
		    sizeof(struct ifnet *) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);
		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}

	KASSERT(hn_vfmap[ifp->if_index] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
	hn_vfmap[ifp->if_index] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	/* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
	rm_wlock(&sc->hn_vf_lock);
	KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
	    ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
	sc->hn_vf_ifp = ifp;
	rm_wunlock(&sc->hn_vf_lock);

	if (hn_xpnt_vf) {
		int wait_ticks;

		/*
		 * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
		 * Save vf_ifp's current if_input for later restoration.
		 */
		sc->hn_vf_input = ifp->if_input;
		ifp->if_input = hn_xpnt_vf_input;

		/*
		 * Stop link status management; use the VF's.
		 */
		hn_suspend_mgmt(sc);

		/*
		 * Give the VF some time to complete its attach routine.
		 */
		wait_ticks = hn_xpnt_vf_attwait * hz;
		sc->hn_vf_rdytick = ticks + wait_ticks;

		taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
		    wait_ticks);
	}
done:
	HN_UNLOCK(sc);
}
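/*
 * Sketch of the global bookkeeping above: hn_vfmap[] is indexed by the
 * VF's ifindex and points back at the owning hn(4) ifnet; that reverse
 * map is what lets hn_xpnt_vf_input() re-attribute inbound packets
 * without a softc lookup.
 */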
static void
hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (sc->hn_vf_ifp == NULL)
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (hn_xpnt_vf) {
		/*
		 * Make sure that the delayed initialization is not running.
		 *
		 * NOTE:
		 * - This lock _must_ be released, since the hn_vf_init task
		 *   will try holding this lock.
		 * - It is safe to release this lock here, since the
		 *   hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
		 *
		 * XXX racy, if hn(4) ever detached.
		 */
		HN_UNLOCK(sc);
		taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
		HN_LOCK(sc);

		KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
		    sc->hn_ifp->if_xname));
		ifp->if_input = sc->hn_vf_input;
		sc->hn_vf_input = NULL;

		if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) &&
		    (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED))
			hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);

		if (sc->hn_vf_rdytick == 0) {
			/*
			 * The VF was ready; restore some settings.
			 */
			sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
			/*
			 * NOTE:
			 * There is _no_ need to fixup if_capenable and
			 * if_hwassist, since the if_capabilities before
			 * restoration was an intersection of the VF's
			 * if_capabilities and the synthetic device's
			 * if_capabilities.
			 */
			sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
			sc->hn_ifp->if_hw_tsomaxsegcount =
			    sc->hn_saved_tsosegcnt;
			sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
		}

		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			/*
			 * Restore RSS settings.
			 */
			hn_vf_rss_restore(sc);

			/*
			 * Resume link status management, which was suspended
			 * by hn_ifnet_attevent().
			 */
			hn_resume_mgmt(sc);
		}
	}

	/* Mark transparent mode VF as disabled. */
	hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);

	rm_wlock(&hn_vfmap_lock);

	KASSERT(ifp->if_index < hn_vfmap_size,
	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
	if (hn_vfmap[ifp->if_index] != NULL) {
		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
		    ("%s: ifindex %d was mapped to %s",
		     ifp->if_xname, ifp->if_index,
		     hn_vfmap[ifp->if_index]->if_xname));
		hn_vfmap[ifp->if_index] = NULL;
	}

	rm_wunlock(&hn_vfmap_lock);
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_vf_ifp == ifp)
		if_link_state_change(sc->hn_ifp, link_state);
}

static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}
static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;
	uint32_t mtu;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);
	rm_init(&sc->hn_vf_lock, "hnvf");
	if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
		sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	if (hn_xpnt_vf) {
		/*
		 * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
		 */
		sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
		    taskqueue_thread_enqueue, &sc->hn_vf_taskq);
		taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
		    device_get_nameunit(dev));
		TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
		    hn_xpnt_vf_init_taskfunc, sc);
	}
	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions which will be called after
	 * hn_ifnet_attach().
	 */
	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if error happened later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

	error = hn_rndis_get_mtu(sc, &mtu);
	if (error)
		mtu = ETHERMTU;
	else if (bootverbose)
		device_printf(dev, "RNDIS mtu %u\n", mtu);

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fixup TX stuffs after synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);
2167 ctx = device_get_sysctl_ctx(dev);
2168 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
2169 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
2170 &sc->hn_nvs_ver, 0, "NVS version");
2171 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
2172 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2173 hn_ndis_version_sysctl, "A", "NDIS version");
2174 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
2175 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2176 hn_caps_sysctl, "A", "capabilities");
2177 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
2178 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2179 hn_hwassist_sysctl, "A", "hwassist");
2180 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
2181 CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
2182 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
2183 CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
2184 "max # of TSO segments");
2185 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
2186 CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
2187 "max size of TSO segment");
2188 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
2189 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2190 hn_rxfilter_sysctl, "A", "rxfilter");
2191 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
2192 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2193 hn_rss_hash_sysctl, "A", "RSS hash");
2194 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap",
2195 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2196 hn_rss_hcap_sysctl, "A", "RSS hash capabilities");
2197 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash",
2198 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2199 hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs");
2200 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
2201 CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
2202 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
2203 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2204 hn_rss_key_sysctl, "IU", "RSS key");
2205 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
2206 CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2207 hn_rss_ind_sysctl, "IU", "RSS indirect table");
2208 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
2209 CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
2210 "RNDIS offered packet transmission aggregation size limit");
2211 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
2212 CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
2213 "RNDIS offered packet transmission aggregation count limit");
2214 SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
2215 CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
2216 "RNDIS packet transmission aggregation alignment");
2217 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
2218 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2219 hn_txagg_size_sysctl, "I",
2220 "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
2221 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
2222 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2223 hn_txagg_pkts_sysctl, "I",
2224 "Packet transmission aggregation packets, "
2225 "0 -- disable, -1 -- auto");
2226 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
2227 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2228 hn_polling_sysctl, "I",
2229 "Polling frequency: [100,1000000], 0 disable polling");
2230 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
2231 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2232 hn_vf_sysctl, "A", "Virtual Function's name");
2234 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
2235 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2236 hn_rxvf_sysctl, "A", "activated Virtual Function's name");
2238 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
2239 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
2240 hn_xpnt_vf_enabled_sysctl, "I",
2241 "Transparent VF enabled");
2242 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
2243 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2244 hn_xpnt_vf_accbpf_sysctl, "I",
2245 "Accurate BPF for transparent VF");
2249 * Set up the ifmedia, which has been initialized earlier.
2251 ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
2252 ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
2253 /* XXX ifmedia_set really should do this for us */
2254 sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
2257 * Set up the ifnet for this interface.
2261 ifp->if_baudrate = IF_Gbps(10);
2263 /* if_baudrate is 32 bits on 32-bit systems. */
2264 ifp->if_baudrate = IF_Gbps(1);
2266 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2267 ifp->if_ioctl = hn_ioctl;
2268 ifp->if_init = hn_init;
2269 #ifdef HN_IFSTART_SUPPORT
2270 if (hn_use_if_start) {
2271 int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);
2273 ifp->if_start = hn_start;
2274 IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
2275 ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
2276 IFQ_SET_READY(&ifp->if_snd);
2280 ifp->if_transmit = hn_transmit;
2281 ifp->if_qflush = hn_xmit_qflush;
2284 ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
2286 /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2287 ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
2289 if (sc->hn_caps & HN_CAP_VLAN) {
2290 /* XXX not sure about VLAN_MTU. */
2291 ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
2294 ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
2295 if (ifp->if_hwassist & HN_CSUM_IP_MASK)
2296 ifp->if_capabilities |= IFCAP_TXCSUM;
2297 if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
2298 ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
2299 if (sc->hn_caps & HN_CAP_TSO4) {
2300 ifp->if_capabilities |= IFCAP_TSO4;
2301 ifp->if_hwassist |= CSUM_IP_TSO;
2303 if (sc->hn_caps & HN_CAP_TSO6) {
2304 ifp->if_capabilities |= IFCAP_TSO6;
2305 ifp->if_hwassist |= CSUM_IP6_TSO;
2308 /* Enable all available capabilities by default. */
2309 ifp->if_capenable = ifp->if_capabilities;
2312 * Disable IPv6 TSO and TXCSUM by default, they still can
2313 * be enabled through SIOCSIFCAP.
2315 ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
2316 ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
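/*
 * Example (hypothetical invocation): the IPv6 offloads disabled above
 * can be re-enabled from userland through SIOCSIFCAP, e.g.:
 *
 *	# ifconfig hn0 txcsum6 tso6
 */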
2318 if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
2320 * Lock hn_set_tso_maxsize() to simplify its
2324 hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
2326 ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
2327 ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
2330 ether_ifattach(ifp, eaddr);
2332 if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
2333 if_printf(ifp, "TSO segcnt %u segsz %u\n",
2334 ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
2336 if (mtu < ETHERMTU) {
2337 if_printf(ifp, "fixup mtu %lu -> %u\n", ifp->if_mtu, mtu);
2341 /* Inform the upper layer about the long frame support. */
2342 ifp->if_hdrlen = sizeof(struct ether_vlan_header);
2345 * Kick off link status check.
2347 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
2348 hn_update_link_status(sc);
2351 sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
2352 hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
2353 sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
2354 hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
2356 sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
2357 hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
2362 * Subscribe to the ether_ifattach event instead of the ifnet_arrival
2363 * event, since the interface's LLADDR is needed; the LLADDR is not
2364 * available when the ifnet_arrival event is triggered.
2366 sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
2367 hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
2368 sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
2369 hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);
2373 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
2374 hn_synth_detach(sc);
2380 hn_detach(device_t dev)
2382 struct hn_softc *sc = device_get_softc(dev);
2383 struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
2385 if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
2387 * In case the vmbus missed the orphan handler
2390 vmbus_xact_ctx_orphan(sc->hn_xact);
2393 if (sc->hn_ifaddr_evthand != NULL)
2394 EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
2395 if (sc->hn_ifnet_evthand != NULL)
2396 EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
2397 if (sc->hn_ifnet_atthand != NULL) {
2398 EVENTHANDLER_DEREGISTER(ether_ifattach_event,
2399 sc->hn_ifnet_atthand);
2401 if (sc->hn_ifnet_dethand != NULL) {
2402 EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2403 sc->hn_ifnet_dethand);
2405 if (sc->hn_ifnet_lnkhand != NULL)
2406 EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
2408 vf_ifp = sc->hn_vf_ifp;
2409 __compiler_membar();
2411 hn_ifnet_detevent(sc, vf_ifp);
2413 if (device_is_attached(dev)) {
2415 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
2416 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2420 * hn_stop() only suspends data, so management
2421 * tasks have to be suspended manually here.
2423 hn_suspend_mgmt(sc);
2424 hn_synth_detach(sc);
2427 ether_ifdetach(ifp);
2430 ifmedia_removeall(&sc->hn_media);
2431 hn_destroy_rx_data(sc);
2432 hn_destroy_tx_data(sc);
2434 if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
2437 for (i = 0; i < hn_tx_taskq_cnt; ++i)
2438 taskqueue_free(sc->hn_tx_taskqs[i]);
2439 free(sc->hn_tx_taskqs, M_DEVBUF);
2441 taskqueue_free(sc->hn_mgmt_taskq0);
2442 if (sc->hn_vf_taskq != NULL)
2443 taskqueue_free(sc->hn_vf_taskq);
2445 if (sc->hn_xact != NULL) {
2447 * Uninstall the orphan handler _before_ the xact is
2450 vmbus_chan_unset_orphan(sc->hn_prichan);
2451 vmbus_xact_ctx_destroy(sc->hn_xact);
2456 HN_LOCK_DESTROY(sc);
2457 rm_destroy(&sc->hn_vf_lock);
2462 hn_shutdown(device_t dev)
2469 hn_link_status(struct hn_softc *sc)
2471 uint32_t link_status;
2474 error = hn_rndis_get_linkstatus(sc, &link_status);
2476 /* XXX what to do? */
2480 if (link_status == NDIS_MEDIA_STATE_CONNECTED)
2481 sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
2483 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2484 if_link_state_change(sc->hn_ifp,
2485 (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
2486 LINK_STATE_UP : LINK_STATE_DOWN);
2490 hn_link_taskfunc(void *xsc, int pending __unused)
2492 struct hn_softc *sc = xsc;
2494 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
2500 hn_netchg_init_taskfunc(void *xsc, int pending __unused)
2502 struct hn_softc *sc = xsc;
2504 /* Prevent any link status checks from running. */
2505 sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;
2508 * Fake up a [link down --> link up] state change; a 5 second
2509 * delay is used, which closely simulates the miibus reaction
2510 * to a link down event.
2512 sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
2513 if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
2514 taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
2515 &sc->hn_netchg_status, 5 * hz);
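/*
 * Note: taskqueue timeouts are expressed in ticks, so 5 * hz is
 * 5 seconds regardless of the kernel's HZ setting, hz being the
 * number of timer ticks per second.
 */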
2519 hn_netchg_status_taskfunc(void *xsc, int pending __unused)
2521 struct hn_softc *sc = xsc;
2523 /* Re-allow link status checks. */
2524 sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
2529 hn_update_link_status(struct hn_softc *sc)
2532 if (sc->hn_mgmt_taskq != NULL)
2533 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
2537 hn_change_network(struct hn_softc *sc)
2540 if (sc->hn_mgmt_taskq != NULL)
2541 taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
2545 hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
2546 struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
2548 struct mbuf *m = *m_head;
2551 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));
2553 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
2554 m, segs, nsegs, BUS_DMA_NOWAIT);
2555 if (error == EFBIG) {
2558 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
2562 *m_head = m = m_new;
2563 txr->hn_tx_collapsed++;
2565 error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
2566 txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
2569 bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
2570 BUS_DMASYNC_PREWRITE);
2571 txd->flags |= HN_TXD_FLAG_DMAMAP;
2577 hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
2580 KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
2581 ("put an onlist txd %#x", txd->flags));
2582 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2583 ("put an onagg txd %#x", txd->flags));
2585 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2586 if (atomic_fetchadd_int(&txd->refs, -1) != 1)
2589 if (!STAILQ_EMPTY(&txd->agg_list)) {
2590 struct hn_txdesc *tmp_txd;
2592 while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
2595 KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
2596 ("resursive aggregation on aggregated txdesc"));
2597 KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
2598 ("not aggregated txdesc"));
2599 KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2600 ("aggregated txdesc uses dmamap"));
2601 KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2602 ("aggregated txdesc consumes "
2603 "chimney sending buffer"));
2604 KASSERT(tmp_txd->chim_size == 0,
2605 ("aggregated txdesc has non-zero "
2606 "chimney sending size"));
2608 STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
2609 tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
2610 freed = hn_txdesc_put(txr, tmp_txd);
2611 KASSERT(freed, ("failed to free aggregated txdesc"));
2615 if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
2616 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
2617 ("chim txd uses dmamap"));
2618 hn_chim_free(txr->hn_sc, txd->chim_index);
2619 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2621 } else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
2622 bus_dmamap_sync(txr->hn_tx_data_dtag,
2623 txd->data_dmap, BUS_DMASYNC_POSTWRITE);
2624 bus_dmamap_unload(txr->hn_tx_data_dtag,
2626 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
2629 if (txd->m != NULL) {
2634 txd->flags |= HN_TXD_FLAG_ONLIST;
2635 #ifndef HN_USE_TXDESC_BUFRING
2636 mtx_lock_spin(&txr->hn_txlist_spin);
2637 KASSERT(txr->hn_txdesc_avail >= 0 &&
2638 txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
2639 ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
2640 txr->hn_txdesc_avail++;
2641 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2642 mtx_unlock_spin(&txr->hn_txlist_spin);
2643 #else /* HN_USE_TXDESC_BUFRING */
2645 atomic_add_int(&txr->hn_txdesc_avail, 1);
2647 buf_ring_enqueue(txr->hn_txdesc_br, txd);
2648 #endif /* !HN_USE_TXDESC_BUFRING */
2653 static __inline struct hn_txdesc *
2654 hn_txdesc_get(struct hn_tx_ring *txr)
2656 struct hn_txdesc *txd;
2658 #ifndef HN_USE_TXDESC_BUFRING
2659 mtx_lock_spin(&txr->hn_txlist_spin);
2660 txd = SLIST_FIRST(&txr->hn_txlist);
2662 KASSERT(txr->hn_txdesc_avail > 0,
2663 ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
2664 txr->hn_txdesc_avail--;
2665 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2667 mtx_unlock_spin(&txr->hn_txlist_spin);
2669 txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
2673 #ifdef HN_USE_TXDESC_BUFRING
2675 atomic_subtract_int(&txr->hn_txdesc_avail, 1);
2677 #endif /* HN_USE_TXDESC_BUFRING */
2678 KASSERT(txd->m == NULL && txd->refs == 0 &&
2679 STAILQ_EMPTY(&txd->agg_list) &&
2680 txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
2681 txd->chim_size == 0 &&
2682 (txd->flags & HN_TXD_FLAG_ONLIST) &&
2683 (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
2684 (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
2685 txd->flags &= ~HN_TXD_FLAG_ONLIST;
2691 static __inline void
2692 hn_txdesc_hold(struct hn_txdesc *txd)
2695 /* 0->1 transition will never work */
2696 KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
2697 atomic_add_int(&txd->refs, 1);
2700 static __inline void
2701 hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
2704 KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2705 ("recursive aggregation on aggregating txdesc"));
2707 KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
2708 ("already aggregated"));
2709 KASSERT(STAILQ_EMPTY(&txd->agg_list),
2710 ("recursive aggregation on to-be-aggregated txdesc"));
2712 txd->flags |= HN_TXD_FLAG_ONAGG;
2713 STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
2717 hn_tx_ring_pending(struct hn_tx_ring *txr)
2719 bool pending = false;
2721 #ifndef HN_USE_TXDESC_BUFRING
2722 mtx_lock_spin(&txr->hn_txlist_spin);
2723 if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
2725 mtx_unlock_spin(&txr->hn_txlist_spin);
2727 if (!buf_ring_full(txr->hn_txdesc_br))
2733 static __inline void
2734 hn_txeof(struct hn_tx_ring *txr)
2736 txr->hn_has_txeof = 0;
2741 hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
2742 struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
2744 struct hn_txdesc *txd = sndc->hn_cbarg;
2745 struct hn_tx_ring *txr;
2748 KASSERT(txr->hn_chan == chan,
2749 ("channel mismatch, on chan%u, should be chan%u",
2750 vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));
2752 txr->hn_has_txeof = 1;
2753 hn_txdesc_put(txr, txd);
2755 ++txr->hn_txdone_cnt;
2756 if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
2757 txr->hn_txdone_cnt = 0;
2758 if (txr->hn_oactive)
2764 hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
2766 #if defined(INET) || defined(INET6)
2767 struct lro_ctrl *lro = &rxr->hn_lro;
2768 struct lro_entry *queued;
2770 while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
2771 SLIST_REMOVE_HEAD(&lro->lro_active, next);
2772 tcp_lro_flush(lro, queued);
2778 * 'txr' may be NULL, if multiple channels and the
2779 * ifnet.if_start method are enabled.
2781 if (txr == NULL || !txr->hn_has_txeof)
2784 txr->hn_txdone_cnt = 0;
2788 static __inline uint32_t
2789 hn_rndis_pktmsg_offset(uint32_t ofs)
2792 KASSERT(ofs >= sizeof(struct rndis_packet_msg),
2793 ("invalid RNDIS packet msg offset %u", ofs));
2794 return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
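/*
 * Worked example (offsets illustrative): RNDIS expects in-message
 * offsets to be relative to the rm_dataoffset field rather than to
 * the start of rndis_packet_msg, so a header region of 48 bytes
 * built from the message start goes on the wire as
 * 48 - __offsetof(struct rndis_packet_msg, rm_dataoffset).
 */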
2797 static __inline void *
2798 hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
2799 size_t pi_dlen, uint32_t pi_type)
2801 const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
2802 struct rndis_pktinfo *pi;
2804 KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
2805 ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));
2808 * Per-packet-info does not move; it only grows.
2811 * rm_pktinfooffset in this phase counts from the beginning
2812 * of rndis_packet_msg.
2814 KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
2815 ("%u pktinfo overflows RNDIS packet msg", pi_type));
2816 pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
2817 pkt->rm_pktinfolen);
2818 pkt->rm_pktinfolen += pi_size;
2820 pi->rm_size = pi_size;
2821 pi->rm_type = pi_type;
2822 pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;
2824 return (pi->rm_data);
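/*
 * Resulting layout sketch after two appends (field widths
 * illustrative only):
 *
 *	rndis_packet_msg | pktinfo (hash) | pktinfo (csum) | data
 *	                 ^ rm_pktinfooffset; rm_pktinfolen grows -->
 */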
2828 hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
2830 struct hn_txdesc *txd;
2834 txd = txr->hn_agg_txd;
2835 KASSERT(txd != NULL, ("no aggregate txdesc"));
2838 * Since hn_txpkt() will reset this temporary stat, save
2839 * it now so that oerrors can be updated properly if
2840 * hn_txpkt() ever fails.
2842 pkts = txr->hn_stat_pkts;
2845 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
2846 * failure, save it for later freeing if hn_txpkt() ever
2850 error = hn_txpkt(ifp, txr, txd);
2851 if (__predict_false(error)) {
2852 /* txd is freed, but m is not. */
2855 txr->hn_flush_failed++;
2856 if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
2859 /* Reset all aggregation states. */
2860 txr->hn_agg_txd = NULL;
2861 txr->hn_agg_szleft = 0;
2862 txr->hn_agg_pktleft = 0;
2863 txr->hn_agg_prevpkt = NULL;
2869 hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2874 if (txr->hn_agg_txd != NULL) {
2875 if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
2876 struct hn_txdesc *agg_txd = txr->hn_agg_txd;
2877 struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
2881 * Update the previous RNDIS packet's total length;
2882 * it may increase due to the mandatory alignment
2883 * padding for this RNDIS packet. Also update the
2884 * aggregating txdesc's chimney sending buffer size
2888 * Zero-out the padding, as required by the RNDIS spec.
2891 pkt->rm_len = roundup2(olen, txr->hn_agg_align);
2892 agg_txd->chim_size += pkt->rm_len - olen;
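/*
 * Worked example (numbers illustrative): with hn_agg_align = 8 and
 * a previous packet of rm_len = 123, roundup2(123, 8) = 128; the 5
 * padding bytes are zeroed per the RNDIS spec and chim_size grows
 * by the same 5 bytes.
 */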
2894 /* Link this txdesc to the parent. */
2895 hn_txdesc_agg(agg_txd, txd);
2897 chim = (uint8_t *)pkt + pkt->rm_len;
2898 /* Save the current packet for later fixup. */
2899 txr->hn_agg_prevpkt = chim;
2901 txr->hn_agg_pktleft--;
2902 txr->hn_agg_szleft -= pktsize;
2903 if (txr->hn_agg_szleft <=
2904 HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2906 * We probably can't aggregate more packets;
2907 * flush this aggregating txdesc proactively.
2909 txr->hn_agg_pktleft = 0;
2914 hn_flush_txagg(ifp, txr);
2916 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
2918 txr->hn_tx_chimney_tried++;
2919 txd->chim_index = hn_chim_alloc(txr->hn_sc);
2920 if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
2922 txr->hn_tx_chimney++;
2924 chim = txr->hn_sc->hn_chim +
2925 (txd->chim_index * txr->hn_sc->hn_chim_szmax);
2927 if (txr->hn_agg_pktmax > 1 &&
2928 txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
2929 txr->hn_agg_txd = txd;
2930 txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
2931 txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
2932 txr->hn_agg_prevpkt = chim;
2939 * If this function fails, then both txd and m_head0 will be freed.
2942 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2943 struct mbuf **m_head0)
2945 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
2946 int error, nsegs, i;
2947 struct mbuf *m_head = *m_head0;
2948 struct rndis_packet_msg *pkt;
2951 int pkt_hlen, pkt_size;
2953 pkt = txd->rndis_pkt;
2954 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
2955 if (pkt_size < txr->hn_chim_size) {
2956 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
2960 if (txr->hn_agg_txd != NULL)
2961 hn_flush_txagg(ifp, txr);
2964 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
2965 pkt->rm_len = m_head->m_pkthdr.len;
2966 pkt->rm_dataoffset = 0;
2967 pkt->rm_datalen = m_head->m_pkthdr.len;
2968 pkt->rm_oobdataoffset = 0;
2969 pkt->rm_oobdatalen = 0;
2970 pkt->rm_oobdataelements = 0;
2971 pkt->rm_pktinfooffset = sizeof(*pkt);
2972 pkt->rm_pktinfolen = 0;
2973 pkt->rm_vchandle = 0;
2974 pkt->rm_reserved = 0;
2976 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
2978 * Set the hash value for this packet, so that the host can
2979 * dispatch the TX done event for this packet back to this TX
2982 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2983 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
2984 *pi_data = txr->hn_tx_idx;
2987 if (m_head->m_flags & M_VLANTAG) {
2988 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2989 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
2990 *pi_data = NDIS_VLAN_INFO_MAKE(
2991 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
2992 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
2993 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
2996 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
2997 #if defined(INET6) || defined(INET)
2998 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2999 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
3001 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
3002 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
3003 m_head->m_pkthdr.tso_segsz);
3006 #if defined(INET6) && defined(INET)
3011 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
3012 m_head->m_pkthdr.tso_segsz);
3015 #endif /* INET6 || INET */
3016 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
3017 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
3018 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
3019 if (m_head->m_pkthdr.csum_flags &
3020 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
3021 *pi_data = NDIS_TXCSUM_INFO_IPV6;
3023 *pi_data = NDIS_TXCSUM_INFO_IPV4;
3024 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
3025 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
3028 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
3029 *pi_data |= NDIS_TXCSUM_INFO_TCPCS;
3030 else if (m_head->m_pkthdr.csum_flags &
3031 (CSUM_IP_UDP | CSUM_IP6_UDP))
3032 *pi_data |= NDIS_TXCSUM_INFO_UDPCS;
3035 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
3036 /* Fixup RNDIS packet message total length */
3037 pkt->rm_len += pkt_hlen;
3038 /* Convert RNDIS packet message offsets */
3039 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
3040 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
3043 * Fast path: Chimney sending.
3046 struct hn_txdesc *tgt_txd = txd;
3048 if (txr->hn_agg_txd != NULL) {
3049 tgt_txd = txr->hn_agg_txd;
3055 KASSERT(pkt == chim,
3056 ("RNDIS pkt not in chimney sending buffer"));
3057 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
3058 ("chimney sending buffer is not used"));
3059 tgt_txd->chim_size += pkt->rm_len;
3061 m_copydata(m_head, 0, m_head->m_pkthdr.len,
3062 ((uint8_t *)chim) + pkt_hlen);
3064 txr->hn_gpa_cnt = 0;
3065 txr->hn_sendpkt = hn_txpkt_chim;
3069 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
3070 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
3071 ("chimney buffer is used"));
3072 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
3074 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
3075 if (__predict_false(error)) {
3079 * This mbuf is not linked w/ the txd yet, so free it now.
3084 freed = hn_txdesc_put(txr, txd);
3086 ("fail to free txd upon txdma error"));
3088 txr->hn_txdma_failed++;
3089 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3094 /* +1 RNDIS packet message */
3095 txr->hn_gpa_cnt = nsegs + 1;
3097 /* send packet with page buffer */
3098 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
3099 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
3100 txr->hn_gpa[0].gpa_len = pkt_hlen;
3103 * Fill the page buffers with mbuf info after the page
3104 * buffer for RNDIS packet message.
3106 for (i = 0; i < nsegs; ++i) {
3107 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
3109 gpa->gpa_page = atop(segs[i].ds_addr);
3110 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
3111 gpa->gpa_len = segs[i].ds_len;
3114 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3116 txr->hn_sendpkt = hn_txpkt_sglist;
3120 /* Set the completion routine */
3121 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
3123 /* Update temporary stats for later use. */
3124 txr->hn_stat_pkts++;
3125 txr->hn_stat_size += m_head->m_pkthdr.len;
3126 if (m_head->m_flags & M_MCAST)
3127 txr->hn_stat_mcasts++;
3134 * If this function fails, then txd will be freed, but the mbuf
3135 * associated w/ the txd will _not_ be freed.
3138 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
3140 int error, send_failed = 0, has_bpf;
3143 has_bpf = bpf_peers_present(ifp->if_bpf);
3146 * Make sure that this txd and any aggregated txds are not
3147 * freed before ETHER_BPF_MTAP.
3149 hn_txdesc_hold(txd);
3151 error = txr->hn_sendpkt(txr, txd);
3154 const struct hn_txdesc *tmp_txd;
3156 ETHER_BPF_MTAP(ifp, txd->m);
3157 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
3158 ETHER_BPF_MTAP(ifp, tmp_txd->m);
3161 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
3162 #ifdef HN_IFSTART_SUPPORT
3163 if (!hn_use_if_start)
3166 if_inc_counter(ifp, IFCOUNTER_OBYTES,
3168 if (txr->hn_stat_mcasts != 0) {
3169 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
3170 txr->hn_stat_mcasts);
3173 txr->hn_pkts += txr->hn_stat_pkts;
3177 hn_txdesc_put(txr, txd);
3179 if (__predict_false(error)) {
3183 * This should "really rarely" happen.
3185 * XXX Too many RX to be acked or too many sideband
3186 * commands to run? Ask netvsc_channel_rollup()
3187 * to kick start later.
3189 txr->hn_has_txeof = 1;
3191 txr->hn_send_failed++;
3194 * Try sending again after setting hn_has_txeof,
3195 * in case we missed the last
3196 * netvsc_channel_rollup().
3200 if_printf(ifp, "send failed\n");
3203 * Caller will perform further processing on the
3204 * associated mbuf, so don't free it in hn_txdesc_put();
3205 * only unload it from the DMA map in hn_txdesc_put(),
3209 freed = hn_txdesc_put(txr, txd);
3211 ("fail to free txd upon send error"));
3213 txr->hn_send_failed++;
3216 /* Reset temporary stats, after this sending is done. */
3217 txr->hn_stat_size = 0;
3218 txr->hn_stat_pkts = 0;
3219 txr->hn_stat_mcasts = 0;
3225 * Append the specified data to the indicated mbuf chain;
3226 * extend the mbuf chain if the new data does not fit in
3229 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
3230 * There should be an equivalent in the kernel mbuf code,
3231 * but there does not appear to be one yet.
3233 * Differs from m_append() in that additional mbufs are
3234 * allocated with cluster size MJUMPAGESIZE, and filled
3237 * Return 1 if able to complete the job; otherwise 0.
3240 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
3243 int remainder, space;
3245 for (m = m0; m->m_next != NULL; m = m->m_next)
3248 space = M_TRAILINGSPACE(m);
3251 * Copy into available space.
3253 if (space > remainder)
3255 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
3260 while (remainder > 0) {
3262 * Allocate a new mbuf; could check space
3263 * and allocate a cluster instead.
3265 n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
3268 n->m_len = min(MJUMPAGESIZE, remainder);
3269 bcopy(cp, mtod(n, caddr_t), n->m_len);
3271 remainder -= n->m_len;
3275 if (m0->m_flags & M_PKTHDR)
3276 m0->m_pkthdr.len += len - remainder;
3278 return (remainder == 0);
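/*
 * Usage sketch (hypothetical error handling; mirrors the RX path
 * below):
 *
 *	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
 *	if (m != NULL && !hv_m_append(m, dlen, data))
 *		m_freem(m);	(chain extension ran out of mbufs)
 */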
3281 #if defined(INET) || defined(INET6)
3283 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
3285 #if __FreeBSD_version >= 1100095
3286 if (hn_lro_mbufq_depth) {
3287 tcp_lro_queue_mbuf(lc, m);
3291 return tcp_lro_rx(lc, m, 0);
3296 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
3297 const struct hn_rxinfo *info)
3299 struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
3301 int size, do_lro = 0, do_csum = 1, is_vf = 0;
3302 int hash_type = M_HASHTYPE_NONE;
3305 if (rxr->hn_rxvf_ifp != NULL) {
3307 * Non-transparent mode VF; pretend this packet is from
3310 ifp = rxr->hn_rxvf_ifp;
3312 } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) {
3313 /* Transparent mode VF. */
3317 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
3320 * See the NOTE of hn_rndis_init_fixat(). This
3321 * function can be reached immediately after the
3322 * RNDIS is initialized but before the ifnet is
3323 * set up on the hn_attach() path; drop the unexpected
3329 if (__predict_false(dlen < ETHER_HDR_LEN)) {
3330 if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
3334 if (dlen <= MHLEN) {
3335 m_new = m_gethdr(M_NOWAIT, MT_DATA);
3336 if (m_new == NULL) {
3337 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3340 memcpy(mtod(m_new, void *), data, dlen);
3341 m_new->m_pkthdr.len = m_new->m_len = dlen;
3342 rxr->hn_small_pkts++;
3345 * Get an mbuf with a cluster. For packets 2K or less,
3346 * get a standard 2K cluster. For anything larger, get a
3347 * 4K cluster. Any buffers larger than 4K can cause problems
3348 * if looped around to the Hyper-V TX channel, so avoid them.
3351 if (dlen > MCLBYTES) {
3353 size = MJUMPAGESIZE;
3356 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
3357 if (m_new == NULL) {
3358 if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
3362 hv_m_append(m_new, dlen, data);
3364 m_new->m_pkthdr.rcvif = ifp;
3366 if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
3369 /* receive side checksum offload */
3370 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
3371 /* IP csum offload */
3372 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
3373 m_new->m_pkthdr.csum_flags |=
3374 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3378 /* TCP/UDP csum offload */
3379 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
3380 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
3381 m_new->m_pkthdr.csum_flags |=
3382 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3383 m_new->m_pkthdr.csum_data = 0xffff;
3384 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
3392 * As of this writing (Oct 28th, 2016), the host side will turn
3393 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
3394 * the do_lro setting here is actually _not_ accurate. We
3395 * depend on the RSS hash type check to reset do_lro.
3397 if ((info->csum_info &
3398 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
3399 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
3402 const struct ether_header *eh;
3407 /* Checked at the beginning of this function. */
3408 KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
3410 eh = mtod(m_new, struct ether_header *);
3411 etype = ntohs(eh->ether_type);
3412 if (etype == ETHERTYPE_VLAN) {
3413 const struct ether_vlan_header *evl;
3415 hoff = sizeof(*evl);
3416 if (m_new->m_len < hoff)
3418 evl = mtod(m_new, struct ether_vlan_header *);
3419 etype = ntohs(evl->evl_proto);
3422 if (etype == ETHERTYPE_IP) {
3425 pr = hn_check_iplen(m_new, hoff);
3426 if (pr == IPPROTO_TCP) {
3428 (rxr->hn_trust_hcsum &
3429 HN_TRUST_HCSUM_TCP)) {
3430 rxr->hn_csum_trusted++;
3431 m_new->m_pkthdr.csum_flags |=
3432 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3433 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3434 m_new->m_pkthdr.csum_data = 0xffff;
3437 } else if (pr == IPPROTO_UDP) {
3439 (rxr->hn_trust_hcsum &
3440 HN_TRUST_HCSUM_UDP)) {
3441 rxr->hn_csum_trusted++;
3442 m_new->m_pkthdr.csum_flags |=
3443 (CSUM_IP_CHECKED | CSUM_IP_VALID |
3444 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
3445 m_new->m_pkthdr.csum_data = 0xffff;
3447 } else if (pr != IPPROTO_DONE && do_csum &&
3448 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
3449 rxr->hn_csum_trusted++;
3450 m_new->m_pkthdr.csum_flags |=
3451 (CSUM_IP_CHECKED | CSUM_IP_VALID);
3456 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
3457 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
3458 NDIS_VLAN_INFO_ID(info->vlan_info),
3459 NDIS_VLAN_INFO_PRI(info->vlan_info),
3460 NDIS_VLAN_INFO_CFI(info->vlan_info));
3461 m_new->m_flags |= M_VLANTAG;
3465 * If a VF is activated (transparent/non-transparent mode does not
3470 * hn(4) will only receive broadcast packets, multicast packets,
3471 * TCP SYN and SYN|ACK (in Azure); LRO is useless for these
3474 * For non-transparent, we definitely _cannot_ enable LRO at
3475 * all, since the LRO flush will use hn(4) as the receiving
3476 * interface; i.e. hn_ifp->if_input(hn_ifp, m).
3482 * If a VF is activated (transparent/non-transparent mode does not
3483 * matter here), do _not_ mess with unsupported hash types or
3486 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
3488 m_new->m_pkthdr.flowid = info->hash_value;
3490 hash_type = M_HASHTYPE_OPAQUE;
3491 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
3492 NDIS_HASH_FUNCTION_TOEPLITZ) {
3493 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK &
3498 * do_lro is reset if the hash types are not TCP
3499 * related. See the comment in the above csum_flags
3503 case NDIS_HASH_IPV4:
3504 hash_type = M_HASHTYPE_RSS_IPV4;
3508 case NDIS_HASH_TCP_IPV4:
3509 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
3512 case NDIS_HASH_IPV6:
3513 hash_type = M_HASHTYPE_RSS_IPV6;
3517 case NDIS_HASH_IPV6_EX:
3518 hash_type = M_HASHTYPE_RSS_IPV6_EX;
3522 case NDIS_HASH_TCP_IPV6:
3523 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
3526 case NDIS_HASH_TCP_IPV6_EX:
3527 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
3531 } else if (!is_vf) {
3532 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
3534 M_HASHTYPE_SET(m_new, hash_type);
3536 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
3537 if (hn_ifp != ifp) {
3538 const struct ether_header *eh;
3541 * Non-transparent mode VF is activated.
3545 * Allow tapping on hn(4).
3547 ETHER_BPF_MTAP(hn_ifp, m_new);
3550 * Update hn(4)'s stats.
3552 if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
3553 if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
3554 /* Checked at the beginning of this function. */
3555 KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
3556 eh = mtod(m_new, struct ether_header *);
3557 if (ETHER_IS_MULTICAST(eh->ether_dhost))
3558 if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
3562 if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
3563 #if defined(INET) || defined(INET6)
3564 struct lro_ctrl *lro = &rxr->hn_lro;
3567 rxr->hn_lro_tried++;
3568 if (hn_lro_rx(lro, m_new) == 0) {
3575 ifp->if_input(ifp, m_new);
3581 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
3583 struct hn_softc *sc = ifp->if_softc;
3584 struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
3585 struct ifnet *vf_ifp;
3586 int mask, error = 0;
3587 struct ifrsskey *ifrk;
3588 struct ifrsshash *ifrh;
3593 if (ifr->ifr_mtu > HN_MTU_MAX) {
3600 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3605 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
3606 /* Can't change MTU */
3612 if (ifp->if_mtu == ifr->ifr_mtu) {
3617 if (hn_xpnt_vf_isready(sc)) {
3618 vf_ifp = sc->hn_vf_ifp;
3620 strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
3621 sizeof(ifr_vf.ifr_name));
3622 error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
3626 if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
3627 vf_ifp->if_xname, ifr->ifr_mtu, error);
3633 * Suspend this interface before the synthetic parts
3639 * Detach the synthetic parts, i.e. NVS and RNDIS.
3641 hn_synth_detach(sc);
3644 * Reattach the synthetic parts, i.e. NVS and RNDIS,
3645 * with the new MTU setting.
3647 error = hn_synth_attach(sc, ifr->ifr_mtu);
3653 error = hn_rndis_get_mtu(sc, &mtu);
3656 else if (bootverbose)
3657 if_printf(ifp, "RNDIS mtu %u\n", mtu);
3660 * Commit the requested MTU, after the synthetic parts
3661 * have been successfully attached.
3663 if (mtu >= ifr->ifr_mtu) {
3666 if_printf(ifp, "fixup mtu %d -> %u\n",
3672 * Synthetic parts' reattach may change the chimney
3673 * sending size; update it.
3675 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
3676 hn_set_chim_size(sc, sc->hn_chim_szmax);
3679 * Make sure that various parameters based on MTU are
3680 * still valid, after the MTU change.
3682 hn_mtu_change_fixup(sc);
3685 * All done! Resume the interface now.
3689 if ((sc->hn_flags & HN_FLAG_RXVF) ||
3690 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
3692 * Since we have reattached the NVS part,
3693 * change the datapath to the VF again, in case
3694 * it was lost when the NVS was detached.
3696 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
3705 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3710 if (hn_xpnt_vf_isready(sc))
3711 hn_xpnt_vf_saveifflags(sc);
3713 if (ifp->if_flags & IFF_UP) {
3714 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3716 * Caller might hold a mutex, e.g.
3717 * bpf; use busy-wait for the RNDIS
3721 hn_rxfilter_config(sc);
3724 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
3725 error = hn_xpnt_vf_iocsetflags(sc);
3730 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3733 sc->hn_if_flags = ifp->if_flags;
3741 if (hn_xpnt_vf_isready(sc)) {
3743 strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
3744 sizeof(ifr_vf.ifr_name));
3745 error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
3751 * Fix up requested capabilities w/ supported capabilities,
3752 * since the supported capabilities could have been changed.
3754 mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
3757 if (mask & IFCAP_TXCSUM) {
3758 ifp->if_capenable ^= IFCAP_TXCSUM;
3759 if (ifp->if_capenable & IFCAP_TXCSUM)
3760 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
3762 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
3764 if (mask & IFCAP_TXCSUM_IPV6) {
3765 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
3766 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
3767 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
3769 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
3772 /* TODO: flip RNDIS offload parameters for RXCSUM. */
3773 if (mask & IFCAP_RXCSUM)
3774 ifp->if_capenable ^= IFCAP_RXCSUM;
3776 /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
3777 if (mask & IFCAP_RXCSUM_IPV6)
3778 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
3781 if (mask & IFCAP_LRO)
3782 ifp->if_capenable ^= IFCAP_LRO;
3784 if (mask & IFCAP_TSO4) {
3785 ifp->if_capenable ^= IFCAP_TSO4;
3786 if (ifp->if_capenable & IFCAP_TSO4)
3787 ifp->if_hwassist |= CSUM_IP_TSO;
3789 ifp->if_hwassist &= ~CSUM_IP_TSO;
3791 if (mask & IFCAP_TSO6) {
3792 ifp->if_capenable ^= IFCAP_TSO6;
3793 if (ifp->if_capenable & IFCAP_TSO6)
3794 ifp->if_hwassist |= CSUM_IP6_TSO;
3796 ifp->if_hwassist &= ~CSUM_IP6_TSO;
3806 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
3810 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3812 * Multicast uses a mutex; use busy-wait for
3816 hn_rxfilter_config(sc);
3820 /* XXX vlan(4) style mcast addr maintenance */
3821 if (hn_xpnt_vf_isready(sc)) {
3824 old_if_flags = sc->hn_vf_ifp->if_flags;
3825 hn_xpnt_vf_saveifflags(sc);
3827 if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
3828 ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
3830 error = hn_xpnt_vf_iocsetflags(sc);
3839 if (hn_xpnt_vf_isready(sc)) {
3841 * SIOCGIFMEDIA expects ifmediareq, so don't
3842 * create and pass ifr_vf to the VF here; just
3843 * replace the ifr_name.
3845 vf_ifp = sc->hn_vf_ifp;
3846 strlcpy(ifr->ifr_name, vf_ifp->if_xname,
3847 sizeof(ifr->ifr_name));
3848 error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
3849 /* Restore the ifr_name. */
3850 strlcpy(ifr->ifr_name, ifp->if_xname,
3851 sizeof(ifr->ifr_name));
3856 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
3859 case SIOCGIFRSSHASH:
3860 ifrh = (struct ifrsshash *)data;
3862 if (sc->hn_rx_ring_inuse == 1) {
3864 ifrh->ifrh_func = RSS_FUNC_NONE;
3865 ifrh->ifrh_types = 0;
3869 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3870 ifrh->ifrh_func = RSS_FUNC_TOEPLITZ;
3872 ifrh->ifrh_func = RSS_FUNC_PRIVATE;
3873 ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash);
3878 ifrk = (struct ifrsskey *)data;
3880 if (sc->hn_rx_ring_inuse == 1) {
3882 ifrk->ifrk_func = RSS_FUNC_NONE;
3883 ifrk->ifrk_keylen = 0;
3886 if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ)
3887 ifrk->ifrk_func = RSS_FUNC_TOEPLITZ;
3889 ifrk->ifrk_func = RSS_FUNC_PRIVATE;
3890 ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ;
3891 memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key,
3892 NDIS_HASH_KEYSIZE_TOEPLITZ);
3897 error = ether_ioctl(ifp, cmd, data);
3904 hn_stop(struct hn_softc *sc, bool detaching)
3906 struct ifnet *ifp = sc->hn_ifp;
3911 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
3912 ("synthetic parts were not attached"));
3914 /* Clear RUNNING bit ASAP. */
3915 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3917 /* Disable polling. */
3920 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
3921 KASSERT(sc->hn_vf_ifp != NULL,
3922 ("%s: VF is not attached", ifp->if_xname));
3924 /* Mark transparent mode VF as disabled. */
3925 hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
3929 * Datapath setting must happen _before_ bringing
3932 hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
3935 * Bring the VF down.
3937 hn_xpnt_vf_saveifflags(sc);
3938 sc->hn_vf_ifp->if_flags &= ~IFF_UP;
3939 hn_xpnt_vf_iocsetflags(sc);
3942 /* Suspend data transfers. */
3943 hn_suspend_data(sc);
3945 /* Clear OACTIVE bit. */
3946 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3947 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3948 sc->hn_tx_ring[i].hn_oactive = 0;
3951 * If the non-transparent mode VF is active, make sure
3952 * that the RX filter still allows packet reception.
3954 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
3955 hn_rxfilter_config(sc);
3959 hn_init_locked(struct hn_softc *sc)
3961 struct ifnet *ifp = sc->hn_ifp;
3966 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
3969 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
3972 /* Configure RX filter */
3973 hn_rxfilter_config(sc);
3975 /* Clear OACTIVE bit. */
3976 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3977 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
3978 sc->hn_tx_ring[i].hn_oactive = 0;
3980 /* Clear TX 'suspended' bit. */
3981 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
3983 if (hn_xpnt_vf_isready(sc)) {
3984 /* Initialize transparent VF. */
3985 hn_xpnt_vf_init(sc);
3988 /* Everything is ready; unleash! */
3989 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
3991 /* Re-enable polling if requested. */
3992 if (sc->hn_pollhz > 0)
3993 hn_polling(sc, sc->hn_pollhz);
3999 struct hn_softc *sc = xsc;
4006 #if __FreeBSD_version >= 1100099
4009 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
4011 struct hn_softc *sc = arg1;
4012 unsigned int lenlim;
4015 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
4016 error = sysctl_handle_int(oidp, &lenlim, 0, req);
4017 if (error || req->newptr == NULL)
4021 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
4022 lenlim > TCP_LRO_LENGTH_MAX) {
4026 hn_set_lro_lenlim(sc, lenlim);
4033 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
4035 struct hn_softc *sc = arg1;
4036 int ackcnt, error, i;
4039 * lro_ackcnt_lim is the append count limit;
4040 * +1 turns it into an aggregation limit.
4042 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
4043 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
4044 if (error || req->newptr == NULL)
4047 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
4051 * Convert aggregation limit back to append
4056 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
4057 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
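/*
 * Worked example: a user writing 2 (aggregate every other ACK) passes
 * the [2, TCP_LRO_ACKCNT_MAX + 1] range check above and is converted
 * back to an append count limit of 1 before being stored per ring.
 */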
4065 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
4067 struct hn_softc *sc = arg1;
4072 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
4075 error = sysctl_handle_int(oidp, &on, 0, req);
4076 if (error || req->newptr == NULL)
4080 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4081 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4084 rxr->hn_trust_hcsum |= hcsum;
4086 rxr->hn_trust_hcsum &= ~hcsum;
4093 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
4095 struct hn_softc *sc = arg1;
4096 int chim_size, error;
4098 chim_size = sc->hn_tx_ring[0].hn_chim_size;
4099 error = sysctl_handle_int(oidp, &chim_size, 0, req);
4100 if (error || req->newptr == NULL)
4103 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
4107 hn_set_chim_size(sc, chim_size);
4112 #if __FreeBSD_version < 1100095
4114 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
4116 struct hn_softc *sc = arg1;
4117 int ofs = arg2, i, error;
4118 struct hn_rx_ring *rxr;
4122 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
4123 rxr = &sc->hn_rx_ring[i];
4124 stat += *((int *)((uint8_t *)rxr + ofs));
4127 error = sysctl_handle_64(oidp, &stat, 0, req);
4128 if (error || req->newptr == NULL)
4131 /* Zero out this stat. */
4132 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
4133 rxr = &sc->hn_rx_ring[i];
4134 *((int *)((uint8_t *)rxr + ofs)) = 0;
4140 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
4142 struct hn_softc *sc = arg1;
4143 int ofs = arg2, i, error;
4144 struct hn_rx_ring *rxr;
4148 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4149 rxr = &sc->hn_rx_ring[i];
4150 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
4153 error = sysctl_handle_64(oidp, &stat, 0, req);
4154 if (error || req->newptr == NULL)
4157 /* Zero out this stat. */
4158 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4159 rxr = &sc->hn_rx_ring[i];
4160 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
4168 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4170 struct hn_softc *sc = arg1;
4171 int ofs = arg2, i, error;
4172 struct hn_rx_ring *rxr;
4176 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4177 rxr = &sc->hn_rx_ring[i];
4178 stat += *((u_long *)((uint8_t *)rxr + ofs));
4181 error = sysctl_handle_long(oidp, &stat, 0, req);
4182 if (error || req->newptr == NULL)
4185 /* Zero out this stat. */
4186 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4187 rxr = &sc->hn_rx_ring[i];
4188 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
4194 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
4196 struct hn_softc *sc = arg1;
4197 int ofs = arg2, i, error;
4198 struct hn_tx_ring *txr;
4202 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4203 txr = &sc->hn_tx_ring[i];
4204 stat += *((u_long *)((uint8_t *)txr + ofs));
4207 error = sysctl_handle_long(oidp, &stat, 0, req);
4208 if (error || req->newptr == NULL)
4211 /* Zero out this stat. */
4212 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4213 txr = &sc->hn_tx_ring[i];
4214 *((u_long *)((uint8_t *)txr + ofs)) = 0;
4220 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
4222 struct hn_softc *sc = arg1;
4223 int ofs = arg2, i, error, conf;
4224 struct hn_tx_ring *txr;
4226 txr = &sc->hn_tx_ring[0];
4227 conf = *((int *)((uint8_t *)txr + ofs));
4229 error = sysctl_handle_int(oidp, &conf, 0, req);
4230 if (error || req->newptr == NULL)
4234 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4235 txr = &sc->hn_tx_ring[i];
4236 *((int *)((uint8_t *)txr + ofs)) = conf;
4244 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
4246 struct hn_softc *sc = arg1;
4249 size = sc->hn_agg_size;
4250 error = sysctl_handle_int(oidp, &size, 0, req);
4251 if (error || req->newptr == NULL)
4255 sc->hn_agg_size = size;
4263 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
4265 struct hn_softc *sc = arg1;
4268 pkts = sc->hn_agg_pkts;
4269 error = sysctl_handle_int(oidp, &pkts, 0, req);
4270 if (error || req->newptr == NULL)
4274 sc->hn_agg_pkts = pkts;
4282 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
4284 struct hn_softc *sc = arg1;
4287 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
4288 return (sysctl_handle_int(oidp, &pkts, 0, req));
4292 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
4294 struct hn_softc *sc = arg1;
4297 align = sc->hn_tx_ring[0].hn_agg_align;
4298 return (sysctl_handle_int(oidp, &align, 0, req));
4302 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
4305 vmbus_chan_poll_disable(chan);
4307 vmbus_chan_poll_enable(chan, pollhz);
4311 hn_polling(struct hn_softc *sc, u_int pollhz)
4313 int nsubch = sc->hn_rx_ring_inuse - 1;
4318 struct vmbus_channel **subch;
4321 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4322 for (i = 0; i < nsubch; ++i)
4323 hn_chan_polling(subch[i], pollhz);
4324 vmbus_subchan_rel(subch, nsubch);
4326 hn_chan_polling(sc->hn_prichan, pollhz);
4330 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
4332 struct hn_softc *sc = arg1;
4335 pollhz = sc->hn_pollhz;
4336 error = sysctl_handle_int(oidp, &pollhz, 0, req);
4337 if (error || req->newptr == NULL)
4341 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
4345 if (sc->hn_pollhz != pollhz) {
4346 sc->hn_pollhz = pollhz;
4347 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
4348 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
4349 hn_polling(sc, sc->hn_pollhz);
4357 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
4359 struct hn_softc *sc = arg1;
4362 snprintf(verstr, sizeof(verstr), "%u.%u",
4363 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
4364 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
4365 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
4369 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
4371 struct hn_softc *sc = arg1;
4378 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
4379 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
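/*
 * Note: "%b" is the kernel printf bit-field conversion; it renders
 * the value against the HN_CAP_BITS description string, producing
 * output such as "0x3<VLAN,MTU>" (bit names illustrative).
 */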
4383 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
4385 struct hn_softc *sc = arg1;
4386 char assist_str[128];
4390 hwassist = sc->hn_ifp->if_hwassist;
4392 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
4393 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
4397 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
4399 struct hn_softc *sc = arg1;
4400 char filter_str[128];
4404 filter = sc->hn_rx_filter;
4406 snprintf(filter_str, sizeof(filter_str), "%b", filter,
4408 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
4412 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
4414 struct hn_softc *sc = arg1;
4419 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4420 if (error || req->newptr == NULL)
4423 if ((sc->hn_flags & HN_FLAG_RXVF) ||
4424 (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) {
4426 * The RSS key is synchronized w/ the VF's; don't allow users
4433 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
4436 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4438 if (sc->hn_rx_ring_inuse > 1) {
4439 error = hn_rss_reconfig(sc);
4441 /* Not RSS capable, at least for now; just save the RSS key. */
4450 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
4452 struct hn_softc *sc = arg1;
4457 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4458 if (error || req->newptr == NULL)
4462 * Don't allow the RSS indirect table to change if this
4463 * interface is not currently RSS capable.
4465 if (sc->hn_rx_ring_inuse == 1) {
4470 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
4473 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4475 hn_rss_ind_fixup(sc);
4476 error = hn_rss_reconfig(sc);
4483 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
4485 struct hn_softc *sc = arg1;
4490 hash = sc->hn_rss_hash;
4492 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4493 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4497 hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS)
4499 struct hn_softc *sc = arg1;
4504 hash = sc->hn_rss_hcap;
4506 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4507 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4511 hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS)
4513 struct hn_softc *sc = arg1;
4518 hash = sc->hn_rx_ring[0].hn_mbuf_hash;
4520 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
4521 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
4525 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
4527 struct hn_softc *sc = arg1;
4528 char vf_name[IFNAMSIZ + 1];
4529 struct ifnet *vf_ifp;
4533 vf_ifp = sc->hn_vf_ifp;
4535 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4537 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4541 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
4543 struct hn_softc *sc = arg1;
4544 char vf_name[IFNAMSIZ + 1];
4545 struct ifnet *vf_ifp;
4549 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
4551 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
4553 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
4557 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
4559 struct rm_priotracker pt;
4564 error = sysctl_wire_old_buffer(req, 0);
4568 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4572 rm_rlock(&hn_vfmap_lock, &pt);
4575 for (i = 0; i < hn_vfmap_size; ++i) {
4578 if (hn_vfmap[i] == NULL)
4581 ifp = ifnet_byindex(i);
4584 sbuf_printf(sb, "%s", ifp->if_xname);
4586 sbuf_printf(sb, " %s", ifp->if_xname);
4591 rm_runlock(&hn_vfmap_lock, &pt);
4593 error = sbuf_finish(sb);
4599 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
4601 struct rm_priotracker pt;
4606 error = sysctl_wire_old_buffer(req, 0);
4610 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
4614 rm_rlock(&hn_vfmap_lock, &pt);
4617 for (i = 0; i < hn_vfmap_size; ++i) {
4618 struct ifnet *ifp, *hn_ifp;
4620 hn_ifp = hn_vfmap[i];
4624 ifp = ifnet_byindex(i);
4627 sbuf_printf(sb, "%s:%s", ifp->if_xname,
4630 sbuf_printf(sb, " %s:%s", ifp->if_xname,
4637 rm_runlock(&hn_vfmap_lock, &pt);
4639 error = sbuf_finish(sb);
4645 hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
4647 struct hn_softc *sc = arg1;
4648 int error, onoff = 0;
4650 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
4652 error = sysctl_handle_int(oidp, &onoff, 0, req);
4653 if (error || req->newptr == NULL)
4657 /* NOTE: hn_vf_lock for hn_transmit() */
4658 rm_wlock(&sc->hn_vf_lock);
4660 sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
4662 sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
4663 rm_wunlock(&sc->hn_vf_lock);
4670 hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
4672 struct hn_softc *sc = arg1;
4675 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
4677 return (sysctl_handle_int(oidp, &enabled, 0, req));
4681 hn_check_iplen(const struct mbuf *m, int hoff)
4683 const struct ip *ip;
4684 int len, iphlen, iplen;
4685 const struct tcphdr *th;
4686 int thoff; /* TCP data offset */
4688 len = hoff + sizeof(struct ip);
4690 /* The packet must be at least the size of an IP header. */
4691 if (m->m_pkthdr.len < len)
4692 return IPPROTO_DONE;
4694 /* The fixed IP header must reside completely in the first mbuf. */
4696 return IPPROTO_DONE;
4698 ip = mtodo(m, hoff);
4700 /* Bound check the packet's stated IP header length. */
4701 iphlen = ip->ip_hl << 2;
4702 if (iphlen < sizeof(struct ip)) /* minimum header length */
4703 return IPPROTO_DONE;
4705 /* The full IP header must reside completely in the one mbuf. */
4706 if (m->m_len < hoff + iphlen)
4707 return IPPROTO_DONE;
4709 iplen = ntohs(ip->ip_len);
4712 * Check that the amount of data in the buffers is at
4713 * least as much as the IP header would have us expect.
4715 if (m->m_pkthdr.len < hoff + iplen)
4716 return IPPROTO_DONE;
4719 * Ignore IP fragments.
4721 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
4722 return IPPROTO_DONE;
4725 * The TCP/IP or UDP/IP header must be entirely contained within
4726 * the first fragment of a packet.
4730 if (iplen < iphlen + sizeof(struct tcphdr))
4731 return IPPROTO_DONE;
4732 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
4733 return IPPROTO_DONE;
4734 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
4735 thoff = th->th_off << 2;
4736 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
4737 return IPPROTO_DONE;
4738 if (m->m_len < hoff + iphlen + thoff)
4739 return IPPROTO_DONE;
4742 if (iplen < iphlen + sizeof(struct udphdr))
4743 return IPPROTO_DONE;
4744 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
4745 return IPPROTO_DONE;
4749 return IPPROTO_DONE;
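/*
 * Summary sketch: with hoff = 14 (a plain Ethernet header), a frame
 * is reported as IPPROTO_TCP/IPPROTO_UDP only if, in order, the fixed
 * and full IP headers fit within the first mbuf, the stated ip_len
 * fits within the chain, the packet is not a fragment, and the entire
 * TCP/UDP header lies within the first mbuf; anything else yields
 * IPPROTO_DONE, so the host checksum is not trusted for it.
 */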
4756 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
4758 struct sysctl_oid_list *child;
4759 struct sysctl_ctx_list *ctx;
4760 device_t dev = sc->hn_dev;
4761 #if defined(INET) || defined(INET6)
4762 #if __FreeBSD_version >= 1100095
4769 * Create RXBUF for reception.
4772 * - It is shared by all channels.
4773 * - A large enough buffer is allocated; certain versions of the
4774 * NVS may further limit the usable space.
4776 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4777 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
4778 BUS_DMA_WAITOK | BUS_DMA_ZERO);
4779 if (sc->hn_rxbuf == NULL) {
4780 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
4784 sc->hn_rx_ring_cnt = ring_cnt;
4785 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
4787 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
4788 M_DEVBUF, M_WAITOK | M_ZERO);
4790 #if defined(INET) || defined(INET6)
4791 #if __FreeBSD_version >= 1100095
4792 lroent_cnt = hn_lro_entry_count;
4793 if (lroent_cnt < TCP_LRO_ENTRIES)
4794 lroent_cnt = TCP_LRO_ENTRIES;
4796 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
4798 #endif /* INET || INET6 */
4800 ctx = device_get_sysctl_ctx(dev);
4801 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
4803 /* Create dev.hn.UNIT.rx sysctl tree */
4804 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
4805 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4807 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4808 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4810 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
4811 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
4812 &rxr->hn_br_dma, BUS_DMA_WAITOK);
4813 if (rxr->hn_br == NULL) {
4814 device_printf(dev, "allocate bufring failed\n");
4818 if (hn_trust_hosttcp)
4819 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
4820 if (hn_trust_hostudp)
4821 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
4822 if (hn_trust_hostip)
4823 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
4824 rxr->hn_mbuf_hash = NDIS_HASH_ALL;
4825 rxr->hn_ifp = sc->hn_ifp;
4826 if (i < sc->hn_tx_ring_cnt)
4827 rxr->hn_txr = &sc->hn_tx_ring[i];
4828 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
4829 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
4831 rxr->hn_rxbuf = sc->hn_rxbuf;
4836 #if defined(INET) || defined(INET6)
4837 #if __FreeBSD_version >= 1100095
4838 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
4839 hn_lro_mbufq_depth);
4841 tcp_lro_init(&rxr->hn_lro);
4842 rxr->hn_lro.ifp = sc->hn_ifp;
4844 #if __FreeBSD_version >= 1100099
4845 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
4846 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
4848 #endif /* INET || INET6 */
4850 if (sc->hn_rx_sysctl_tree != NULL) {
4854 * Create per RX ring sysctl tree:
4855 * dev.hn.UNIT.rx.RINGID
4857 snprintf(name, sizeof(name), "%d", i);
4858 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
4859 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
4860 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4862 if (rxr->hn_rx_sysctl_tree != NULL) {
4863 SYSCTL_ADD_ULONG(ctx,
4864 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4865 OID_AUTO, "packets", CTLFLAG_RW,
4866 &rxr->hn_pkts, "# of packets received");
4867 SYSCTL_ADD_ULONG(ctx,
4868 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4869 OID_AUTO, "rss_pkts", CTLFLAG_RW,
4871 "# of packets w/ RSS info received");
4873 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
4874 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
4875 &rxr->hn_pktbuf_len, 0,
4876 "Temporary channel packet buffer length");
4881 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
4882 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4883 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
4884 #if __FreeBSD_version < 1100095
4885 hn_rx_stat_int_sysctl,
4887 hn_rx_stat_u64_sysctl,
4889 "LU", "LRO queued");
4890 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
4891 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4892 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
4893 #if __FreeBSD_version < 1100095
4894 hn_rx_stat_int_sysctl,
4896 hn_rx_stat_u64_sysctl,
4898 "LU", "LRO flushed");
4899 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
4900 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4901 __offsetof(struct hn_rx_ring, hn_lro_tried),
4902 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
4903 #if __FreeBSD_version >= 1100099
4904 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
4905 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4906 hn_lro_lenlim_sysctl, "IU",
4907 "Max # of data bytes to be aggregated by LRO");
4908 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
4909 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4910 hn_lro_ackcnt_sysctl, "I",
4911 "Max # of ACKs to be aggregated by LRO");
4913 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
4914 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
4915 hn_trust_hcsum_sysctl, "I",
4916 "Trust tcp segement verification on host side, "
4917 "when csum info is missing");
4918 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
4919 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
4920 hn_trust_hcsum_sysctl, "I",
4921 "Trust udp datagram verification on host side, "
4922 "when csum info is missing");
4923 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
4924 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
4925 hn_trust_hcsum_sysctl, "I",
4926 "Trust ip packet verification on host side, "
4927 "when csum info is missing");
4928 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
4929 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4930 __offsetof(struct hn_rx_ring, hn_csum_ip),
4931 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
4932 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
4933 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4934 __offsetof(struct hn_rx_ring, hn_csum_tcp),
4935 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
4936 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
4937 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4938 __offsetof(struct hn_rx_ring, hn_csum_udp),
4939 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
4940 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
4941 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4942 __offsetof(struct hn_rx_ring, hn_csum_trusted),
4943 hn_rx_stat_ulong_sysctl, "LU",
4944 "# of packets that we trust host's csum verification");
4945 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
4946 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4947 __offsetof(struct hn_rx_ring, hn_small_pkts),
4948 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
4949 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
4950 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4951 __offsetof(struct hn_rx_ring, hn_ack_failed),
4952 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
4953 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
4954 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
4955 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
4956 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
4962 hn_destroy_rx_data(struct hn_softc *sc)
4966 if (sc->hn_rxbuf != NULL) {
4967 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
4968 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
4970 device_printf(sc->hn_dev, "RXBUF is referenced\n");
4971 sc->hn_rxbuf = NULL;
4974 if (sc->hn_rx_ring_cnt == 0)
4977 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4978 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
4980 if (rxr->hn_br == NULL)
4982 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
4983 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
4985 device_printf(sc->hn_dev,
4986 "%dth channel bufring is referenced", i);
4990 #if defined(INET) || defined(INET6)
4991 tcp_lro_free(&rxr->hn_lro);
4993 free(rxr->hn_pktbuf, M_DEVBUF);
4995 free(sc->hn_rx_ring, M_DEVBUF);
4996 sc->hn_rx_ring = NULL;
4998 sc->hn_rx_ring_cnt = 0;
4999 sc->hn_rx_ring_inuse = 0;
5003 hn_tx_ring_create(struct hn_softc *sc, int id)
5005 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
5006 device_t dev = sc->hn_dev;
5007 bus_dma_tag_t parent_dtag;
5011 txr->hn_tx_idx = id;
5013 #ifndef HN_USE_TXDESC_BUFRING
5014 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
5016 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
5018 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
5019 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
5020 M_DEVBUF, M_WAITOK | M_ZERO);
5021 #ifndef HN_USE_TXDESC_BUFRING
5022 SLIST_INIT(&txr->hn_txlist);
5024 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
5025 M_WAITOK, &txr->hn_tx_lock);
5028 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
5029 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
5030 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
5032 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
5035 #ifdef HN_IFSTART_SUPPORT
5036 if (hn_use_if_start) {
5037 txr->hn_txeof = hn_start_txeof;
5038 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
5039 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
5045 txr->hn_txeof = hn_xmit_txeof;
5046 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
5047 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
5049 br_depth = hn_get_txswq_depth(txr);
5050 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
5051 M_WAITOK, &txr->hn_tx_lock);
5054 txr->hn_direct_tx_size = hn_direct_tx_size;
5057 * Always schedule transmission instead of trying to do direct
5058 * transmission; this has given the best performance so far.  (A
 * sketch of the resulting dispatch policy follows this function.)
5060 txr->hn_sched_tx = 1;
5062 parent_dtag = bus_get_dma_tag(dev);
5064 /* DMA tag for RNDIS packet messages. */
5065 error = bus_dma_tag_create(parent_dtag, /* parent */
5066 HN_RNDIS_PKT_ALIGN, /* alignment */
5067 HN_RNDIS_PKT_BOUNDARY, /* boundary */
5068 BUS_SPACE_MAXADDR, /* lowaddr */
5069 BUS_SPACE_MAXADDR, /* highaddr */
5070 NULL, NULL, /* filter, filterarg */
5071 HN_RNDIS_PKT_LEN, /* maxsize */
5073 HN_RNDIS_PKT_LEN, /* maxsegsize */
5075 NULL, /* lockfunc */
5076 NULL, /* lockfuncarg */
5077 &txr->hn_tx_rndis_dtag);
5079 device_printf(dev, "failed to create rndis dmatag\n");
5083 /* DMA tag for data. */
5084 error = bus_dma_tag_create(parent_dtag, /* parent */
5086 HN_TX_DATA_BOUNDARY, /* boundary */
5087 BUS_SPACE_MAXADDR, /* lowaddr */
5088 BUS_SPACE_MAXADDR, /* highaddr */
5089 NULL, NULL, /* filter, filterarg */
5090 HN_TX_DATA_MAXSIZE, /* maxsize */
5091 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
5092 HN_TX_DATA_SEGSIZE, /* maxsegsize */
5094 NULL, /* lockfunc */
5095 NULL, /* lockfuncarg */
5096 &txr->hn_tx_data_dtag);
5098 device_printf(dev, "failed to create data dmatag\n");
5102 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
5103 struct hn_txdesc *txd = &txr->hn_txdesc[i];
5106 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
5107 STAILQ_INIT(&txd->agg_list);
5110 * Allocate and load RNDIS packet message.
5112 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
5113 (void **)&txd->rndis_pkt,
5114 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
5115 &txd->rndis_pkt_dmap);
5118 "failed to allocate rndis_packet_msg, %d\n", i);
5122 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
5123 txd->rndis_pkt_dmap,
5124 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
5125 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
5129 "failed to load rndis_packet_msg, %d\n", i);
5130 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5131 txd->rndis_pkt, txd->rndis_pkt_dmap);
5135 /* DMA map for TX data. */
5136 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
5140 "failed to allocate tx data dmamap\n");
5141 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
5142 txd->rndis_pkt_dmap);
5143 bus_dmamem_free(txr->hn_tx_rndis_dtag,
5144 txd->rndis_pkt, txd->rndis_pkt_dmap);
5148 /* All set; put it on the list. */
5149 txd->flags |= HN_TXD_FLAG_ONLIST;
5150 #ifndef HN_USE_TXDESC_BUFRING
5151 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
5153 buf_ring_enqueue(txr->hn_txdesc_br, txd);
5156 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
5158 if (sc->hn_tx_sysctl_tree != NULL) {
5159 struct sysctl_oid_list *child;
5160 struct sysctl_ctx_list *ctx;
5164 * Create per TX ring sysctl tree:
5165 * dev.hn.UNIT.tx.RINGID
5167 ctx = device_get_sysctl_ctx(dev);
5168 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
5170 snprintf(name, sizeof(name), "%d", id);
5171 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
5172 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5174 if (txr->hn_tx_sysctl_tree != NULL) {
5175 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
5178 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
5179 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
5180 "# of available TX descs");
5182 #ifdef HN_IFSTART_SUPPORT
5183 if (!hn_use_if_start)
5186 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
5187 CTLFLAG_RD, &txr->hn_oactive, 0,
5190 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
5191 CTLFLAG_RW, &txr->hn_pkts,
5192 "# of packets transmitted");
5193 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
5194 CTLFLAG_RW, &txr->hn_sends, "# of sends");
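/*
 * A self-contained sketch of the dispatch policy hn_sched_tx selects;
 * it mirrors the trylock-or-taskqueue pattern hn_transmit() uses later
 * in this file (the hn_tx_kick_sketch() framing is illustrative).
 */
#if 0
static void
hn_tx_kick_sketch(struct hn_tx_ring *txr)
{
	if (!txr->hn_sched_tx && mtx_trylock(&txr->hn_tx_lock)) {
		int sched;

		/* Try direct transmission; defer the leftover, if any. */
		sched = hn_xmit(txr, txr->hn_direct_tx_size);
		mtx_unlock(&txr->hn_tx_lock);
		if (!sched)
			return;
	}
	/* hn_sched_tx set, lock contended, or leftover: use the taskqueue. */
	taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
}
#endif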
5202 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
5204 struct hn_tx_ring *txr = txd->txr;
5206 KASSERT(txd->m == NULL, ("still has mbuf installed"));
5207 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
5209 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
5210 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
5211 txd->rndis_pkt_dmap);
5212 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
5216 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
5219 KASSERT(txd->refs == 0 || txd->refs == 1,
5220 ("invalid txd refs %d", txd->refs));
5222 /* Aggregated txds will be freed by their aggregating txd. */
5223 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
5226 freed = hn_txdesc_put(txr, txd);
5227 KASSERT(freed, ("can't free txdesc"));
5232 hn_tx_ring_destroy(struct hn_tx_ring *txr)
5236 if (txr->hn_txdesc == NULL)
5241 * Because the freeing of aggregated txds will be deferred
5242 * to the aggregating txd, two passes are used here:
5243 * - The first pass GCes any pending txds. This GC is necessary,
5244 * since if the channels are revoked, hypervisor will not
5245 * deliver send-done for all pending txds.
5246 * - The second pass frees the busdma stuff; it runs only after all txds have been GCed.
5249 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5250 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
5251 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
5252 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
5254 if (txr->hn_tx_data_dtag != NULL)
5255 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
5256 if (txr->hn_tx_rndis_dtag != NULL)
5257 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
5259 #ifdef HN_USE_TXDESC_BUFRING
5260 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
5263 free(txr->hn_txdesc, M_DEVBUF);
5264 txr->hn_txdesc = NULL;
5266 if (txr->hn_mbuf_br != NULL)
5267 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
5269 #ifndef HN_USE_TXDESC_BUFRING
5270 mtx_destroy(&txr->hn_txlist_spin);
5272 mtx_destroy(&txr->hn_tx_lock);
5276 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
5278 struct sysctl_oid_list *child;
5279 struct sysctl_ctx_list *ctx;
5283 * Create TXBUF for chimney sending.
5285 * NOTE: It is shared by all channels.
5287 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
5288 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
5289 BUS_DMA_WAITOK | BUS_DMA_ZERO);
5290 if (sc->hn_chim == NULL) {
5291 device_printf(sc->hn_dev, "allocate txbuf failed\n");
5295 sc->hn_tx_ring_cnt = ring_cnt;
5296 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5298 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
5299 M_DEVBUF, M_WAITOK | M_ZERO);
5301 ctx = device_get_sysctl_ctx(sc->hn_dev);
5302 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
5304 /* Create dev.hn.UNIT.tx sysctl tree */
5305 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
5306 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
5308 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
5311 error = hn_tx_ring_create(sc, i);
5316 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
5317 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5318 __offsetof(struct hn_tx_ring, hn_no_txdescs),
5319 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
5320 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
5321 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5322 __offsetof(struct hn_tx_ring, hn_send_failed),
5323 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
5324 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
5325 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5326 __offsetof(struct hn_tx_ring, hn_txdma_failed),
5327 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
5328 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
5329 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5330 __offsetof(struct hn_tx_ring, hn_flush_failed),
5331 hn_tx_stat_ulong_sysctl, "LU",
5332 "# of packet transmission aggregation flush failure");
5333 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
5334 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5335 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
5336 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
5337 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
5338 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5339 __offsetof(struct hn_tx_ring, hn_tx_chimney),
5340 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
5341 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
5342 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5343 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
5344 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
5345 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
5346 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
5347 "# of total TX descs");
5348 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
5349 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
5350 "Chimney send packet size upper boundary");
5351 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
5352 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
5353 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
5354 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
5355 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5356 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
5357 hn_tx_conf_int_sysctl, "I",
5358 "Size of the packet for direct transmission");
5359 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
5360 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
5361 __offsetof(struct hn_tx_ring, hn_sched_tx),
5362 hn_tx_conf_int_sysctl, "I",
5363 "Always schedule transmission "
5364 "instead of doing direct transmission");
5365 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
5366 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
5367 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
5368 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
5369 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
5370 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
5371 "Applied packet transmission aggregation size");
5372 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
5373 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5374 hn_txagg_pktmax_sysctl, "I",
5375 "Applied packet transmission aggregation packets");
5376 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
5377 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
5378 hn_txagg_align_sysctl, "I",
5379 "Applied packet transmission aggregation alignment");
5385 hn_set_chim_size(struct hn_softc *sc, int chim_size)
5389 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5390 sc->hn_tx_ring[i].hn_chim_size = chim_size;
5394 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
5396 struct ifnet *ifp = sc->hn_ifp;
5402 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
5405 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
5406 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
5407 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
5409 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
5410 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
5411 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
5413 if (tso_maxlen < tso_minlen)
5414 tso_maxlen = tso_minlen;
5415 else if (tso_maxlen > IP_MAXPACKET)
5416 tso_maxlen = IP_MAXPACKET;
5417 if (tso_maxlen > sc->hn_ndis_tso_szmax)
5418 tso_maxlen = sc->hn_ndis_tso_szmax;
5419 hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
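/*
 * Worked example (illustrative numbers): with hn_ndis_tso_sgmin == 2 and
 * mtu == 1500, tso_minlen is 3000; a requested tso_maxlen of 1M is first
 * clamped to IP_MAXPACKET (65535), then to hn_ndis_tso_szmax.  Subtracting
 * the 14-byte Ethernet header plus the 4-byte VLAN encapsulation leaves
 * hw_tsomax = tso_maxlen - 18 (the transparent VF clamp below may lower
 * it further).
 */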
5421 if (hn_xpnt_vf_isready(sc)) {
5422 if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
5423 hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
5425 ifp->if_hw_tsomax = hw_tsomax;
5427 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
5431 hn_fixup_tx_data(struct hn_softc *sc)
5433 uint64_t csum_assist;
5436 hn_set_chim_size(sc, sc->hn_chim_szmax);
5437 if (hn_tx_chimney_size > 0 &&
5438 hn_tx_chimney_size < sc->hn_chim_szmax)
5439 hn_set_chim_size(sc, hn_tx_chimney_size);
5442 if (sc->hn_caps & HN_CAP_IPCS)
5443 csum_assist |= CSUM_IP;
5444 if (sc->hn_caps & HN_CAP_TCP4CS)
5445 csum_assist |= CSUM_IP_TCP;
5446 if (sc->hn_caps & HN_CAP_UDP4CS)
5447 csum_assist |= CSUM_IP_UDP;
5448 if (sc->hn_caps & HN_CAP_TCP6CS)
5449 csum_assist |= CSUM_IP6_TCP;
5450 if (sc->hn_caps & HN_CAP_UDP6CS)
5451 csum_assist |= CSUM_IP6_UDP;
5452 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5453 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
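/*
 * Example (illustrative): a host advertising HN_CAP_IPCS | HN_CAP_TCP4CS
 * | HN_CAP_TCP6CS yields csum_assist == CSUM_IP | CSUM_IP_TCP |
 * CSUM_IP6_TCP, which every TX ring then carries in hn_csum_assist.
 */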
5455 if (sc->hn_caps & HN_CAP_HASHVAL) {
5457 * Support HASHVAL pktinfo on TX path.
5460 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
5461 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5462 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
5467 hn_destroy_tx_data(struct hn_softc *sc)
5471 if (sc->hn_chim != NULL) {
5472 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
5473 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
5475 device_printf(sc->hn_dev,
5476 "chimney sending buffer is referenced");
5481 if (sc->hn_tx_ring_cnt == 0)
5484 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
5485 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
5487 free(sc->hn_tx_ring, M_DEVBUF);
5488 sc->hn_tx_ring = NULL;
5490 sc->hn_tx_ring_cnt = 0;
5491 sc->hn_tx_ring_inuse = 0;
5494 #ifdef HN_IFSTART_SUPPORT
5497 hn_start_taskfunc(void *xtxr, int pending __unused)
5499 struct hn_tx_ring *txr = xtxr;
5501 mtx_lock(&txr->hn_tx_lock);
5502 hn_start_locked(txr, 0);
5503 mtx_unlock(&txr->hn_tx_lock);
5507 hn_start_locked(struct hn_tx_ring *txr, int len)
5509 struct hn_softc *sc = txr->hn_sc;
5510 struct ifnet *ifp = sc->hn_ifp;
5513 KASSERT(hn_use_if_start,
5514 ("hn_start_locked is called, when if_start is disabled"));
5515 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5516 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5517 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5519 if (__predict_false(txr->hn_suspended))
5522 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
5526 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
5527 struct hn_txdesc *txd;
5528 struct mbuf *m_head;
5531 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
5535 if (len > 0 && m_head->m_pkthdr.len > len) {
5537 * This send could be time-consuming; let callers
5538 * dispatch this packet (and any packets that
5539 * follow it) to the tx taskqueue.
5541 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5546 #if defined(INET6) || defined(INET)
5547 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
5548 m_head = hn_tso_fixup(m_head);
5549 if (__predict_false(m_head == NULL)) {
5550 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5556 txd = hn_txdesc_get(txr);
5558 txr->hn_no_txdescs++;
5559 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5560 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5564 error = hn_encap(ifp, txr, txd, &m_head);
5566 /* Both txd and m_head are freed */
5567 KASSERT(txr->hn_agg_txd == NULL,
5568 ("encap failed w/ pending aggregating txdesc"));
5572 if (txr->hn_agg_pktleft == 0) {
5573 if (txr->hn_agg_txd != NULL) {
5574 KASSERT(m_head == NULL,
5575 ("pending mbuf for aggregating txdesc"));
5576 error = hn_flush_txagg(ifp, txr);
5577 if (__predict_false(error)) {
5578 atomic_set_int(&ifp->if_drv_flags,
5583 KASSERT(m_head != NULL, ("mbuf was freed"));
5584 error = hn_txpkt(ifp, txr, txd);
5585 if (__predict_false(error)) {
5586 /* txd is freed, but m_head is not */
5587 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
5588 atomic_set_int(&ifp->if_drv_flags,
5596 KASSERT(txr->hn_agg_txd != NULL,
5597 ("no aggregating txdesc"));
5598 KASSERT(m_head == NULL,
5599 ("pending mbuf for aggregating txdesc"));
5604 /* Flush pending aggregated transmission. */
5605 if (txr->hn_agg_txd != NULL)
5606 hn_flush_txagg(ifp, txr);
5611 hn_start(struct ifnet *ifp)
5613 struct hn_softc *sc = ifp->if_softc;
5614 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
5616 if (txr->hn_sched_tx)
5619 if (mtx_trylock(&txr->hn_tx_lock)) {
5622 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5623 mtx_unlock(&txr->hn_tx_lock);
5628 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5632 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
5634 struct hn_tx_ring *txr = xtxr;
5636 mtx_lock(&txr->hn_tx_lock);
5637 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
5638 hn_start_locked(txr, 0);
5639 mtx_unlock(&txr->hn_tx_lock);
5643 hn_start_txeof(struct hn_tx_ring *txr)
5645 struct hn_softc *sc = txr->hn_sc;
5646 struct ifnet *ifp = sc->hn_ifp;
5648 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
5650 if (txr->hn_sched_tx)
5653 if (mtx_trylock(&txr->hn_tx_lock)) {
5656 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5657 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
5658 mtx_unlock(&txr->hn_tx_lock);
5660 taskqueue_enqueue(txr->hn_tx_taskq,
5666 * Release OACTIVE early, in the hope that
5667 * others can catch up. The task will clear the
5668 * flag again under hn_tx_lock to avoid possible races.
5671 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
5672 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5676 #endif /* HN_IFSTART_SUPPORT */
5679 hn_xmit(struct hn_tx_ring *txr, int len)
5681 struct hn_softc *sc = txr->hn_sc;
5682 struct ifnet *ifp = sc->hn_ifp;
5683 struct mbuf *m_head;
5686 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
5687 #ifdef HN_IFSTART_SUPPORT
5688 KASSERT(hn_use_if_start == 0,
5689 ("hn_xmit is called, when if_start is enabled"));
5691 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
5693 if (__predict_false(txr->hn_suspended))
5696 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
5699 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
5700 struct hn_txdesc *txd;
5703 if (len > 0 && m_head->m_pkthdr.len > len) {
5705 * This send could be time-consuming; let callers
5706 * dispatch this packet (and any packets that
5707 * follow it) to the tx taskqueue.
5709 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5714 txd = hn_txdesc_get(txr);
5716 txr->hn_no_txdescs++;
5717 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
5718 txr->hn_oactive = 1;
5722 error = hn_encap(ifp, txr, txd, &m_head);
5724 /* Both txd and m_head are freed; discard */
5725 KASSERT(txr->hn_agg_txd == NULL,
5726 ("encap failed w/ pending aggregating txdesc"));
5727 drbr_advance(ifp, txr->hn_mbuf_br);
5731 if (txr->hn_agg_pktleft == 0) {
5732 if (txr->hn_agg_txd != NULL) {
5733 KASSERT(m_head == NULL,
5734 ("pending mbuf for aggregating txdesc"));
5735 error = hn_flush_txagg(ifp, txr);
5736 if (__predict_false(error)) {
5737 txr->hn_oactive = 1;
5741 KASSERT(m_head != NULL, ("mbuf was freed"));
5742 error = hn_txpkt(ifp, txr, txd);
5743 if (__predict_false(error)) {
5744 /* txd is freed, but m_head is not */
5745 drbr_putback(ifp, txr->hn_mbuf_br,
5747 txr->hn_oactive = 1;
5754 KASSERT(txr->hn_agg_txd != NULL,
5755 ("no aggregating txdesc"));
5756 KASSERT(m_head == NULL,
5757 ("pending mbuf for aggregating txdesc"));
5762 drbr_advance(ifp, txr->hn_mbuf_br);
5765 /* Flush pending aggregated transmission. */
5766 if (txr->hn_agg_txd != NULL)
5767 hn_flush_txagg(ifp, txr);
5772 hn_transmit(struct ifnet *ifp, struct mbuf *m)
5774 struct hn_softc *sc = ifp->if_softc;
5775 struct hn_tx_ring *txr;
5778 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
5779 struct rm_priotracker pt;
5781 rm_rlock(&sc->hn_vf_lock, &pt);
5782 if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
5783 struct mbuf *m_bpf = NULL;
5786 obytes = m->m_pkthdr.len;
5787 if (m->m_flags & M_MCAST)
5790 if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
5791 if (bpf_peers_present(ifp->if_bpf)) {
5792 m_bpf = m_copypacket(m, M_NOWAIT);
5793 if (m_bpf == NULL) {
5795 * Failed to grab a shallow copy; tap now.
5798 ETHER_BPF_MTAP(ifp, m);
5802 ETHER_BPF_MTAP(ifp, m);
5805 error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
5806 rm_runlock(&sc->hn_vf_lock, &pt);
5808 if (m_bpf != NULL) {
5810 ETHER_BPF_MTAP(ifp, m_bpf);
5814 if (error == ENOBUFS) {
5815 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5817 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5819 if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
5820 if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
5822 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
5828 rm_runlock(&sc->hn_vf_lock, &pt);
5831 #if defined(INET6) || defined(INET)
5833 * Perform TSO packet header fixup now, since the TSO
5834 * packet header should be cache-hot.
5836 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
5837 m = hn_tso_fixup(m);
5838 if (__predict_false(m == NULL)) {
5839 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
5846 * Select the TX ring based on flowid
5848 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
5849 #if defined(INET6) || defined(INET)
5852 if (m->m_pkthdr.len < 128 &&
5853 (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
5854 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
5855 m = hn_check_tcpsyn(m, &tcpsyn);
5856 if (__predict_false(m == NULL)) {
5858 IFCOUNTER_OERRORS, 1);
5863 const int tcpsyn = 0;
5868 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
5870 txr = &sc->hn_tx_ring[idx];
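/*
 * Illustrative example: with hn_tx_ring_inuse == 4, flowid 0x2f selects
 * ring 3 (0x2f % 4 == 3); pinning a flow to one ring preserves per-flow
 * packet ordering.
 */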
5872 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
5874 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
5878 if (txr->hn_oactive)
5881 if (txr->hn_sched_tx)
5884 if (mtx_trylock(&txr->hn_tx_lock)) {
5887 sched = hn_xmit(txr, txr->hn_direct_tx_size);
5888 mtx_unlock(&txr->hn_tx_lock);
5893 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
5898 hn_tx_ring_qflush(struct hn_tx_ring *txr)
5902 mtx_lock(&txr->hn_tx_lock);
5903 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
5905 mtx_unlock(&txr->hn_tx_lock);
5909 hn_xmit_qflush(struct ifnet *ifp)
5911 struct hn_softc *sc = ifp->if_softc;
5912 struct rm_priotracker pt;
5915 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
5916 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5919 rm_rlock(&sc->hn_vf_lock, &pt);
5920 if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
5921 sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
5922 rm_runlock(&sc->hn_vf_lock, &pt);
5926 hn_xmit_txeof(struct hn_tx_ring *txr)
5929 if (txr->hn_sched_tx)
5932 if (mtx_trylock(&txr->hn_tx_lock)) {
5935 txr->hn_oactive = 0;
5936 sched = hn_xmit(txr, txr->hn_direct_tx_size);
5937 mtx_unlock(&txr->hn_tx_lock);
5939 taskqueue_enqueue(txr->hn_tx_taskq,
5945 * Release oactive early, in the hope that
5946 * others can catch up. The task will clear
5947 * oactive again under hn_tx_lock to avoid possible races.
5950 txr->hn_oactive = 0;
5951 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5956 hn_xmit_taskfunc(void *xtxr, int pending __unused)
5958 struct hn_tx_ring *txr = xtxr;
5960 mtx_lock(&txr->hn_tx_lock);
5962 mtx_unlock(&txr->hn_tx_lock);
5966 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
5968 struct hn_tx_ring *txr = xtxr;
5970 mtx_lock(&txr->hn_tx_lock);
5971 txr->hn_oactive = 0;
5973 mtx_unlock(&txr->hn_tx_lock);
5977 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
5979 struct vmbus_chan_br cbr;
5980 struct hn_rx_ring *rxr;
5981 struct hn_tx_ring *txr = NULL;
5984 idx = vmbus_chan_subidx(chan);
5987 * Link this channel to RX/TX ring.
5989 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
5990 ("invalid channel index %d, should > 0 && < %d",
5991 idx, sc->hn_rx_ring_inuse));
5992 rxr = &sc->hn_rx_ring[idx];
5993 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
5994 ("RX ring %d already attached", idx));
5995 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
5996 rxr->hn_chan = chan;
5999 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
6000 idx, vmbus_chan_id(chan));
6003 if (idx < sc->hn_tx_ring_inuse) {
6004 txr = &sc->hn_tx_ring[idx];
6005 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
6006 ("TX ring %d already attached", idx));
6007 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
6009 txr->hn_chan = chan;
6011 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
6012 idx, vmbus_chan_id(chan));
6016 /* Bind this channel to a proper CPU. */
6017 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
6022 cbr.cbr = rxr->hn_br;
6023 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
6024 cbr.cbr_txsz = HN_TXBR_SIZE;
6025 cbr.cbr_rxsz = HN_RXBR_SIZE;
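/*
 * NOTE (summary, assuming the standard vmbus bufring layout): hn_br is
 * one DMA block holding the TX bufring followed by the RX bufring;
 * cbr_txsz/cbr_rxsz tell vmbus_chan_open_br() where to split it, matching
 * the HN_TXBR_SIZE + HN_RXBR_SIZE allocation in hn_create_rx_data().
 */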
6026 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
6028 if (error == EISCONN) {
6029 if_printf(sc->hn_ifp, "bufring is connected after "
6030 "chan%u open failure\n", vmbus_chan_id(chan));
6031 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6033 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
6034 vmbus_chan_id(chan), error);
6041 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
6043 struct hn_rx_ring *rxr;
6046 idx = vmbus_chan_subidx(chan);
6049 * Unlink this channel from the RX/TX ring.
6051 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
6052 ("invalid channel index %d, should > 0 && < %d",
6053 idx, sc->hn_rx_ring_inuse));
6054 rxr = &sc->hn_rx_ring[idx];
6055 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
6056 ("RX ring %d is not attached", idx));
6057 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
6059 if (idx < sc->hn_tx_ring_inuse) {
6060 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
6062 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
6063 ("TX ring %d is not attached attached", idx));
6064 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
6068 * Close this channel.
6071 * Channel closing does _not_ destroy the target channel.
6073 error = vmbus_chan_close_direct(chan);
6074 if (error == EISCONN) {
6075 if_printf(sc->hn_ifp, "chan%u bufring is connected "
6076 "after being closed\n", vmbus_chan_id(chan));
6077 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
6079 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
6080 vmbus_chan_id(chan), error);
6085 hn_attach_subchans(struct hn_softc *sc)
6087 struct vmbus_channel **subchans;
6088 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6091 KASSERT(subchan_cnt > 0, ("no sub-channels"));
6093 /* Attach the sub-channels. */
6094 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6095 for (i = 0; i < subchan_cnt; ++i) {
6098 error1 = hn_chan_attach(sc, subchans[i]);
6101 /* Move on; all channels will be detached later. */
6104 vmbus_subchan_rel(subchans, subchan_cnt);
6107 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
6110 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
6118 hn_detach_allchans(struct hn_softc *sc)
6120 struct vmbus_channel **subchans;
6121 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
6124 if (subchan_cnt == 0)
6127 /* Detach the sub-channels. */
6128 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
6129 for (i = 0; i < subchan_cnt; ++i)
6130 hn_chan_detach(sc, subchans[i]);
6131 vmbus_subchan_rel(subchans, subchan_cnt);
6135 * Detach the primary channel, _after_ all sub-channels
6138 hn_chan_detach(sc, sc->hn_prichan);
6140 /* Wait for sub-channels to be destroyed, if any. */
6141 vmbus_subchan_drain(sc->hn_prichan);
6144 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6145 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
6146 HN_RX_FLAG_ATTACHED) == 0,
6147 ("%dth RX ring is still attached", i));
6149 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
6150 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
6151 HN_TX_FLAG_ATTACHED) == 0,
6152 ("%dth TX ring is still attached", i));
6158 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
6160 struct vmbus_channel **subchans;
6161 int nchan, rxr_cnt, error;
6163 nchan = *nsubch + 1;
6166 * Multiple RX/TX rings are not requested.
6173 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
6176 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
6178 /* No RSS; this is benign. */
6183 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
6187 if (nchan > rxr_cnt)
6190 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
6196 * Allocate sub-channels from NVS.
6198 *nsubch = nchan - 1;
6199 error = hn_nvs_alloc_subchans(sc, nsubch);
6200 if (error || *nsubch == 0) {
6201 /* Failed to allocate sub-channels. */
6207 * Wait for all sub-channels to become ready before moving on.
6209 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
6210 vmbus_subchan_rel(subchans, *nsubch);
6215 hn_synth_attachable(const struct hn_softc *sc)
6219 if (sc->hn_flags & HN_FLAG_ERRORS)
6222 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
6223 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
6225 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
6232 * Make sure that the RX filter is zero after the successful
6233 * RNDIS initialization.
6236 * Under certain conditions on certain versions of Hyper-V,
6237 * the RNDIS rxfilter is _not_ zero on the hypervisor side
6238 * after the successful RNDIS initialization, which breaks
6239 * the assumption of any following code (well, it breaks the
6240 * RNDIS API contract actually). Clear the RNDIS rxfilter
6241 * explicitly, drain packets sneaking through, and drain the
6242 * interrupt taskqueues scheduled due to the stealth packets.
6245 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
hn_rndis_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); /* ignore error */
6249 hn_drain_rxtx(sc, nchan);
6253 hn_synth_attach(struct hn_softc *sc, int mtu)
6255 #define ATTACHED_NVS 0x0002
6256 #define ATTACHED_RNDIS 0x0004
6258 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
6259 int error, nsubch, nchan = 1, i, rndis_inited;
6260 uint32_t old_caps, attached = 0;
6262 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
6263 ("synthetic parts were attached"));
6265 if (!hn_synth_attachable(sc))
6268 /* Save capabilities for later verification. */
6269 old_caps = sc->hn_caps;
6272 /* Clear RSS stuffs. */
6273 sc->hn_rss_ind_size = 0;
6274 sc->hn_rss_hash = 0;
6275 sc->hn_rss_hcap = 0;
6278 * Attach the primary channel _before_ attaching NVS and RNDIS.
6280 error = hn_chan_attach(sc, sc->hn_prichan);
6287 error = hn_nvs_attach(sc, mtu);
6290 attached |= ATTACHED_NVS;
6293 * Attach RNDIS _after_ NVS is attached.
6295 error = hn_rndis_attach(sc, mtu, &rndis_inited);
6297 attached |= ATTACHED_RNDIS;
6302 * Make sure capabilities are not changed.
6304 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
6305 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
6306 old_caps, sc->hn_caps);
6312 * Allocate sub-channels for multi-TX/RX rings.
6315 * The # of RX rings that can be used is equivalent to the # of
6316 * channels to be requested.
6318 nsubch = sc->hn_rx_ring_cnt - 1;
6319 error = hn_synth_alloc_subchans(sc, &nsubch);
6322 /* NOTE: _Full_ synthetic parts detach is required now. */
6323 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
6326 * Set the # of TX/RX rings that could be used according to
6327 * the # of channels that NVS offered.
6330 hn_set_ring_inuse(sc, nchan);
6332 /* Only the primary channel can be used; done */
6337 * Attach the sub-channels.
6339 * NOTE: hn_set_ring_inuse() _must_ have been called.
6341 error = hn_attach_subchans(sc);
6346 * Configure RSS key and indirect table _after_ all sub-channels
6349 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
6351 * RSS key is not set yet; set it to the default RSS key.
6354 if_printf(sc->hn_ifp, "setup default RSS key\n");
6355 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
6356 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
6359 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
6361 * RSS indirect table is not set yet; set it up in round-robin fashion.
6365 if_printf(sc->hn_ifp, "setup default RSS indirect "
6368 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
6369 rss->rss_ind[i] = i % nchan;
6370 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
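/*
 * For example (illustrative): with nchan == 4 the indirect table becomes
 * 0, 1, 2, 3, 0, 1, ... across its NDIS_HASH_INDCNT entries, spreading
 * RX flows round-robin over the usable channels.
 */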
6373 * # of usable channels may be changed, so we have to
6374 * make sure that all entries in the RSS indirect table are valid.
6377 * NOTE: hn_set_ring_inuse() _must_ have been called.
6379 hn_rss_ind_fixup(sc);
6382 sc->hn_rss_hash = sc->hn_rss_hcap;
6383 if ((sc->hn_flags & HN_FLAG_RXVF) ||
6384 (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
6385 /* NOTE: Don't reconfigure RSS; will do immediately. */
6386 hn_vf_rss_fixup(sc, false);
6388 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
6393 * Fixup transmission aggregation setup.
6396 hn_rndis_init_fixat(sc, nchan);
6400 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
6401 hn_rndis_init_fixat(sc, nchan);
6402 hn_synth_detach(sc);
6404 if (attached & ATTACHED_RNDIS) {
6405 hn_rndis_init_fixat(sc, nchan);
6406 hn_rndis_detach(sc);
6408 if (attached & ATTACHED_NVS)
6410 hn_chan_detach(sc, sc->hn_prichan);
6411 /* Restore old capabilities. */
6412 sc->hn_caps = old_caps;
6416 #undef ATTACHED_RNDIS
6422 * The interface must have been suspended through hn_suspend(), before
6423 * this function gets called.
6426 hn_synth_detach(struct hn_softc *sc)
6429 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
6430 ("synthetic parts were not attached"));
6432 /* Detach the RNDIS first. */
6433 hn_rndis_detach(sc);
6438 /* Detach all of the channels. */
6439 hn_detach_allchans(sc);
6441 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
6445 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
6447 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
6448 ("invalid ring count %d", ring_cnt));
6450 if (sc->hn_tx_ring_cnt > ring_cnt)
6451 sc->hn_tx_ring_inuse = ring_cnt;
6453 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
6454 sc->hn_rx_ring_inuse = ring_cnt;
6457 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
6458 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
6463 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
6468 * The TX bufring will not be drained by the hypervisor,
6469 * if the primary channel is revoked.
6471 while (!vmbus_chan_rx_empty(chan) ||
6472 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
6473 !vmbus_chan_tx_empty(chan)))
6475 vmbus_chan_intr_drain(chan);
6479 hn_disable_rx(struct hn_softc *sc)
6483 * Disable RX by clearing RX filter forcefully.
6485 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
6486 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
6489 * Give RNDIS enough time to flush all pending data packets.
6491 pause("waitrx", (200 * hz) / 1000);
6496 * RX/TX _must_ have been suspended/disabled, before this function
6500 hn_drain_rxtx(struct hn_softc *sc, int nchan)
6502 struct vmbus_channel **subch = NULL;
6506 * Drain RX/TX bufrings and interrupts.
6510 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
6512 if (subch != NULL) {
6515 for (i = 0; i < nsubch; ++i)
6516 hn_chan_drain(sc, subch[i]);
6518 hn_chan_drain(sc, sc->hn_prichan);
6521 vmbus_subchan_rel(subch, nsubch);
6525 hn_suspend_data(struct hn_softc *sc)
6527 struct hn_tx_ring *txr;
6535 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6536 txr = &sc->hn_tx_ring[i];
6538 mtx_lock(&txr->hn_tx_lock);
6539 txr->hn_suspended = 1;
6540 mtx_unlock(&txr->hn_tx_lock);
6541 /* No one is able to send more packets now. */
6544 * Wait for all pending sends to finish.
6547 * We will _not_ receive all pending send-dones if the
6548 * primary channel is revoked.
6550 while (hn_tx_ring_pending(txr) &&
6551 !vmbus_chan_is_revoked(sc->hn_prichan))
6552 pause("hnwtx", 1 /* 1 tick */);
6563 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
6566 * Drain any pending TX tasks.
6569 * The above hn_drain_rxtx() can dispatch TX tasks, so the TX
6570 * tasks will have to be drained _after_ the above hn_drain_rxtx().
6572 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6573 txr = &sc->hn_tx_ring[i];
6575 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
6576 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
6581 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
6584 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
6588 hn_suspend_mgmt(struct hn_softc *sc)
6595 * Make sure that hn_mgmt_taskq0 can no longer be accessed
6596 * through hn_mgmt_taskq.
6598 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
6599 vmbus_chan_run_task(sc->hn_prichan, &task);
6602 * Make sure that all pending management tasks are completed.
6604 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
6605 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
6606 taskqueue_drain_all(sc->hn_mgmt_taskq0);
6610 hn_suspend(struct hn_softc *sc)
6613 /* Disable polling. */
6617 * If the non-transparent mode VF is activated, the synthetic
6618 * device is receiving packets, so the data path of the
6619 * synthetic device must be suspended.
6621 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6622 (sc->hn_flags & HN_FLAG_RXVF))
6623 hn_suspend_data(sc);
6624 hn_suspend_mgmt(sc);
6628 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
6632 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
6633 ("invalid TX ring count %d", tx_ring_cnt));
6635 for (i = 0; i < tx_ring_cnt; ++i) {
6636 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6638 mtx_lock(&txr->hn_tx_lock);
6639 txr->hn_suspended = 0;
6640 mtx_unlock(&txr->hn_tx_lock);
6645 hn_resume_data(struct hn_softc *sc)
6654 hn_rxfilter_config(sc);
6657 * Make sure to clear suspend status on "all" TX rings,
6658 * since hn_tx_ring_inuse can be changed after
6659 * hn_suspend_data().
6661 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
6663 #ifdef HN_IFSTART_SUPPORT
6664 if (!hn_use_if_start)
6668 * Flush unused drbrs, since hn_tx_ring_inuse may have been changed.
6671 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
6672 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
6678 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
6679 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
6682 * Use the txeof task, so that any pending oactive can be cleared properly.
6685 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
6690 hn_resume_mgmt(struct hn_softc *sc)
6693 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
6696 * Kick off network change detection, if it was pending.
6697 * If no network change was pending, start link status
6698 * checks, which are more lightweight than network change detection.
6701 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
6702 hn_change_network(sc);
6704 hn_update_link_status(sc);
6708 hn_resume(struct hn_softc *sc)
6712 * If the non-transparent mode VF is activated, the synthetic
6713 * device has to receive packets, so the data path of the
6714 * synthetic device must be resumed.
6716 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
6717 (sc->hn_flags & HN_FLAG_RXVF))
6721 * Don't resume link status change if VF is attached/activated.
6722 * - In the non-transparent VF mode, the synthetic device keeps the
6723 * link marked down until the VF is deactivated, i.e. until the VF is down.
6724 * - In transparent VF mode, VF's media status is used until
6725 * the VF is detached.
6727 if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
6728 !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
6732 * Re-enable polling if this interface is running and
6733 * the polling is requested.
6735 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
6736 hn_polling(sc, sc->hn_pollhz);
6740 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
6742 const struct rndis_status_msg *msg;
6745 if (dlen < sizeof(*msg)) {
6746 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
6751 switch (msg->rm_status) {
6752 case RNDIS_STATUS_MEDIA_CONNECT:
6753 case RNDIS_STATUS_MEDIA_DISCONNECT:
6754 hn_update_link_status(sc);
6757 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
6758 case RNDIS_STATUS_LINK_SPEED_CHANGE:
6759 /* Not really useful; ignore. */
6762 case RNDIS_STATUS_NETWORK_CHANGE:
6763 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
6764 if (dlen < ofs + msg->rm_stbuflen ||
6765 msg->rm_stbuflen < sizeof(uint32_t)) {
6766 if_printf(sc->hn_ifp, "network changed\n");
6770 memcpy(&change, ((const uint8_t *)msg) + ofs,
6772 if_printf(sc->hn_ifp, "network changed, change %u\n",
6775 hn_change_network(sc);
6779 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
6786 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
6788 const struct rndis_pktinfo *pi = info_data;
6791 while (info_dlen != 0) {
6795 if (__predict_false(info_dlen < sizeof(*pi)))
6797 if (__predict_false(info_dlen < pi->rm_size))
6799 info_dlen -= pi->rm_size;
6801 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
6803 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
6805 dlen = pi->rm_size - pi->rm_pktinfooffset;
6808 switch (pi->rm_type) {
6809 case NDIS_PKTINFO_TYPE_VLAN:
6810 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
6812 info->vlan_info = *((const uint32_t *)data);
6813 mask |= HN_RXINFO_VLAN;
6816 case NDIS_PKTINFO_TYPE_CSUM:
6817 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
6819 info->csum_info = *((const uint32_t *)data);
6820 mask |= HN_RXINFO_CSUM;
6823 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
6824 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
6826 info->hash_value = *((const uint32_t *)data);
6827 mask |= HN_RXINFO_HASHVAL;
6830 case HN_NDIS_PKTINFO_TYPE_HASHINF:
6831 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
6833 info->hash_info = *((const uint32_t *)data);
6834 mask |= HN_RXINFO_HASHINF;
6841 if (mask == HN_RXINFO_ALL) {
6842 /* All found; done */
6846 pi = (const struct rndis_pktinfo *)
6847 ((const uint8_t *)pi + pi->rm_size);
6852 * - If there is no hash value, invalidate the hash info.
6854 if ((mask & HN_RXINFO_HASHVAL) == 0)
6855 info->hash_info = HN_NDIS_HASH_INFO_INVALID;
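/*
 * Worked example for hn_rndis_check_overlap() below (illustrative
 * offsets): regions [10, 15) and [12, 20), i.e. (10, 5, 12, 8), overlap
 * and yield true; regions [10, 12) and [12, 20) yield false, since
 * 10 + 2 <= 12.
 */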
6859 static __inline bool
6860 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
6863 if (off < check_off) {
6864 if (__predict_true(off + len <= check_off))
6866 } else if (off > check_off) {
6867 if (__predict_true(check_off + check_len <= off))
6874 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
6876 const struct rndis_packet_msg *pkt;
6877 struct hn_rxinfo info;
6878 int data_off, pktinfo_off, data_len, pktinfo_len;
6883 if (__predict_false(dlen < sizeof(*pkt))) {
6884 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
6889 if (__predict_false(dlen < pkt->rm_len)) {
6890 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
6891 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
6894 if (__predict_false(pkt->rm_len <
6895 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
6896 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
6897 "msglen %u, data %u, oob %u, pktinfo %u\n",
6898 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
6899 pkt->rm_pktinfolen);
6902 if (__predict_false(pkt->rm_datalen == 0)) {
6903 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
6910 #define IS_OFFSET_INVALID(ofs) \
6911 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
6912 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
6914 /* XXX Hyper-V does not meet data offset alignment requirement */
6915 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
6916 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6917 "data offset %u\n", pkt->rm_dataoffset);
6920 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
6921 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
6922 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6923 "oob offset %u\n", pkt->rm_oobdataoffset);
6926 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
6927 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
6928 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6929 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
6933 #undef IS_OFFSET_INVALID
6935 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
6936 data_len = pkt->rm_datalen;
6937 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
6938 pktinfo_len = pkt->rm_pktinfolen;
6941 * Check OOB coverage.
6943 if (__predict_false(pkt->rm_oobdatalen != 0)) {
6944 int oob_off, oob_len;
6946 if_printf(rxr->hn_ifp, "got oobdata\n");
6947 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
6948 oob_len = pkt->rm_oobdatalen;
6950 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
6951 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6952 "oob overflow, msglen %u, oob abs %d len %d\n",
6953 pkt->rm_len, oob_off, oob_len);
6958 * Check against data.
6960 if (hn_rndis_check_overlap(oob_off, oob_len,
6961 data_off, data_len)) {
6962 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6963 "oob overlaps data, oob abs %d len %d, "
6964 "data abs %d len %d\n",
6965 oob_off, oob_len, data_off, data_len);
6970 * Check against pktinfo.
6972 if (pktinfo_len != 0 &&
6973 hn_rndis_check_overlap(oob_off, oob_len,
6974 pktinfo_off, pktinfo_len)) {
6975 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6976 "oob overlaps pktinfo, oob abs %d len %d, "
6977 "pktinfo abs %d len %d\n",
6978 oob_off, oob_len, pktinfo_off, pktinfo_len);
6984 * Check per-packet-info coverage and find useful per-packet-info.
6986 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
6987 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
6988 info.hash_info = HN_NDIS_HASH_INFO_INVALID;
6989 if (__predict_true(pktinfo_len != 0)) {
6993 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
6994 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
6995 "pktinfo overflow, msglen %u, "
6996 "pktinfo abs %d len %d\n",
6997 pkt->rm_len, pktinfo_off, pktinfo_len);
7002 * Check packet info coverage.
7004 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
7005 data_off, data_len);
7006 if (__predict_false(overlap)) {
7007 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7008 "pktinfo overlap data, pktinfo abs %d len %d, "
7009 "data abs %d len %d\n",
7010 pktinfo_off, pktinfo_len, data_off, data_len);
7015 * Find useful per-packet-info.
7017 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
7018 pktinfo_len, &info);
7019 if (__predict_false(error)) {
7020 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
7026 if (__predict_false(data_off + data_len > pkt->rm_len)) {
7027 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
7028 "data overflow, msglen %u, data abs %d len %d\n",
7029 pkt->rm_len, data_off, data_len);
7032 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
7035 static __inline void
7036 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
7038 const struct rndis_msghdr *hdr;
7040 if (__predict_false(dlen < sizeof(*hdr))) {
7041 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
7046 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
7047 /* Hot data path. */
7048 hn_rndis_rx_data(rxr, data, dlen);
7053 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
7054 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
7056 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
7060 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
7062 const struct hn_nvs_hdr *hdr;
7064 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
7065 if_printf(sc->hn_ifp, "invalid nvs notify\n");
7068 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
7070 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
7071 /* Useless; ignore */
7074 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
7078 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
7079 const struct vmbus_chanpkt_hdr *pkt)
7081 struct hn_nvs_sendctx *sndc;
7083 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
7084 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
7085 VMBUS_CHANPKT_DATALEN(pkt));
7088 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
7094 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7095 const struct vmbus_chanpkt_hdr *pkthdr)
7097 const struct vmbus_chanpkt_rxbuf *pkt;
7098 const struct hn_nvs_hdr *nvs_hdr;
7101 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
7102 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
7105 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
7107 /* Make sure that this is a RNDIS message. */
7108 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
7109 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
7114 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
7115 if (__predict_false(hlen < sizeof(*pkt))) {
7116 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
7119 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
7121 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
7122 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
7127 count = pkt->cp_rxbuf_cnt;
7128 if (__predict_false(hlen <
7129 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
7130 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
7134 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
7135 for (i = 0; i < count; ++i) {
7138 ofs = pkt->cp_rxbuf[i].rb_ofs;
7139 len = pkt->cp_rxbuf[i].rb_len;
7140 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
7141 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
7142 "ofs %d, len %d\n", i, ofs, len);
7145 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
7149 * Ack the consumed RXBUF associated w/ this channel packet,
7150 * so that this RXBUF can be recycled by the hypervisor.
7152 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
7156 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
7159 struct hn_nvs_rndis_ack ack;
7162 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
7163 ack.nvs_status = HN_NVS_STATUS_OK;
7167 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
7168 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
7169 if (__predict_false(error == EAGAIN)) {
7172 * This should _not_ happen in the real world, since the
7173 * consumption of the TX bufring from the TX path is controlled.
7176 if (rxr->hn_ack_failed == 0)
7177 if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
7178 rxr->hn_ack_failed++;
7185 if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
7190 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
7192 struct hn_rx_ring *rxr = xrxr;
7193 struct hn_softc *sc = rxr->hn_ifp->if_softc;
7196 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
7199 pktlen = rxr->hn_pktbuf_len;
7200 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
7201 if (__predict_false(error == ENOBUFS)) {
7206 * Expand channel packet buffer.
7209 * Use M_WAITOK here, since an allocation failure at this point would be unrecoverable.
7212 nlen = rxr->hn_pktbuf_len * 2;
7213 while (nlen < pktlen)
7215 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
7217 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
7218 rxr->hn_pktbuf_len, nlen);
7220 free(rxr->hn_pktbuf, M_DEVBUF);
7221 rxr->hn_pktbuf = nbuf;
7222 rxr->hn_pktbuf_len = nlen;
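/*
 * Example (illustrative sizes): a 4KB hn_pktbuf confronted with a 9KB
 * channel packet is doubled 4KB -> 8KB -> 16KB before the receive is
 * retried.
 */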
7225 } else if (__predict_false(error == EAGAIN)) {
7226 /* No more channel packets; done! */
7229 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
7231 switch (pkt->cph_type) {
7232 case VMBUS_CHANPKT_TYPE_COMP:
7233 hn_nvs_handle_comp(sc, chan, pkt);
7236 case VMBUS_CHANPKT_TYPE_RXBUF:
7237 hn_nvs_handle_rxbuf(rxr, chan, pkt);
7240 case VMBUS_CHANPKT_TYPE_INBAND:
7241 hn_nvs_handle_notify(sc, pkt);
7245 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
7250 hn_chan_rollup(rxr, rxr->hn_txr);
7254 hn_sysinit(void *arg __unused)
7258 #ifdef HN_IFSTART_SUPPORT
7260 * Don't use ifnet.if_start if transparent VF mode is requested;
7261 * mainly due to the IFF_DRV_OACTIVE flag.
7263 if (hn_xpnt_vf && hn_use_if_start) {
7264 hn_use_if_start = 0;
7265 printf("hn: tranparent VF mode, if_transmit will be used, "
7266 "instead of if_start\n");
7269 if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
7270 printf("hn: invalid transparent VF attach routing "
7271 "wait timeout %d, reset to %d\n",
7272 hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
7273 hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
7277 * Initialize VF map.
7279 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
7280 hn_vfmap_size = HN_VFMAP_SIZE_DEF;
7281 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
7285 * Fix the # of TX taskqueues.
7287 if (hn_tx_taskq_cnt <= 0)
7288 hn_tx_taskq_cnt = 1;
7289 else if (hn_tx_taskq_cnt > mp_ncpus)
7290 hn_tx_taskq_cnt = mp_ncpus;
7293 * Fix the TX taskqueue mode.
7295 switch (hn_tx_taskq_mode) {
7296 case HN_TX_TASKQ_M_INDEP:
7297 case HN_TX_TASKQ_M_GLOBAL:
7298 case HN_TX_TASKQ_M_EVTTQ:
7301 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
7305 if (vm_guest != VM_GUEST_HV)
7308 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
7311 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
7312 M_DEVBUF, M_WAITOK);
7313 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
7314 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
7315 taskqueue_thread_enqueue, &hn_tx_taskque[i]);
7316 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
7320 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
7323 hn_sysuninit(void *arg __unused)
7326 if (hn_tx_taskque != NULL) {
7329 for (i = 0; i < hn_tx_taskq_cnt; ++i)
7330 taskqueue_free(hn_tx_taskque[i]);
7331 free(hn_tx_taskque, M_DEVBUF);
7334 if (hn_vfmap != NULL)
7335 free(hn_vfmap, M_DEVBUF);
7336 rm_destroy(&hn_vfmap_lock);
7338 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);