/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/ethernet.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"
#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

#define HN_VFMAP_SIZE_DEF		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1
#define HN_LOCK_INIT(sc)					\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)	sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)	sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)						\
do {								\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)		\
		DELAY(1000);					\
} while (0)
#define HN_UNLOCK(sc)		sx_xunlock(&(sc)->hn_lock)
#define HN_CSUM_IP_MASK		(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK	(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)						\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN +	\
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)						\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004
struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static void			hn_ifnet_event(void *, struct ifnet *, int);
static void			hn_ifaddr_event(void *, struct ifnet *);
static void			hn_ifnet_attevent(void *, struct ifnet *);
static void			hn_ifnet_detevent(void *, struct ifnet *);

static bool			hn_ismyvf(const struct hn_softc *,
				    const struct ifnet *);
static void			hn_rxvf_change(struct hn_softc *,
				    struct ifnet *, bool);
static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void			hn_rxvf_set_task(void *, int);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);
static void			hn_rndis_init_fixat(struct hn_softc *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);

static void			hn_stop(struct hn_softc *, bool);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_disable_rx(struct hn_softc *);
static void			hn_drain_rxtx(struct hn_softc *, int);
static void			hn_polling(struct hn_softc *, u_int);
static void			hn_chan_polling(struct vmbus_channel *, u_int);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
static int			hn_rxfilter_config(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on the host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust TCP segment verification on host side, "
    "when csum info is missing (global setting)");
/* Trust UDP datagram verification on the host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust UDP datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on the host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust IP packet verification on host side, "
    "when csum info is missing (global setting)");
/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif
/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vflist_sysctl, "A", "VF list");

SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vfmap_sysctl, "A", "VF mapping");
static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque; /* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;

static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif
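/*
 * Send an RNDIS packet using the per-ring GPA (guest physical address)
 * list; this is the non-chimney path, where the host reads the packet
 * directly from guest memory.
 */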
static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}
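/*
 * Send an RNDIS packet that has already been copied into one of the
 * chimney sending buffer slots; only the small NVS command is sent
 * over the channel.
 */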
static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
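/*
 * Allocate a chimney sending buffer slot: scan the allocation bitmap
 * for a clear bit and claim it with an atomic test-and-set, so no lock
 * is required.  Returns HN_NVS_CHIM_IDX_INVALID when all slots are in
 * use.
 */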
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}
static void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	int idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
#if defined(INET6) || defined(INET)
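/*
 * Make sure that at least 'len' bytes are contiguous in the first
 * mbuf; the chain is m_pullup'ed on demand, and 'm' is reassigned to
 * the (possibly new) chain head.  On pullup failure the enclosing
 * function returns NULL.
 */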
#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
}
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct ether_vlan_header *evl;
	const struct tcphdr *th;
	int ehlen;

	*tcpsyn = 0;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TCP) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);
		if (th->th_flags & TH_SYN)
			*tcpsyn = 1;
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP)
			return (m_head);

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));
		if (th->th_flags & TH_SYN)
			*tcpsyn = 1;
	}
#endif
	return (m_head);
}

#undef PULLUP_HDR

#endif	/* INET6 || INET */
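/*
 * Program the host-side RX filter through RNDIS; the cached filter
 * value is only updated when the host accepts the new setting.
 */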
static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}
static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	if ((ifp->if_flags & IFF_PROMISC) ||
	    (sc->hn_flags & HN_FLAG_RXVF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}
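/*
 * Compute the TX aggregation limits from the administrative tunables
 * and the host-offered RNDIS limits, then propagate the results to all
 * TX rings under their locks.
 */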
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
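/*
 * Depth of the software transmit queue: the tunable value, raised to
 * at least the number of TX descriptors on the ring.
 */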
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}

static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}
static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}
static void
hn_rxvf_set_task(void *xarg, int pending __unused)
{
	struct hn_rxvf_setarg *arg = xarg;

	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
}
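/*
 * Update the RX VF ifnet pointer of every RX ring.  For the rings that
 * are in use, the update is run as a task on the ring's channel, so it
 * is serialized with the RX path; idle rings are updated directly.
 */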
static void
hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
{
	struct hn_rx_ring *rxr;
	struct hn_rxvf_setarg arg;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			arg.rxr = rxr;
			arg.vf_ifp = vf_ifp;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_rxvf_ifp = vf_ifp;
		}
	}
}
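/*
 * A VF candidate must be a separate Ethernet ifnet, must not be a
 * vlan or lagg pseudo-interface, and must share this synthetic NIC's
 * link-level address.
 */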
static bool
hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
{
	const struct ifnet *hn_ifp;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		return (false);

	if (ifp->if_alloctype != IFT_ETHER)
		return (false);

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		return (false);

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		return (false);

	return (true);
}
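/*
 * Switch the datapath between the synthetic NIC and the VF: adjust the
 * RX filter, tell NVS which datapath to use, point the RX rings at the
 * VF (or back at the synthetic device), and suspend or resume the
 * link-status management accordingly.
 */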
static void
hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	if (!hn_ismyvf(sc, ifp))
		goto out;
	hn_ifp = sc->hn_ifp;

	if (rxvf) {
		if (sc->hn_flags & HN_FLAG_RXVF)
			goto out;

		sc->hn_flags |= HN_FLAG_RXVF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_RXVF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_RXVF;
		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTHETIC);

	hn_rxvf_set(sc, rxvf ? ifp : NULL);

	if (rxvf) {
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
	    rxvf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "datapath is switched %s %s\n",
		    rxvf ? "to" : "from", ifp->if_xname);
	}
out:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
{

	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;
	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
}

static void
hn_ifaddr_event(void *arg, struct ifnet *ifp)
{

	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
}
static void
hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    sc->hn_vf_ifp->if_xname);
		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	if (ifp->if_index >= hn_vfmap_size) {
		struct ifnet **newmap;
		int newsize;

		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);

		memcpy(newmap, hn_vfmap,
		    sizeof(struct ifnet *) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);
		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}
	KASSERT(hn_vfmap[ifp->if_index] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
	hn_vfmap[ifp->if_index] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	sc->hn_vf_ifp = ifp;
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (sc->hn_vf_ifp == NULL)
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	sc->hn_vf_ifp = NULL;

	rm_wlock(&hn_vfmap_lock);

	KASSERT(ifp->if_index < hn_vfmap_size,
	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
	if (hn_vfmap[ifp->if_index] != NULL) {
		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
		    ("%s: ifindex %d was mapped to %s",
		     ifp->if_xname, ifp->if_index,
		     hn_vfmap[ifp->if_index]->if_xname));
		hn_vfmap[ifp->if_index] = NULL;
	}

	rm_wunlock(&hn_vfmap_lock);
done:
	HN_UNLOCK(sc);
}
/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
	0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};
static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
	    &g_net_vsc_device_type) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}
static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);
	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions that will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if an error happens later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is the same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only a limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fixup TX stuffs after synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);
	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disable polling");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_vf_sysctl, "A", "Virtual Function's name");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */
#if __FreeBSD_version >= 1100099
	ifp->if_baudrate = IF_Gbps(10);
#else
	/* if_baudrate is 32bits on 32bit system. */
	ifp->if_baudrate = IF_Gbps(1);
#endif
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
#ifdef foo
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default; they can still be
	 * enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
	    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
	sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
	    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);

	/*
	 * NOTE:
	 * Subscribe to the ether_ifattach event instead of the ifnet_arrival
	 * event, since the interface's LLADDR is needed; the LLADDR is not
	 * yet available when the ifnet_arrival event is triggered.
	 */
	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);

	return (0);
failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}
static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;

	if (sc->hn_ifaddr_evthand != NULL)
		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
	if (sc->hn_ifnet_evthand != NULL)
		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
	if (sc->hn_ifnet_atthand != NULL) {
		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
		    sc->hn_ifnet_atthand);
	}
	if (sc->hn_ifnet_dethand != NULL) {
		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
		    sc->hn_ifnet_dethand);
	}

	vf_ifp = sc->hn_vf_ifp;
	__compiler_membar();
	if (vf_ifp != NULL)
		hn_ifnet_detevent(sc, vf_ifp);

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		/*
		 * In case that the vmbus missed the orphan handler
		 * installation.
		 */
		vmbus_xact_ctx_orphan(sc->hn_xact);
	}

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc, true);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * tasks have to be suspended manually here.
			 */
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);
		}
		HN_UNLOCK(sc);
		ether_ifdetach(ifp);
	}

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(sc->hn_tx_taskqs[i]);
		free(sc->hn_tx_taskqs, M_DEVBUF);
	}
	taskqueue_free(sc->hn_mgmt_taskq0);

	if (sc->hn_xact != NULL) {
		/*
		 * Uninstall the orphan handler _before_ the xact is
		 * destructed.
		 */
		vmbus_chan_unset_orphan(sc->hn_prichan);
		vmbus_xact_ctx_destroy(sc->hn_xact);
	}

	if_free(ifp);

	HN_LOCK_DESTROY(sc);
	return (0);
}

static int
hn_shutdown(device_t dev)
{

	return (0);
}
static void
hn_link_status(struct hn_softc *sc)
{
	uint32_t link_status;
	int error;

	error = hn_rndis_get_linkstatus(sc, &link_status);
	if (error) {
		/* XXX what to do? */
		return;
	}

	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
	else
		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp,
	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
	    LINK_STATE_UP : LINK_STATE_DOWN);
}
static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		return;
	hn_link_status(sc);
}
static void
hn_netchg_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Prevent any link status checks from running. */
	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;

	/*
	 * Fake up a [link down --> link up] state change; a 5 second
	 * delay is used, which closely simulates the miibus reaction
	 * to a link down event.
	 */
	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
	    &sc->hn_netchg_status, 5 * hz);
}
static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Re-allow link status checks. */
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
	hn_link_status(sc);
}
static void
hn_update_link_status(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}
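/*
 * Load the mbuf chain into the txdesc's DMA map; if the chain has too
 * many fragments for the map (EFBIG), collapse it once and retry.
 */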
static __inline int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m = *m_head;
	int error;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		struct mbuf *m_new;

		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		if (m_new == NULL)
			return ENOBUFS;
		else
			*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	}
	if (!error) {
		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
		    BUS_DMASYNC_PREWRITE);
		txd->flags |= HN_TXD_FLAG_DMAMAP;
	}
	return error;
}
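/*
 * Drop one reference on the txdesc.  The last reference releases any
 * aggregated txdescs, the chimney buffer slot or the DMA map, frees
 * the mbuf, and puts the txdesc back onto the free list (or buf_ring).
 * Returns non-zero if the txdesc was actually freed.
 */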
static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return 0;

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("recursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
	txr->hn_txdesc_avail++;
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else	/* HN_USE_TXDESC_BUFRING */
#ifdef HN_DEBUG
	atomic_add_int(&txr->hn_txdesc_avail, 1);
#endif
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif	/* !HN_USE_TXDESC_BUFRING */

	return 1;
}
static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	txd = SLIST_FIRST(&txr->hn_txlist);
	if (txd != NULL) {
		KASSERT(txr->hn_txdesc_avail > 0,
		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
		txr->hn_txdesc_avail--;
		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
	}
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

	if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
#ifdef HN_DEBUG
		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
#endif	/* HN_USE_TXDESC_BUFRING */
		KASSERT(txd->m == NULL && txd->refs == 0 &&
		    STAILQ_EMPTY(&txd->agg_list) &&
		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
		    txd->chim_size == 0 &&
		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
		txd->flags &= ~HN_TXD_FLAG_ONLIST;
		txd->refs = 1;
	}
	return txd;
}
static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

	/* 0->1 transition will never work */
	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	atomic_add_int(&txd->refs, 1);
}

static __inline void
hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
{

	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("recursive aggregation on aggregating txdesc"));

	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("already aggregated"));
	KASSERT(STAILQ_EMPTY(&txd->agg_list),
	    ("recursive aggregation on to-be-aggregated txdesc"));

	txd->flags |= HN_TXD_FLAG_ONAGG;
	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
}
static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
	bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
		pending = true;
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	if (!buf_ring_full(txr->hn_txdesc_br))
		pending = true;
#endif
	return (pending);
}

static __inline void
hn_txeof(struct hn_tx_ring *txr)
{
	txr->hn_has_txeof = 0;
	txr->hn_txeof(txr);
}
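/*
 * NVS completion callback for a transmitted packet: release the txdesc
 * and, once enough completions have accumulated while the ring is
 * oactive, run the transmit-done processing early.
 */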
static void
hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
	struct hn_txdesc *txd = sndc->hn_cbarg;
	struct hn_tx_ring *txr;

	txr = txd->txr;
	KASSERT(txr->hn_chan == chan,
	    ("channel mismatch, on chan%u, should be chan%u",
	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));

	txr->hn_has_txeof = 1;
	hn_txdesc_put(txr, txd);

	++txr->hn_txdone_cnt;
	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
		txr->hn_txdone_cnt = 0;
		if (txr->hn_oactive)
			hn_txeof(txr);
	}
}
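/*
 * Per-channel RX/TX rollup: flush the TCP streams held by LRO and run
 * the transmit-done processing if this channel's TX ring saw any
 * completions.
 */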
static void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
	struct lro_ctrl *lro = &rxr->hn_lro;
	struct lro_entry *queued;

	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
		SLIST_REMOVE_HEAD(&lro->lro_active, next);
		tcp_lro_flush(lro, queued);
	}
#endif

	/*
	 * NOTE:
	 * 'txr' could be NULL, if multiple channels and
	 * ifnet.if_start method are enabled.
	 */
	if (txr == NULL || !txr->hn_has_txeof)
		return;

	txr->hn_txdone_cnt = 0;
	hn_txeof(txr);
}
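/*
 * Convert an offset that counts from the beginning of the RNDIS packet
 * message into the on-wire form, which counts from the rm_dataoffset
 * field.
 */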
static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
	    ("invalid RNDIS packet msg offset %u", ofs));
	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}
static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
{
	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
	struct rndis_pktinfo *pi;

	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

	/*
	 * Per-packet-info does not move; it only grows.
	 *
	 * NOTE:
	 * rm_pktinfooffset in this phase counts from the beginning
	 * of rndis_packet_msg.
	 */
	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
	    pkt->rm_pktinfolen);
	pkt->rm_pktinfolen += pi_size;

	pi->rm_size = pi_size;
	pi->rm_type = pi_type;
	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

	return (pi->rm_data);
}
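/*
 * Send the aggregating txdesc; on failure the whole batch is charged
 * to oerrors.  The ring's aggregation state is reset either way.
 */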
static __inline int
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;
	struct mbuf *m;
	int error, pkts;

	txd = txr->hn_agg_txd;
	KASSERT(txd != NULL, ("no aggregate txdesc"));

	/*
	 * Since hn_txpkt() will reset this temporary stat, save
	 * it now, so that oerrors can be updated properly, if
	 * hn_txpkt() ever fails.
	 */
	pkts = txr->hn_stat_pkts;

	/*
	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
	 * failure, save it for later freeing, if hn_txpkt() ever
	 * fails.
	 */
	m = txd->m;
	error = hn_txpkt(ifp, txr, txd);
	if (__predict_false(error)) {
		/* txd is freed, but m is not. */
		m_freem(m);

		txr->hn_flush_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
	}

	/* Reset all aggregation states. */
	txr->hn_agg_txd = NULL;
	txr->hn_agg_szleft = 0;
	txr->hn_agg_pktleft = 0;
	txr->hn_agg_prevpkt = NULL;

	return (error);
}
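/*
 * Try to use chimney sending for this packet: either append it to the
 * currently aggregating txdesc, or allocate a fresh chimney buffer
 * slot (possibly starting a new aggregation).  Returns a pointer into
 * the chimney buffer where the RNDIS packet message should be built,
 * or NULL if chimney sending cannot be used.
 */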
static void *
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    int pktsize)
{
	void *chim;

	if (txr->hn_agg_txd != NULL) {
		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
			int olen;

			/*
			 * Update the previous RNDIS packet's total length;
			 * it can be increased due to the mandatory alignment
			 * padding for this RNDIS packet.  And update the
			 * aggregating txdesc's chimney sending buffer size
			 * accordingly.
			 *
			 * XXX
			 * Zero-out the padding, as required by the RNDIS spec.
			 */
			olen = pkt->rm_len;
			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
			agg_txd->chim_size += pkt->rm_len - olen;

			/* Link this txdesc to the parent. */
			hn_txdesc_agg(agg_txd, txd);

			chim = (uint8_t *)pkt + pkt->rm_len;
			/* Save the current packet for later fixup. */
			txr->hn_agg_prevpkt = chim;

			txr->hn_agg_pktleft--;
			txr->hn_agg_szleft -= pktsize;
			if (txr->hn_agg_szleft <=
			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
				/*
				 * Probably can't aggregate more packets,
				 * flush this aggregating txdesc proactively.
				 */
				txr->hn_agg_pktleft = 0;
			}
			/* Done! */
			return (chim);
		}
		hn_flush_txagg(ifp, txr);
	}
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	txr->hn_tx_chimney_tried++;
	txd->chim_index = hn_chim_alloc(txr->hn_sc);
	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
		return (NULL);
	txr->hn_tx_chimney++;

	chim = txr->hn_sc->hn_chim +
	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);

	if (txr->hn_agg_pktmax > 1 &&
	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
		txr->hn_agg_txd = txd;
		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
		txr->hn_agg_prevpkt = chim;
	}
	return (chim);
}
2101 * If this function fails, then both txd and m_head0 will be freed.
2104 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2105 struct mbuf **m_head0)
2107 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
2108 int error, nsegs, i;
2109 struct mbuf *m_head = *m_head0;
2110 struct rndis_packet_msg *pkt;
2113 int pkt_hlen, pkt_size;
2115 pkt = txd->rndis_pkt;
2116 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
2117 if (pkt_size < txr->hn_chim_size) {
2118 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
2122 if (txr->hn_agg_txd != NULL)
2123 hn_flush_txagg(ifp, txr);
2126 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
2127 pkt->rm_len = m_head->m_pkthdr.len;
2128 pkt->rm_dataoffset = 0;
2129 pkt->rm_datalen = m_head->m_pkthdr.len;
2130 pkt->rm_oobdataoffset = 0;
2131 pkt->rm_oobdatalen = 0;
2132 pkt->rm_oobdataelements = 0;
2133 pkt->rm_pktinfooffset = sizeof(*pkt);
2134 pkt->rm_pktinfolen = 0;
2135 pkt->rm_vchandle = 0;
2136 pkt->rm_reserved = 0;
2138 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
2140 * Set the hash value for this packet, so that the host could
2141 * dispatch the TX done event for this packet back to this TX
2144 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2145 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
2146 *pi_data = txr->hn_tx_idx;
2149 if (m_head->m_flags & M_VLANTAG) {
2150 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2151 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
2152 *pi_data = NDIS_VLAN_INFO_MAKE(
2153 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
2154 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
2155 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
2158 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
2159 #if defined(INET6) || defined(INET)
2160 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2161 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
2163 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
2164 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
2165 m_head->m_pkthdr.tso_segsz);
2168 #if defined(INET6) && defined(INET)
2173 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
2174 m_head->m_pkthdr.tso_segsz);
2177 #endif /* INET6 || INET */
2178 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
2179 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2180 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
2181 if (m_head->m_pkthdr.csum_flags &
2182 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
2183 *pi_data = NDIS_TXCSUM_INFO_IPV6;
2185 *pi_data = NDIS_TXCSUM_INFO_IPV4;
2186 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
2187 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
2190 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
2191 *pi_data |= NDIS_TXCSUM_INFO_TCPCS;
2192 else if (m_head->m_pkthdr.csum_flags &
2193 (CSUM_IP_UDP | CSUM_IP6_UDP))
2194 *pi_data |= NDIS_TXCSUM_INFO_UDPCS;
2197 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
2198 /* Fixup RNDIS packet message total length */
2199 pkt->rm_len += pkt_hlen;
2200 /* Convert RNDIS packet message offsets */
2201 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
2202 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
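/*
 * Sketch of the offset convention assumed above: per RNDIS, rm_dataoffset
 * and rm_pktinfooffset are carried relative to the start of the
 * rm_dataoffset field (i.e. past rm_type and rm_len), so
 * hn_rndis_pktmsg_offset() is expected to subtract
 * __offsetof(struct rndis_packet_msg, rm_dataoffset) from the absolute
 * in-message offsets computed so far.
 */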
2205 * Fast path: Chimney sending.
2208 struct hn_txdesc *tgt_txd = txd;
2210 if (txr->hn_agg_txd != NULL) {
2211 tgt_txd = txr->hn_agg_txd;
2217 KASSERT(pkt == chim,
2218 ("RNDIS pkt not in chimney sending buffer"));
2219 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
2220 ("chimney sending buffer is not used"));
2221 tgt_txd->chim_size += pkt->rm_len;
2223 m_copydata(m_head, 0, m_head->m_pkthdr.len,
2224 ((uint8_t *)chim) + pkt_hlen);
2226 txr->hn_gpa_cnt = 0;
2227 txr->hn_sendpkt = hn_txpkt_chim;
2231 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
2232 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2233 ("chimney buffer is used"));
2234 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
2236 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
2237 if (__predict_false(error)) {
2241 * This mbuf is not linked w/ the txd yet, so free it now.
2246 freed = hn_txdesc_put(txr, txd);
2248 ("fail to free txd upon txdma error"));
2250 txr->hn_txdma_failed++;
2251 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2256 /* +1 RNDIS packet message */
2257 txr->hn_gpa_cnt = nsegs + 1;
2259 /* send packet with page buffer */
2260 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
2261 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
2262 txr->hn_gpa[0].gpa_len = pkt_hlen;
2265 * Fill the page buffers with mbuf info after the page
2266 * buffer for RNDIS packet message.
2268 for (i = 0; i < nsegs; ++i) {
2269 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
2271 gpa->gpa_page = atop(segs[i].ds_addr);
2272 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
2273 gpa->gpa_len = segs[i].ds_len;
2276 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2278 txr->hn_sendpkt = hn_txpkt_sglist;
2282 /* Set the completion routine */
2283 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
2285 /* Update temporary stats for later use. */
2286 txr->hn_stat_pkts++;
2287 txr->hn_stat_size += m_head->m_pkthdr.len;
2288 if (m_head->m_flags & M_MCAST)
2289 txr->hn_stat_mcasts++;
2296 * If this function fails, then txd will be freed, but the mbuf
2297 * associated w/ the txd will _not_ be freed.
2300 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
2302 int error, send_failed = 0, has_bpf;
2305 has_bpf = bpf_peers_present(ifp->if_bpf);
2308 * Make sure that this txd and any aggregated txds are not
2309 * freed before ETHER_BPF_MTAP.
2311 hn_txdesc_hold(txd);
2313 error = txr->hn_sendpkt(txr, txd);
2316 const struct hn_txdesc *tmp_txd;
2318 ETHER_BPF_MTAP(ifp, txd->m);
2319 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
2320 ETHER_BPF_MTAP(ifp, tmp_txd->m);
2323 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
2324 #ifdef HN_IFSTART_SUPPORT
2325 if (!hn_use_if_start)
2328 if_inc_counter(ifp, IFCOUNTER_OBYTES,
2330 if (txr->hn_stat_mcasts != 0) {
2331 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
2332 txr->hn_stat_mcasts);
2335 txr->hn_pkts += txr->hn_stat_pkts;
2339 hn_txdesc_put(txr, txd);
2341 if (__predict_false(error)) {
2345 * This should "really rarely" happen.
2347 * XXX Too many RX to be acked or too many sideband
2348 * commands to run? Ask netvsc_channel_rollup()
2349 * to kick start later.
2351 txr->hn_has_txeof = 1;
2353 txr->hn_send_failed++;
2356 * Try sending again after setting hn_has_txeof,
2357 * in case we missed the last
2358 * netvsc_channel_rollup().
2362 if_printf(ifp, "send failed\n");
2365 * Caller will perform further processing on the
2366 * associated mbuf, so don't free it in hn_txdesc_put();
2367 * only unload it from the DMA map in hn_txdesc_put(),
2368 * if it was loaded.
2371 freed = hn_txdesc_put(txr, txd);
2373 ("fail to free txd upon send error"));
2375 txr->hn_send_failed++;
2378 /* Reset temporary stats after this sending is done. */
2379 txr->hn_stat_size = 0;
2380 txr->hn_stat_pkts = 0;
2381 txr->hn_stat_mcasts = 0;
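/*
 * Reference lifecycle sketch for the send path above (an inference from
 * the hold/put pairing, not a quote from the source):
 *
 *	txd = hn_txdesc_get(txr);	// refs == 1
 *	hn_txdesc_hold(txd);		// refs == 2, survive BPF taps
 *	txr->hn_sendpkt(txr, txd);	// send-done may fire any time
 *	hn_txdesc_put(txr, txd);	// drop our ref; the completion
 *					// callback drops the last one
 */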
2387 * Append the specified data to the indicated mbuf chain.
2388 * Extend the mbuf chain if the new data does not fit in
2389 * existing space.
2391 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2392 * There should be an equivalent in the kernel mbuf code,
2393 * but there does not appear to be one yet.
2395 * Differs from m_append() in that additional mbufs are
2396 * allocated with cluster size MJUMPAGESIZE, and filled
2397 * accordingly.
2399 * Return 1 if able to complete the job; otherwise 0.
2402 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2405 int remainder, space;
2407 for (m = m0; m->m_next != NULL; m = m->m_next)
2410 space = M_TRAILINGSPACE(m);
2413 * Copy into available space.
2415 if (space > remainder)
2417 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2422 while (remainder > 0) {
2424 * Allocate a new mbuf; could check space
2425 * and allocate a cluster instead.
2427 n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
2430 n->m_len = min(MJUMPAGESIZE, remainder);
2431 bcopy(cp, mtod(n, caddr_t), n->m_len);
2433 remainder -= n->m_len;
2437 if (m0->m_flags & M_PKTHDR)
2438 m0->m_pkthdr.len += len - remainder;
2440 return (remainder == 0);
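/*
 * Usage sketch (illustrative): appending a 6000-byte payload to a fresh
 * MJUMPAGESIZE mbuf fills its trailing space first and then chains one
 * more MJUMPAGESIZE cluster for the remainder. A caller may check the
 * return value, e.g.:
 *
 *	if (!hv_m_append(m_new, dlen, data))	// 0 == allocation failure
 *		m_freem(m_new);
 */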
2443 #if defined(INET) || defined(INET6)
2445 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
2447 #if __FreeBSD_version >= 1100095
2448 if (hn_lro_mbufq_depth) {
2449 tcp_lro_queue_mbuf(lc, m);
2453 return tcp_lro_rx(lc, m, 0);
2458 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2459 const struct hn_rxinfo *info)
2463 int size, do_lro = 0, do_csum = 1;
2464 int hash_type = M_HASHTYPE_OPAQUE;
2466 /* If the VF is active, inject the packet through the VF */
2467 ifp = rxr->hn_rxvf_ifp ? rxr->hn_rxvf_ifp : rxr->hn_ifp;
2469 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
2472 * See the NOTE of hn_rndis_init_fixat(). This
2473 * function can be reached immediately after the
2474 * RNDIS is initialized but before the ifnet is
2475 * setup on the hn_attach() path; drop the unexpected
2476 * packets.
2481 if (dlen <= MHLEN) {
2482 m_new = m_gethdr(M_NOWAIT, MT_DATA);
2483 if (m_new == NULL) {
2484 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2487 memcpy(mtod(m_new, void *), data, dlen);
2488 m_new->m_pkthdr.len = m_new->m_len = dlen;
2489 rxr->hn_small_pkts++;
2492 * Get an mbuf with a cluster. For packets 2K or less,
2493 * get a standard 2K cluster. For anything larger, get a
2494 * 4K cluster. Any buffers larger than 4K can cause problems
2495 * if looped around to the Hyper-V TX channel, so avoid them.
2498 if (dlen > MCLBYTES) {
2500 size = MJUMPAGESIZE;
2503 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2504 if (m_new == NULL) {
2505 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2509 hv_m_append(m_new, dlen, data);
2511 m_new->m_pkthdr.rcvif = ifp;
2513 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2516 /* receive side checksum offload */
2517 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2518 /* IP csum offload */
2519 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2520 m_new->m_pkthdr.csum_flags |=
2521 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2525 /* TCP/UDP csum offload */
2526 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2527 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2528 m_new->m_pkthdr.csum_flags |=
2529 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2530 m_new->m_pkthdr.csum_data = 0xffff;
2531 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2539 * As of this writing (Oct 28th, 2016), the host side will turn
2540 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2541 * the do_lro setting here is actually _not_ accurate. We
2542 * depend on the RSS hash type check to reset do_lro.
2544 if ((info->csum_info &
2545 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2546 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2549 const struct ether_header *eh;
2554 if (m_new->m_len < hoff)
2556 eh = mtod(m_new, struct ether_header *);
2557 etype = ntohs(eh->ether_type);
2558 if (etype == ETHERTYPE_VLAN) {
2559 const struct ether_vlan_header *evl;
2561 hoff = sizeof(*evl);
2562 if (m_new->m_len < hoff)
2564 evl = mtod(m_new, struct ether_vlan_header *);
2565 etype = ntohs(evl->evl_proto);
2568 if (etype == ETHERTYPE_IP) {
2571 pr = hn_check_iplen(m_new, hoff);
2572 if (pr == IPPROTO_TCP) {
2574 (rxr->hn_trust_hcsum &
2575 HN_TRUST_HCSUM_TCP)) {
2576 rxr->hn_csum_trusted++;
2577 m_new->m_pkthdr.csum_flags |=
2578 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2579 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2580 m_new->m_pkthdr.csum_data = 0xffff;
2583 } else if (pr == IPPROTO_UDP) {
2585 (rxr->hn_trust_hcsum &
2586 HN_TRUST_HCSUM_UDP)) {
2587 rxr->hn_csum_trusted++;
2588 m_new->m_pkthdr.csum_flags |=
2589 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2590 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2591 m_new->m_pkthdr.csum_data = 0xffff;
2593 } else if (pr != IPPROTO_DONE && do_csum &&
2594 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2595 rxr->hn_csum_trusted++;
2596 m_new->m_pkthdr.csum_flags |=
2597 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2602 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2603 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2604 NDIS_VLAN_INFO_ID(info->vlan_info),
2605 NDIS_VLAN_INFO_PRI(info->vlan_info),
2606 NDIS_VLAN_INFO_CFI(info->vlan_info));
2607 m_new->m_flags |= M_VLANTAG;
2610 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2612 m_new->m_pkthdr.flowid = info->hash_value;
2613 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2614 NDIS_HASH_FUNCTION_TOEPLITZ) {
2615 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2619 * do_lro is reset if the hash types are not TCP
2620 * related. See the comment in the above csum_flags
2621 * setup section.
2624 case NDIS_HASH_IPV4:
2625 hash_type = M_HASHTYPE_RSS_IPV4;
2629 case NDIS_HASH_TCP_IPV4:
2630 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2633 case NDIS_HASH_IPV6:
2634 hash_type = M_HASHTYPE_RSS_IPV6;
2638 case NDIS_HASH_IPV6_EX:
2639 hash_type = M_HASHTYPE_RSS_IPV6_EX;
2643 case NDIS_HASH_TCP_IPV6:
2644 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2647 case NDIS_HASH_TCP_IPV6_EX:
2648 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2653 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2655 M_HASHTYPE_SET(m_new, hash_type);
2658 * Note: Moved RX completion back to hv_nv_on_receive() so all
2659 * messages (not just data messages) will trigger a response.
2665 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2666 #if defined(INET) || defined(INET6)
2667 struct lro_ctrl *lro = &rxr->hn_lro;
2670 rxr->hn_lro_tried++;
2671 if (hn_lro_rx(lro, m_new) == 0) {
2679 /* We're not holding the lock here, so don't release it */
2680 (*ifp->if_input)(ifp, m_new);
2686 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2688 struct hn_softc *sc = ifp->if_softc;
2689 struct ifreq *ifr = (struct ifreq *)data;
2690 int mask, error = 0;
2694 if (ifr->ifr_mtu > HN_MTU_MAX) {
2701 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2706 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2707 /* Can't change MTU */
2713 if (ifp->if_mtu == ifr->ifr_mtu) {
2719 * Suspend this interface before the synthetic parts
2720 * are ripped off.
2725 * Detach the synthetic parts, i.e. NVS and RNDIS.
2727 hn_synth_detach(sc);
2730 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2731 * with the new MTU setting.
2733 error = hn_synth_attach(sc, ifr->ifr_mtu);
2740 * Commit the requested MTU after the synthetic parts
2741 * have been successfully attached.
2743 ifp->if_mtu = ifr->ifr_mtu;
2746 * Make sure that various parameters based on MTU are
2747 * still valid after the MTU change.
2749 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2750 hn_set_chim_size(sc, sc->hn_chim_szmax);
2751 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2752 #if __FreeBSD_version >= 1100099
2753 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2754 HN_LRO_LENLIM_MIN(ifp))
2755 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2759 * All done! Resume the interface now.
2769 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2774 if (ifp->if_flags & IFF_UP) {
2775 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2777 * Caller might hold a mutex, e.g.
2778 * bpf; use busy-wait for the RNDIS
2779 * reply.
2782 hn_rxfilter_config(sc);
2788 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2791 sc->hn_if_flags = ifp->if_flags;
2798 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2800 if (mask & IFCAP_TXCSUM) {
2801 ifp->if_capenable ^= IFCAP_TXCSUM;
2802 if (ifp->if_capenable & IFCAP_TXCSUM)
2803 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2805 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2807 if (mask & IFCAP_TXCSUM_IPV6) {
2808 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2809 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2810 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2812 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2815 /* TODO: flip RNDIS offload parameters for RXCSUM. */
2816 if (mask & IFCAP_RXCSUM)
2817 ifp->if_capenable ^= IFCAP_RXCSUM;
2819 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
2820 if (mask & IFCAP_RXCSUM_IPV6)
2821 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2824 if (mask & IFCAP_LRO)
2825 ifp->if_capenable ^= IFCAP_LRO;
2827 if (mask & IFCAP_TSO4) {
2828 ifp->if_capenable ^= IFCAP_TSO4;
2829 if (ifp->if_capenable & IFCAP_TSO4)
2830 ifp->if_hwassist |= CSUM_IP_TSO;
2832 ifp->if_hwassist &= ~CSUM_IP_TSO;
2834 if (mask & IFCAP_TSO6) {
2835 ifp->if_capenable ^= IFCAP_TSO6;
2836 if (ifp->if_capenable & IFCAP_TSO6)
2837 ifp->if_hwassist |= CSUM_IP6_TSO;
2839 ifp->if_hwassist &= ~CSUM_IP6_TSO;
2849 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2853 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2855 * Multicast uses a mutex; use busy-wait for
2856 * the RNDIS reply.
2859 hn_rxfilter_config(sc);
2868 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2872 error = ether_ioctl(ifp, cmd, data);
2879 hn_stop(struct hn_softc *sc, bool detaching)
2881 struct ifnet *ifp = sc->hn_ifp;
2886 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2887 ("synthetic parts were not attached"));
2889 /* Disable polling. */
2892 /* Clear RUNNING bit _before_ hn_suspend_data() */
2893 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2894 hn_suspend_data(sc);
2896 /* Clear OACTIVE bit. */
2897 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2898 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2899 sc->hn_tx_ring[i].hn_oactive = 0;
2902 * If the VF is active, make sure the filter is not 0, even if
2903 * the synthetic NIC is down.
2905 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
2906 hn_rxfilter_config(sc);
2910 hn_init_locked(struct hn_softc *sc)
2912 struct ifnet *ifp = sc->hn_ifp;
2917 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2920 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2923 /* Configure RX filter */
2924 hn_rxfilter_config(sc);
2926 /* Clear OACTIVE bit. */
2927 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2928 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2929 sc->hn_tx_ring[i].hn_oactive = 0;
2931 /* Clear TX 'suspended' bit. */
2932 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2934 /* Everything is ready; unleash! */
2935 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2937 /* Re-enable polling if requested. */
2938 if (sc->hn_pollhz > 0)
2939 hn_polling(sc, sc->hn_pollhz);
2945 struct hn_softc *sc = xsc;
2952 #if __FreeBSD_version >= 1100099
2955 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2957 struct hn_softc *sc = arg1;
2958 unsigned int lenlim;
2961 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2962 error = sysctl_handle_int(oidp, &lenlim, 0, req);
2963 if (error || req->newptr == NULL)
2967 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2968 lenlim > TCP_LRO_LENGTH_MAX) {
2972 hn_set_lro_lenlim(sc, lenlim);
2979 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2981 struct hn_softc *sc = arg1;
2982 int ackcnt, error, i;
2985 * lro_ackcnt_lim is append count limit,
2986 * +1 to turn it into aggregation limit.
2988 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2989 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2990 if (error || req->newptr == NULL)
2993 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2997 * Convert aggregation limit back to append
2998 * count limit.
3002 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
3003 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
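/*
 * Worked example (illustrative): lro_ackcnt_lim == 1 permits one ACK to
 * be appended to a previously seen ACK, i.e. at most 2 ACKs aggregated;
 * this sysctl therefore reads back 2, and writing 2 stores an append
 * limit of 1 on every RX ring.
 */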
3011 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
3013 struct hn_softc *sc = arg1;
3018 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
3021 error = sysctl_handle_int(oidp, &on, 0, req);
3022 if (error || req->newptr == NULL)
3026 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3027 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3030 rxr->hn_trust_hcsum |= hcsum;
3032 rxr->hn_trust_hcsum &= ~hcsum;
3039 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
3041 struct hn_softc *sc = arg1;
3042 int chim_size, error;
3044 chim_size = sc->hn_tx_ring[0].hn_chim_size;
3045 error = sysctl_handle_int(oidp, &chim_size, 0, req);
3046 if (error || req->newptr == NULL)
3049 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
3053 hn_set_chim_size(sc, chim_size);
3058 #if __FreeBSD_version < 1100095
3060 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
3062 struct hn_softc *sc = arg1;
3063 int ofs = arg2, i, error;
3064 struct hn_rx_ring *rxr;
3068 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
3069 rxr = &sc->hn_rx_ring[i];
3070 stat += *((int *)((uint8_t *)rxr + ofs));
3073 error = sysctl_handle_64(oidp, &stat, 0, req);
3074 if (error || req->newptr == NULL)
3077 /* Zero out this stat. */
3078 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
3079 rxr = &sc->hn_rx_ring[i];
3080 *((int *)((uint8_t *)rxr + ofs)) = 0;
3086 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
3088 struct hn_softc *sc = arg1;
3089 int ofs = arg2, i, error;
3090 struct hn_rx_ring *rxr;
3094 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3095 rxr = &sc->hn_rx_ring[i];
3096 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
3099 error = sysctl_handle_64(oidp, &stat, 0, req);
3100 if (error || req->newptr == NULL)
3103 /* Zero out this stat. */
3104 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3105 rxr = &sc->hn_rx_ring[i];
3106 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
3114 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3116 struct hn_softc *sc = arg1;
3117 int ofs = arg2, i, error;
3118 struct hn_rx_ring *rxr;
3122 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3123 rxr = &sc->hn_rx_ring[i];
3124 stat += *((u_long *)((uint8_t *)rxr + ofs));
3127 error = sysctl_handle_long(oidp, &stat, 0, req);
3128 if (error || req->newptr == NULL)
3131 /* Zero out this stat. */
3132 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3133 rxr = &sc->hn_rx_ring[i];
3134 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
3140 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3142 struct hn_softc *sc = arg1;
3143 int ofs = arg2, i, error;
3144 struct hn_tx_ring *txr;
3148 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3149 txr = &sc->hn_tx_ring[i];
3150 stat += *((u_long *)((uint8_t *)txr + ofs));
3153 error = sysctl_handle_long(oidp, &stat, 0, req);
3154 if (error || req->newptr == NULL)
3157 /* Zero out this stat. */
3158 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3159 txr = &sc->hn_tx_ring[i];
3160 *((u_long *)((uint8_t *)txr + ofs)) = 0;
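/*
 * Convention shared by the per-ring stat handlers above: a read returns
 * the sum of the field at byte offset 'ofs' across all rings, while any
 * write resets that field on every ring, which is why these stat
 * sysctls are exported read-write.
 */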
3166 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
3168 struct hn_softc *sc = arg1;
3169 int ofs = arg2, i, error, conf;
3170 struct hn_tx_ring *txr;
3172 txr = &sc->hn_tx_ring[0];
3173 conf = *((int *)((uint8_t *)txr + ofs));
3175 error = sysctl_handle_int(oidp, &conf, 0, req);
3176 if (error || req->newptr == NULL)
3180 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3181 txr = &sc->hn_tx_ring[i];
3182 *((int *)((uint8_t *)txr + ofs)) = conf;
3190 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
3192 struct hn_softc *sc = arg1;
3195 size = sc->hn_agg_size;
3196 error = sysctl_handle_int(oidp, &size, 0, req);
3197 if (error || req->newptr == NULL)
3201 sc->hn_agg_size = size;
3209 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
3211 struct hn_softc *sc = arg1;
3214 pkts = sc->hn_agg_pkts;
3215 error = sysctl_handle_int(oidp, &pkts, 0, req);
3216 if (error || req->newptr == NULL)
3220 sc->hn_agg_pkts = pkts;
3228 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
3230 struct hn_softc *sc = arg1;
3233 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
3234 return (sysctl_handle_int(oidp, &pkts, 0, req));
3238 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
3240 struct hn_softc *sc = arg1;
3243 align = sc->hn_tx_ring[0].hn_agg_align;
3244 return (sysctl_handle_int(oidp, &align, 0, req));
3248 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
3251 vmbus_chan_poll_disable(chan);
3253 vmbus_chan_poll_enable(chan, pollhz);
3257 hn_polling(struct hn_softc *sc, u_int pollhz)
3259 int nsubch = sc->hn_rx_ring_inuse - 1;
3264 struct vmbus_channel **subch;
3267 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
3268 for (i = 0; i < nsubch; ++i)
3269 hn_chan_polling(subch[i], pollhz);
3270 vmbus_subchan_rel(subch, nsubch);
3272 hn_chan_polling(sc->hn_prichan, pollhz);
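/*
 * Note (inferred from hn_chan_polling() above): pollhz == 0 switches a
 * channel back to interrupt-driven operation, while a non-zero value
 * polls it at the given frequency; sub-channels are updated before the
 * primary channel.
 */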
3276 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
3278 struct hn_softc *sc = arg1;
3281 pollhz = sc->hn_pollhz;
3282 error = sysctl_handle_int(oidp, &pollhz, 0, req);
3283 if (error || req->newptr == NULL)
3287 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
3291 if (sc->hn_pollhz != pollhz) {
3292 sc->hn_pollhz = pollhz;
3293 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
3294 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
3295 hn_polling(sc, sc->hn_pollhz);
3303 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
3305 struct hn_softc *sc = arg1;
3308 snprintf(verstr, sizeof(verstr), "%u.%u",
3309 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
3310 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
3311 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
3315 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
3317 struct hn_softc *sc = arg1;
3324 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
3325 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
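/*
 * The "%b" handlers here and below use the kernel printf bit-field
 * notation; e.g. with illustrative values:
 *
 *	snprintf(buf, sizeof(buf), "%b", 0x5, "\20\1VLAN\2MTU\3IPCS");
 *	// buf == "5<VLAN,IPCS>"
 */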
3329 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
3331 struct hn_softc *sc = arg1;
3332 char assist_str[128];
3336 hwassist = sc->hn_ifp->if_hwassist;
3338 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
3339 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
3343 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
3345 struct hn_softc *sc = arg1;
3346 char filter_str[128];
3350 filter = sc->hn_rx_filter;
3352 snprintf(filter_str, sizeof(filter_str), "%b", filter,
3354 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
3358 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
3360 struct hn_softc *sc = arg1;
3365 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3366 if (error || req->newptr == NULL)
3369 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3372 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3374 if (sc->hn_rx_ring_inuse > 1) {
3375 error = hn_rss_reconfig(sc);
3377 /* Not RSS capable, at least for now; just save the RSS key. */
3386 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
3388 struct hn_softc *sc = arg1;
3393 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3394 if (error || req->newptr == NULL)
3398 * Don't allow RSS indirect table changes if this interface is not
3399 * currently RSS capable.
3401 if (sc->hn_rx_ring_inuse == 1) {
3406 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3409 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3411 hn_rss_ind_fixup(sc);
3412 error = hn_rss_reconfig(sc);
3419 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
3421 struct hn_softc *sc = arg1;
3426 hash = sc->hn_rss_hash;
3428 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
3429 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
3433 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
3435 struct hn_softc *sc = arg1;
3436 char vf_name[IFNAMSIZ + 1];
3437 struct ifnet *vf_ifp;
3441 vf_ifp = sc->hn_vf_ifp;
3443 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
3445 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
3449 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
3451 struct hn_softc *sc = arg1;
3452 char vf_name[IFNAMSIZ + 1];
3453 struct ifnet *vf_ifp;
3457 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
3459 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
3461 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
3465 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
3467 struct rm_priotracker pt;
3472 error = sysctl_wire_old_buffer(req, 0);
3476 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
3480 rm_rlock(&hn_vfmap_lock, &pt);
3483 for (i = 0; i < hn_vfmap_size; ++i) {
3486 if (hn_vfmap[i] == NULL)
3489 ifp = ifnet_byindex(i);
3492 sbuf_printf(sb, "%s", ifp->if_xname);
3494 sbuf_printf(sb, " %s", ifp->if_xname);
3499 rm_runlock(&hn_vfmap_lock, &pt);
3501 error = sbuf_finish(sb);
3507 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
3509 struct rm_priotracker pt;
3514 error = sysctl_wire_old_buffer(req, 0);
3518 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
3522 rm_rlock(&hn_vfmap_lock, &pt);
3525 for (i = 0; i < hn_vfmap_size; ++i) {
3526 struct ifnet *ifp, *hn_ifp;
3528 hn_ifp = hn_vfmap[i];
3532 ifp = ifnet_byindex(i);
3535 sbuf_printf(sb, "%s:%s", ifp->if_xname,
3538 sbuf_printf(sb, " %s:%s", ifp->if_xname,
3545 rm_runlock(&hn_vfmap_lock, &pt);
3547 error = sbuf_finish(sb);
3553 hn_check_iplen(const struct mbuf *m, int hoff)
3555 const struct ip *ip;
3556 int len, iphlen, iplen;
3557 const struct tcphdr *th;
3558 int thoff; /* TCP data offset */
3560 len = hoff + sizeof(struct ip);
3562 /* The packet must be at least the size of an IP header. */
3563 if (m->m_pkthdr.len < len)
3564 return IPPROTO_DONE;
3566 /* The fixed IP header must reside completely in the first mbuf. */
3568 return IPPROTO_DONE;
3570 ip = mtodo(m, hoff);
3572 /* Bound check the packet's stated IP header length. */
3573 iphlen = ip->ip_hl << 2;
3574 if (iphlen < sizeof(struct ip)) /* minimum header length */
3575 return IPPROTO_DONE;
3577 /* The full IP header must reside completely in the one mbuf. */
3578 if (m->m_len < hoff + iphlen)
3579 return IPPROTO_DONE;
3581 iplen = ntohs(ip->ip_len);
3584 * Check that the amount of data in the buffers is
3585 * at least as much as the IP header would have us expect.
3587 if (m->m_pkthdr.len < hoff + iplen)
3588 return IPPROTO_DONE;
3591 * Ignore IP fragments.
3593 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3594 return IPPROTO_DONE;
3597 * The TCP/IP or UDP/IP header must be entirely contained within
3598 * the first fragment of a packet.
3602 if (iplen < iphlen + sizeof(struct tcphdr))
3603 return IPPROTO_DONE;
3604 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3605 return IPPROTO_DONE;
3606 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3607 thoff = th->th_off << 2;
3608 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3609 return IPPROTO_DONE;
3610 if (m->m_len < hoff + iphlen + thoff)
3611 return IPPROTO_DONE;
3614 if (iplen < iphlen + sizeof(struct udphdr))
3615 return IPPROTO_DONE;
3616 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3617 return IPPROTO_DONE;
3621 return IPPROTO_DONE;
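/*
 * Usage note: hn_rxpkt() feeds this return value into the host-checksum
 * trust logic; IPPROTO_TCP and IPPROTO_UDP allow full checksum flags to
 * be faked (subject to the trust_host* knobs), any other protocol from
 * a validated header gets IP-level flags only, and IPPROTO_DONE means
 * the headers could not be validated, so nothing is trusted.
 */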
3628 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3630 struct sysctl_oid_list *child;
3631 struct sysctl_ctx_list *ctx;
3632 device_t dev = sc->hn_dev;
3633 #if defined(INET) || defined(INET6)
3634 #if __FreeBSD_version >= 1100095
3641 * Create RXBUF for reception.
3644 * - It is shared by all channels.
3645 * - A large enough buffer is allocated; certain versions of NVS
3646 * may further limit the usable space.
3648 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3649 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3650 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3651 if (sc->hn_rxbuf == NULL) {
3652 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3656 sc->hn_rx_ring_cnt = ring_cnt;
3657 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3659 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3660 M_DEVBUF, M_WAITOK | M_ZERO);
3662 #if defined(INET) || defined(INET6)
3663 #if __FreeBSD_version >= 1100095
3664 lroent_cnt = hn_lro_entry_count;
3665 if (lroent_cnt < TCP_LRO_ENTRIES)
3666 lroent_cnt = TCP_LRO_ENTRIES;
3668 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3670 #endif /* INET || INET6 */
3672 ctx = device_get_sysctl_ctx(dev);
3673 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3675 /* Create dev.hn.UNIT.rx sysctl tree */
3676 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3677 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3679 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3680 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3682 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3683 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3684 &rxr->hn_br_dma, BUS_DMA_WAITOK);
3685 if (rxr->hn_br == NULL) {
3686 device_printf(dev, "allocate bufring failed\n");
3690 if (hn_trust_hosttcp)
3691 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3692 if (hn_trust_hostudp)
3693 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3694 if (hn_trust_hostip)
3695 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3696 rxr->hn_ifp = sc->hn_ifp;
3697 if (i < sc->hn_tx_ring_cnt)
3698 rxr->hn_txr = &sc->hn_tx_ring[i];
3699 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3700 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3702 rxr->hn_rxbuf = sc->hn_rxbuf;
3707 #if defined(INET) || defined(INET6)
3708 #if __FreeBSD_version >= 1100095
3709 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3710 hn_lro_mbufq_depth);
3712 tcp_lro_init(&rxr->hn_lro);
3713 rxr->hn_lro.ifp = sc->hn_ifp;
3715 #if __FreeBSD_version >= 1100099
3716 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3717 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3719 #endif /* INET || INET6 */
3721 if (sc->hn_rx_sysctl_tree != NULL) {
3725 * Create per RX ring sysctl tree:
3726 * dev.hn.UNIT.rx.RINGID
3728 snprintf(name, sizeof(name), "%d", i);
3729 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3730 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3731 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3733 if (rxr->hn_rx_sysctl_tree != NULL) {
3734 SYSCTL_ADD_ULONG(ctx,
3735 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3736 OID_AUTO, "packets", CTLFLAG_RW,
3737 &rxr->hn_pkts, "# of packets received");
3738 SYSCTL_ADD_ULONG(ctx,
3739 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3740 OID_AUTO, "rss_pkts", CTLFLAG_RW,
3742 "# of packets w/ RSS info received");
3744 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3745 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3746 &rxr->hn_pktbuf_len, 0,
3747 "Temporary channel packet buffer length");
3752 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3753 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3754 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3755 #if __FreeBSD_version < 1100095
3756 hn_rx_stat_int_sysctl,
3758 hn_rx_stat_u64_sysctl,
3760 "LU", "LRO queued");
3761 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3762 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3763 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3764 #if __FreeBSD_version < 1100095
3765 hn_rx_stat_int_sysctl,
3767 hn_rx_stat_u64_sysctl,
3769 "LU", "LRO flushed");
3770 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3771 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3772 __offsetof(struct hn_rx_ring, hn_lro_tried),
3773 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3774 #if __FreeBSD_version >= 1100099
3775 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3776 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3777 hn_lro_lenlim_sysctl, "IU",
3778 "Max # of data bytes to be aggregated by LRO");
3779 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3780 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3781 hn_lro_ackcnt_sysctl, "I",
3782 "Max # of ACKs to be aggregated by LRO");
3784 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3785 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3786 hn_trust_hcsum_sysctl, "I",
3787 "Trust tcp segement verification on host side, "
3788 "when csum info is missing");
3789 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3790 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3791 hn_trust_hcsum_sysctl, "I",
3792 "Trust udp datagram verification on host side, "
3793 "when csum info is missing");
3794 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3795 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3796 hn_trust_hcsum_sysctl, "I",
3797 "Trust ip packet verification on host side, "
3798 "when csum info is missing");
3799 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3800 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3801 __offsetof(struct hn_rx_ring, hn_csum_ip),
3802 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3803 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3804 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3805 __offsetof(struct hn_rx_ring, hn_csum_tcp),
3806 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3807 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3808 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3809 __offsetof(struct hn_rx_ring, hn_csum_udp),
3810 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3811 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3812 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3813 __offsetof(struct hn_rx_ring, hn_csum_trusted),
3814 hn_rx_stat_ulong_sysctl, "LU",
3815 "# of packets that we trust host's csum verification");
3816 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3817 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3818 __offsetof(struct hn_rx_ring, hn_small_pkts),
3819 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3820 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3821 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3822 __offsetof(struct hn_rx_ring, hn_ack_failed),
3823 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3824 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3825 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3826 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3827 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3833 hn_destroy_rx_data(struct hn_softc *sc)
3837 if (sc->hn_rxbuf != NULL) {
3838 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3839 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3841 device_printf(sc->hn_dev, "RXBUF is referenced\n");
3842 sc->hn_rxbuf = NULL;
3845 if (sc->hn_rx_ring_cnt == 0)
3848 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3849 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3851 if (rxr->hn_br == NULL)
3853 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3854 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3856 device_printf(sc->hn_dev,
3857 "%dth channel bufring is referenced", i);
3861 #if defined(INET) || defined(INET6)
3862 tcp_lro_free(&rxr->hn_lro);
3864 free(rxr->hn_pktbuf, M_DEVBUF);
3866 free(sc->hn_rx_ring, M_DEVBUF);
3867 sc->hn_rx_ring = NULL;
3869 sc->hn_rx_ring_cnt = 0;
3870 sc->hn_rx_ring_inuse = 0;
3874 hn_tx_ring_create(struct hn_softc *sc, int id)
3876 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3877 device_t dev = sc->hn_dev;
3878 bus_dma_tag_t parent_dtag;
3882 txr->hn_tx_idx = id;
3884 #ifndef HN_USE_TXDESC_BUFRING
3885 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3887 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3889 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3890 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3891 M_DEVBUF, M_WAITOK | M_ZERO);
3892 #ifndef HN_USE_TXDESC_BUFRING
3893 SLIST_INIT(&txr->hn_txlist);
3895 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3896 M_WAITOK, &txr->hn_tx_lock);
3899 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3900 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3901 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3903 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3906 #ifdef HN_IFSTART_SUPPORT
3907 if (hn_use_if_start) {
3908 txr->hn_txeof = hn_start_txeof;
3909 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3910 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3916 txr->hn_txeof = hn_xmit_txeof;
3917 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3918 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3920 br_depth = hn_get_txswq_depth(txr);
3921 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3922 M_WAITOK, &txr->hn_tx_lock);
3925 txr->hn_direct_tx_size = hn_direct_tx_size;
3928 * Always schedule transmission instead of trying to do direct
3929 * transmission. This one gives the best performance so far.
3931 txr->hn_sched_tx = 1;
3933 parent_dtag = bus_get_dma_tag(dev);
3935 /* DMA tag for RNDIS packet messages. */
3936 error = bus_dma_tag_create(parent_dtag, /* parent */
3937 HN_RNDIS_PKT_ALIGN, /* alignment */
3938 HN_RNDIS_PKT_BOUNDARY, /* boundary */
3939 BUS_SPACE_MAXADDR, /* lowaddr */
3940 BUS_SPACE_MAXADDR, /* highaddr */
3941 NULL, NULL, /* filter, filterarg */
3942 HN_RNDIS_PKT_LEN, /* maxsize */
3944 HN_RNDIS_PKT_LEN, /* maxsegsize */
3946 NULL, /* lockfunc */
3947 NULL, /* lockfuncarg */
3948 &txr->hn_tx_rndis_dtag);
3950 device_printf(dev, "failed to create rndis dmatag\n");
3954 /* DMA tag for data. */
3955 error = bus_dma_tag_create(parent_dtag, /* parent */
3957 HN_TX_DATA_BOUNDARY, /* boundary */
3958 BUS_SPACE_MAXADDR, /* lowaddr */
3959 BUS_SPACE_MAXADDR, /* highaddr */
3960 NULL, NULL, /* filter, filterarg */
3961 HN_TX_DATA_MAXSIZE, /* maxsize */
3962 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
3963 HN_TX_DATA_SEGSIZE, /* maxsegsize */
3965 NULL, /* lockfunc */
3966 NULL, /* lockfuncarg */
3967 &txr->hn_tx_data_dtag);
3969 device_printf(dev, "failed to create data dmatag\n");
3973 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3974 struct hn_txdesc *txd = &txr->hn_txdesc[i];
3977 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3978 STAILQ_INIT(&txd->agg_list);
3981 * Allocate and load RNDIS packet message.
3983 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3984 (void **)&txd->rndis_pkt,
3985 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3986 &txd->rndis_pkt_dmap);
3989 "failed to allocate rndis_packet_msg, %d\n", i);
3993 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3994 txd->rndis_pkt_dmap,
3995 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3996 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
4000 "failed to load rndis_packet_msg, %d\n", i);
4001 bus_dmamem_free(txr->hn_tx_rndis_dtag,
4002 txd->rndis_pkt, txd->rndis_pkt_dmap);
4006 /* DMA map for TX data. */
4007 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
4011 "failed to allocate tx data dmamap\n");
4012 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
4013 txd->rndis_pkt_dmap);
4014 bus_dmamem_free(txr->hn_tx_rndis_dtag,
4015 txd->rndis_pkt, txd->rndis_pkt_dmap);
4019 /* All set, put it to list */
4020 txd->flags |= HN_TXD_FLAG_ONLIST;
4021 #ifndef HN_USE_TXDESC_BUFRING
4022 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
4024 buf_ring_enqueue(txr->hn_txdesc_br, txd);
4027 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
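/*
 * Summary of the per-txd resources initialized above: DMA-coherent
 * memory for the RNDIS packet message (rndis_pkt, mapped through
 * rndis_pkt_dmap with its physical address in rndis_pkt_paddr) plus a
 * DMA map for mbuf data; each fully constructed txd is parked on the
 * free list (SLIST or buf_ring, depending on HN_USE_TXDESC_BUFRING).
 */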
4029 if (sc->hn_tx_sysctl_tree != NULL) {
4030 struct sysctl_oid_list *child;
4031 struct sysctl_ctx_list *ctx;
4035 * Create per TX ring sysctl tree:
4036 * dev.hn.UNIT.tx.RINGID
4038 ctx = device_get_sysctl_ctx(dev);
4039 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
4041 snprintf(name, sizeof(name), "%d", id);
4042 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
4043 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4045 if (txr->hn_tx_sysctl_tree != NULL) {
4046 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
4049 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
4050 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
4051 "# of available TX descs");
4053 #ifdef HN_IFSTART_SUPPORT
4054 if (!hn_use_if_start)
4057 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
4058 CTLFLAG_RD, &txr->hn_oactive, 0,
4061 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
4062 CTLFLAG_RW, &txr->hn_pkts,
4063 "# of packets transmitted");
4064 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
4065 CTLFLAG_RW, &txr->hn_sends, "# of sends");
4073 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
4075 struct hn_tx_ring *txr = txd->txr;
4077 KASSERT(txd->m == NULL, ("still has mbuf installed"));
4078 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
4080 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
4081 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
4082 txd->rndis_pkt_dmap);
4083 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
4087 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
4090 KASSERT(txd->refs == 0 || txd->refs == 1,
4091 ("invalid txd refs %d", txd->refs));
4093 /* Aggregated txds will be freed by their aggregating txd. */
4094 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
4097 freed = hn_txdesc_put(txr, txd);
4098 KASSERT(freed, ("can't free txdesc"));
4103 hn_tx_ring_destroy(struct hn_tx_ring *txr)
4107 if (txr->hn_txdesc == NULL)
4112 * Because the freeing of aggregated txds will be deferred
4113 * to the aggregating txd, two passes are used here:
4114 * - The first pass GCes any pending txds. This GC is necessary,
4115 * since if the channels are revoked, the hypervisor will not
4116 * deliver send-done for all pending txds.
4117 * - The second pass frees the busdma resources, i.e. after all txds
4118 * were freed.
4120 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
4121 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
4122 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
4123 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
4125 if (txr->hn_tx_data_dtag != NULL)
4126 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
4127 if (txr->hn_tx_rndis_dtag != NULL)
4128 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
4130 #ifdef HN_USE_TXDESC_BUFRING
4131 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
4134 free(txr->hn_txdesc, M_DEVBUF);
4135 txr->hn_txdesc = NULL;
4137 if (txr->hn_mbuf_br != NULL)
4138 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
4140 #ifndef HN_USE_TXDESC_BUFRING
4141 mtx_destroy(&txr->hn_txlist_spin);
4143 mtx_destroy(&txr->hn_tx_lock);
4147 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
4149 struct sysctl_oid_list *child;
4150 struct sysctl_ctx_list *ctx;
4154 * Create TXBUF for chimney sending.
4156 * NOTE: It is shared by all channels.
4158 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
4159 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
4160 BUS_DMA_WAITOK | BUS_DMA_ZERO);
4161 if (sc->hn_chim == NULL) {
4162 device_printf(sc->hn_dev, "allocate txbuf failed\n");
4166 sc->hn_tx_ring_cnt = ring_cnt;
4167 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4169 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
4170 M_DEVBUF, M_WAITOK | M_ZERO);
4172 ctx = device_get_sysctl_ctx(sc->hn_dev);
4173 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
4175 /* Create dev.hn.UNIT.tx sysctl tree */
4176 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
4177 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4179 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4182 error = hn_tx_ring_create(sc, i);
4187 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
4188 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4189 __offsetof(struct hn_tx_ring, hn_no_txdescs),
4190 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
4191 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
4192 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4193 __offsetof(struct hn_tx_ring, hn_send_failed),
4194 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
4195 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
4196 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4197 __offsetof(struct hn_tx_ring, hn_txdma_failed),
4198 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
4199 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
4200 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4201 __offsetof(struct hn_tx_ring, hn_flush_failed),
4202 hn_tx_stat_ulong_sysctl, "LU",
4203 "# of packet transmission aggregation flush failure");
4204 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
4205 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4206 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
4207 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
4208 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
4209 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4210 __offsetof(struct hn_tx_ring, hn_tx_chimney),
4211 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
4212 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
4213 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4214 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
4215 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
4216 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
4217 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
4218 "# of total TX descs");
4219 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
4220 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
4221 "Chimney send packet size upper boundary");
4222 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
4223 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4224 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
4225 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
4226 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4227 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
4228 hn_tx_conf_int_sysctl, "I",
4229 "Size of the packet for direct transmission");
4230 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
4231 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4232 __offsetof(struct hn_tx_ring, hn_sched_tx),
4233 hn_tx_conf_int_sysctl, "I",
4234 "Always schedule transmission "
4235 "instead of doing direct transmission");
4236 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
4237 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
4238 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
4239 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
4240 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
4241 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
4242 "Applied packet transmission aggregation size");
4243 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
4244 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
4245 hn_txagg_pktmax_sysctl, "I",
4246 "Applied packet transmission aggregation packets");
4247 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
4248 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
4249 hn_txagg_align_sysctl, "I",
4250 "Applied packet transmission aggregation alignment");
4256 hn_set_chim_size(struct hn_softc *sc, int chim_size)
4260 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4261 sc->hn_tx_ring[i].hn_chim_size = chim_size;
4265 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
4267 struct ifnet *ifp = sc->hn_ifp;
4270 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
4273 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
4274 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
4275 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
4277 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
4278 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
4279 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
4281 if (tso_maxlen < tso_minlen)
4282 tso_maxlen = tso_minlen;
4283 else if (tso_maxlen > IP_MAXPACKET)
4284 tso_maxlen = IP_MAXPACKET;
4285 if (tso_maxlen > sc->hn_ndis_tso_szmax)
4286 tso_maxlen = sc->hn_ndis_tso_szmax;
4287 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
4289 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
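/*
 * Worked example (illustrative numbers): with mtu == 1500 and
 * hn_ndis_tso_sgmin == 2, tso_minlen is 3000; a requested tso_maxlen of
 * 65535 stays within IP_MAXPACKET, is then capped at hn_ndis_tso_szmax,
 * and the Ethernet + VLAN encapsulation length is subtracted before the
 * result is published as if_hw_tsomax.
 */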
4293 hn_fixup_tx_data(struct hn_softc *sc)
4295 uint64_t csum_assist;
4298 hn_set_chim_size(sc, sc->hn_chim_szmax);
4299 if (hn_tx_chimney_size > 0 &&
4300 hn_tx_chimney_size < sc->hn_chim_szmax)
4301 hn_set_chim_size(sc, hn_tx_chimney_size);
4304 if (sc->hn_caps & HN_CAP_IPCS)
4305 csum_assist |= CSUM_IP;
4306 if (sc->hn_caps & HN_CAP_TCP4CS)
4307 csum_assist |= CSUM_IP_TCP;
4308 if (sc->hn_caps & HN_CAP_UDP4CS)
4309 csum_assist |= CSUM_IP_UDP;
4310 if (sc->hn_caps & HN_CAP_TCP6CS)
4311 csum_assist |= CSUM_IP6_TCP;
4312 if (sc->hn_caps & HN_CAP_UDP6CS)
4313 csum_assist |= CSUM_IP6_UDP;
4314 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4315 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
4317 if (sc->hn_caps & HN_CAP_HASHVAL) {
4319 * Support HASHVAL pktinfo on TX path.
4322 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
4323 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4324 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
4329 hn_destroy_tx_data(struct hn_softc *sc)
4333 if (sc->hn_chim != NULL) {
4334 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
4335 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
4337 device_printf(sc->hn_dev,
4338 "chimney sending buffer is referenced");
4343 if (sc->hn_tx_ring_cnt == 0)
4346 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4347 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
4349 free(sc->hn_tx_ring, M_DEVBUF);
4350 sc->hn_tx_ring = NULL;
4352 sc->hn_tx_ring_cnt = 0;
4353 sc->hn_tx_ring_inuse = 0;
4356 #ifdef HN_IFSTART_SUPPORT
4359 hn_start_taskfunc(void *xtxr, int pending __unused)
4361 struct hn_tx_ring *txr = xtxr;
4363 mtx_lock(&txr->hn_tx_lock);
4364 hn_start_locked(txr, 0);
4365 mtx_unlock(&txr->hn_tx_lock);
4369 hn_start_locked(struct hn_tx_ring *txr, int len)
4371 struct hn_softc *sc = txr->hn_sc;
4372 struct ifnet *ifp = sc->hn_ifp;
4375 KASSERT(hn_use_if_start,
4376 ("hn_start_locked is called, when if_start is disabled"));
4377 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4378 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4379 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4381 if (__predict_false(txr->hn_suspended))
4384 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
4388 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
4389 struct hn_txdesc *txd;
4390 struct mbuf *m_head;
4393 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
4397 if (len > 0 && m_head->m_pkthdr.len > len) {
4399 * This sending could be time consuming; let callers
4400 * dispatch this packet sending (and sending of any
4401 * follow-up packets) to the tx taskqueue.
4403 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4408 #if defined(INET6) || defined(INET)
4409 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
4410 m_head = hn_tso_fixup(m_head);
4411 if (__predict_false(m_head == NULL)) {
4412 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4418 txd = hn_txdesc_get(txr);
4420 txr->hn_no_txdescs++;
4421 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4422 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4426 error = hn_encap(ifp, txr, txd, &m_head);
4428 /* Both txd and m_head are freed */
4429 KASSERT(txr->hn_agg_txd == NULL,
4430 ("encap failed w/ pending aggregating txdesc"));
4434 if (txr->hn_agg_pktleft == 0) {
4435 if (txr->hn_agg_txd != NULL) {
4436 KASSERT(m_head == NULL,
4437 ("pending mbuf for aggregating txdesc"));
4438 error = hn_flush_txagg(ifp, txr);
4439 if (__predict_false(error)) {
4440 atomic_set_int(&ifp->if_drv_flags,
4445 KASSERT(m_head != NULL, ("mbuf was freed"));
4446 error = hn_txpkt(ifp, txr, txd);
4447 if (__predict_false(error)) {
4448 /* txd is freed, but m_head is not */
4449 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4450 atomic_set_int(&ifp->if_drv_flags,
4458 KASSERT(txr->hn_agg_txd != NULL,
4459 ("no aggregating txdesc"));
4460 KASSERT(m_head == NULL,
4461 ("pending mbuf for aggregating txdesc"));
4466 /* Flush pending aggregated transmission. */
4467 if (txr->hn_agg_txd != NULL)
4468 hn_flush_txagg(ifp, txr);
4473 hn_start(struct ifnet *ifp)
4475 struct hn_softc *sc = ifp->if_softc;
4476 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
4478 if (txr->hn_sched_tx)
4481 if (mtx_trylock(&txr->hn_tx_lock)) {
4484 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4485 mtx_unlock(&txr->hn_tx_lock);
4490 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4494 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
4496 struct hn_tx_ring *txr = xtxr;
4498 mtx_lock(&txr->hn_tx_lock);
4499 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
4500 hn_start_locked(txr, 0);
4501 mtx_unlock(&txr->hn_tx_lock);
4505 hn_start_txeof(struct hn_tx_ring *txr)
4507 struct hn_softc *sc = txr->hn_sc;
4508 struct ifnet *ifp = sc->hn_ifp;
4510 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4512 if (txr->hn_sched_tx)
4515 if (mtx_trylock(&txr->hn_tx_lock)) {
4518 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4519 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4520 mtx_unlock(&txr->hn_tx_lock);
4522 taskqueue_enqueue(txr->hn_tx_taskq,
4528 * Release the OACTIVE earlier, in the hope that
4529 * others could catch up. The task will clear the
4530 * flag again with the hn_tx_lock held to avoid possible
4531 * races.
4533 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4534 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4538 #endif /* HN_IFSTART_SUPPORT */
4541 hn_xmit(struct hn_tx_ring *txr, int len)
4543 struct hn_softc *sc = txr->hn_sc;
4544 struct ifnet *ifp = sc->hn_ifp;
4545 struct mbuf *m_head;
4548 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4549 #ifdef HN_IFSTART_SUPPORT
4550 KASSERT(hn_use_if_start == 0,
4551 ("hn_xmit is called, when if_start is enabled"));
4553 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4555 if (__predict_false(txr->hn_suspended))
4558 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4561 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4562 struct hn_txdesc *txd;
4565 if (len > 0 && m_head->m_pkthdr.len > len) {
4567 * This sending could be time consuming; let callers
4568 * dispatch this packet sending (and sending of any
4569 * follow-up packets) to the tx taskqueue.
4571 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4576 txd = hn_txdesc_get(txr);
4578 txr->hn_no_txdescs++;
4579 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4580 txr->hn_oactive = 1;
4584 error = hn_encap(ifp, txr, txd, &m_head);
4586 /* Both txd and m_head are freed; discard */
4587 KASSERT(txr->hn_agg_txd == NULL,
4588 ("encap failed w/ pending aggregating txdesc"));
4589 drbr_advance(ifp, txr->hn_mbuf_br);
4593 if (txr->hn_agg_pktleft == 0) {
4594 if (txr->hn_agg_txd != NULL) {
4595 KASSERT(m_head == NULL,
4596 ("pending mbuf for aggregating txdesc"));
4597 error = hn_flush_txagg(ifp, txr);
4598 if (__predict_false(error)) {
4599 txr->hn_oactive = 1;
4603 KASSERT(m_head != NULL, ("mbuf was freed"));
4604 error = hn_txpkt(ifp, txr, txd);
4605 if (__predict_false(error)) {
4606 /* txd is freed, but m_head is not */
4607 drbr_putback(ifp, txr->hn_mbuf_br,
4609 txr->hn_oactive = 1;
4616 KASSERT(txr->hn_agg_txd != NULL,
4617 ("no aggregating txdesc"));
4618 KASSERT(m_head == NULL,
4619 ("pending mbuf for aggregating txdesc"));
4624 drbr_advance(ifp, txr->hn_mbuf_br);
4627 /* Flush pending aggregated transmission. */
4628 if (txr->hn_agg_txd != NULL)
4629 hn_flush_txagg(ifp, txr);
4634 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4636 struct hn_softc *sc = ifp->if_softc;
4637 struct hn_tx_ring *txr;
4640 #if defined(INET6) || defined(INET)
4642 * Perform TSO packet header fixup now, since the TSO
4643 * packet header should be cache-hot.
4645 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4646 m = hn_tso_fixup(m);
4647 if (__predict_false(m == NULL)) {
4648 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4655 * Select the TX ring based on flowid
4657 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
4658 #if defined(INET6) || defined(INET)
4661 if (m->m_pkthdr.len < 128 &&
4662 (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
4663 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
4664 m = hn_check_tcpsyn(m, &tcpsyn);
4665 if (__predict_false(m == NULL)) {
4667 IFCOUNTER_OERRORS, 1);
4672 const int tcpsyn = 0;
4677 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4679 txr = &sc->hn_tx_ring[idx];
4681 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4683 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4687 if (txr->hn_oactive)
4690 if (txr->hn_sched_tx)
4693 if (mtx_trylock(&txr->hn_tx_lock)) {
4696 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4697 mtx_unlock(&txr->hn_tx_lock);
4702 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
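/*
 * Dispatch pattern used above (and by hn_start()): when hn_sched_tx is
 * clear, attempt direct transmission under a trylock'ed hn_tx_lock and
 * fall back to the per-ring taskqueue only if the lock is contended or
 * a large packet defers the work; otherwise always enqueue the TX task.
 */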
4707 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4711 mtx_lock(&txr->hn_tx_lock);
4712 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4714 mtx_unlock(&txr->hn_tx_lock);
4718 hn_xmit_qflush(struct ifnet *ifp)
4720 struct hn_softc *sc = ifp->if_softc;
4723 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4724 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4729 hn_xmit_txeof(struct hn_tx_ring *txr)
4732 if (txr->hn_sched_tx)
4735 if (mtx_trylock(&txr->hn_tx_lock)) {
4738 txr->hn_oactive = 0;
4739 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4740 mtx_unlock(&txr->hn_tx_lock);
4742 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4748 * Release oactive early, in the hope that
4749 * others can catch up. The task will clear
4750 * oactive again with the hn_tx_lock held, to avoid possible races.
4753 txr->hn_oactive = 0;
4754 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4759 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4761 struct hn_tx_ring *txr = xtxr;
4763 mtx_lock(&txr->hn_tx_lock);
4765 mtx_unlock(&txr->hn_tx_lock);
4769 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4771 struct hn_tx_ring *txr = xtxr;
4773 mtx_lock(&txr->hn_tx_lock);
4774 txr->hn_oactive = 0;
4776 mtx_unlock(&txr->hn_tx_lock);
4780 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4782 struct vmbus_chan_br cbr;
4783 struct hn_rx_ring *rxr;
4784 struct hn_tx_ring *txr = NULL;
4787 idx = vmbus_chan_subidx(chan);
4790 * Link this channel to RX/TX ring.
4792 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4793 ("invalid channel index %d, should > 0 && < %d",
4794 idx, sc->hn_rx_ring_inuse));
4795 rxr = &sc->hn_rx_ring[idx];
4796 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4797 ("RX ring %d already attached", idx));
4798 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4799 rxr->hn_chan = chan;
4802 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4803 idx, vmbus_chan_id(chan));
4806 if (idx < sc->hn_tx_ring_inuse) {
4807 txr = &sc->hn_tx_ring[idx];
4808 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4809 ("TX ring %d already attached", idx));
4810 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4812 txr->hn_chan = chan;
4814 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4815 idx, vmbus_chan_id(chan));
4819 /* Bind this channel to a proper CPU. */
4820 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4825 cbr.cbr = rxr->hn_br;
4826 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4827 cbr.cbr_txsz = HN_TXBR_SIZE;
4828 cbr.cbr_rxsz = HN_RXBR_SIZE;
4829 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4831 if (error == EISCONN) {
4832 if_printf(sc->hn_ifp, "bufring is connected after "
4833 "chan%u open failure\n", vmbus_chan_id(chan));
4834 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4836 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4837 vmbus_chan_id(chan), error);
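/*
 * Editorial note, not driver code: the vmbus_chan_br descriptor
 * filled above hands the host one contiguous bufring allocation,
 * with the TX half (HN_TXBR_SIZE) first and the RX half
 * (HN_RXBR_SIZE) following, as the cbr_txsz/cbr_rxsz split implies.
 * A minimal sketch of the same split on a caller-provided buffer:
 */
static void
ex_fill_chan_br(struct vmbus_chan_br *cbr, void *buf, bus_addr_t paddr,
    size_t txsz, size_t rxsz)
{
	cbr->cbr = buf;			/* TX half first */
	cbr->cbr_paddr = paddr;		/* physical address of buf */
	cbr->cbr_txsz = txsz;		/* RX half starts at buf + txsz */
	cbr->cbr_rxsz = rxsz;
}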
4844 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4846 struct hn_rx_ring *rxr;
4849 idx = vmbus_chan_subidx(chan);
4852 * Unlink this channel from the RX/TX ring.
4854 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4855 ("invalid channel index %d, should > 0 && < %d",
4856 idx, sc->hn_rx_ring_inuse));
4857 rxr = &sc->hn_rx_ring[idx];
4858 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4859 ("RX ring %d is not attached", idx));
4860 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4862 if (idx < sc->hn_tx_ring_inuse) {
4863 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4865 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4866 ("TX ring %d is not attached attached", idx));
4867 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4871 * Close this channel.
4874 * Channel closing does _not_ destroy the target channel.
4876 error = vmbus_chan_close_direct(chan);
4877 if (error == EISCONN) {
4878 if_printf(sc->hn_ifp, "chan%u bufring is connected "
4879 "after being closed\n", vmbus_chan_id(chan));
4880 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4882 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4883 vmbus_chan_id(chan), error);
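/*
 * Editorial note, not driver code: EISCONN from vmbus_chan_open_br()
 * and vmbus_chan_close_direct() above means the hypervisor still
 * holds a reference to the bufring pages.  HN_RX_FLAG_BR_REF records
 * that, so later teardown keeps the bufring memory alive instead of
 * freeing pages the host may still write to.
 */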
4888 hn_attach_subchans(struct hn_softc *sc)
4890 struct vmbus_channel **subchans;
4891 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4894 KASSERT(subchan_cnt > 0, ("no sub-channels"));
4896 /* Attach the sub-channels. */
4897 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4898 for (i = 0; i < subchan_cnt; ++i) {
4901 error1 = hn_chan_attach(sc, subchans[i]);
4904 /* Move on; all channels will be detached later. */
4907 vmbus_subchan_rel(subchans, subchan_cnt);
4910 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4913 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4921 hn_detach_allchans(struct hn_softc *sc)
4923 struct vmbus_channel **subchans;
4924 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4927 if (subchan_cnt == 0)
4930 /* Detach the sub-channels. */
4931 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4932 for (i = 0; i < subchan_cnt; ++i)
4933 hn_chan_detach(sc, subchans[i]);
4934 vmbus_subchan_rel(subchans, subchan_cnt);
4938 * Detach the primary channel, _after_ all sub-channels
4941 hn_chan_detach(sc, sc->hn_prichan);
4943 /* Wait for sub-channels to be destroyed, if any. */
4944 vmbus_subchan_drain(sc->hn_prichan);
4947 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4948 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4949 HN_RX_FLAG_ATTACHED) == 0,
4950 ("%dth RX ring is still attached", i));
4952 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4953 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4954 HN_TX_FLAG_ATTACHED) == 0,
4955 ("%dth TX ring is still attached", i));
4961 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4963 struct vmbus_channel **subchans;
4964 int nchan, rxr_cnt, error;
4966 nchan = *nsubch + 1;
4969 * Multiple RX/TX rings are not requested.
4976 * Query RSS capabilities, e.g. the # of RX rings and the # of indirect table entries.
4979 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4981 /* No RSS; this is benign. */
4986 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4990 if (nchan > rxr_cnt)
4993 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4999 * Allocate sub-channels from NVS.
5001 *nsubch = nchan - 1;
5002 error = hn_nvs_alloc_subchans(sc, nsubch);
5003 if (error || *nsubch == 0) {
5004 /* Failed to allocate sub-channels. */
5010 * Wait for all sub-channels to become ready before moving on.
5012 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
5013 vmbus_subchan_rel(subchans, *nsubch);
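/*
 * Editorial sketch, not driver code: the channel-count negotiation
 * shape used by hn_synth_alloc_subchans() above.  The request is
 * clamped to the RSS offer, and the granted sub-channel count comes
 * back through the in/out parameter, so callers must re-read it.
 * ex_host_grant is a hypothetical stand-in for the NVS request.
 */
static int ex_host_grant(int wanted);

static void
ex_negotiate_chans(int *nsubch, int rxr_offered)
{
	int nchan = *nsubch + 1;	/* sub-channels + primary */

	if (nchan > rxr_offered)
		nchan = rxr_offered;	/* never exceed the RSS offer */
	*nsubch = ex_host_grant(nchan - 1);	/* host may grant fewer */
}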
5018 hn_synth_attachable(const struct hn_softc *sc)
5022 if (sc->hn_flags & HN_FLAG_ERRORS)
5025 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5026 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5028 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
5035 * Make sure that the RX filter is zero after the successful
5036 * RNDIS initialization.
5039 * Under certain conditions on certain versions of Hyper-V,
5040 * the RNDIS rxfilter is _not_ zero on the hypervisor side
5041 * after the successful RNDIS initialization, which breaks
5042 * the assumption of any following code (well, it breaks the
5043 * RNDIS API contract actually). Clear the RNDIS rxfilter
5044 * explicitly, drain packets sneaking through, and drain the
5045 * interrupt taskqueues scheduled due to the stealth packets.
5048 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
5052 hn_drain_rxtx(sc, nchan);
5056 hn_synth_attach(struct hn_softc *sc, int mtu)
5058 #define ATTACHED_NVS 0x0002
5059 #define ATTACHED_RNDIS 0x0004
5061 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
5062 int error, nsubch, nchan = 1, i, rndis_inited;
5063 uint32_t old_caps, attached = 0;
5065 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
5066 ("synthetic parts were attached"));
5068 if (!hn_synth_attachable(sc))
5071 /* Save capabilities for later verification. */
5072 old_caps = sc->hn_caps;
5075 /* Clear RSS stuffs. */
5076 sc->hn_rss_ind_size = 0;
5077 sc->hn_rss_hash = 0;
5080 * Attach the primary channel _before_ attaching NVS and RNDIS.
5082 error = hn_chan_attach(sc, sc->hn_prichan);
5089 error = hn_nvs_attach(sc, mtu);
5092 attached |= ATTACHED_NVS;
5095 * Attach RNDIS _after_ NVS is attached.
5097 error = hn_rndis_attach(sc, mtu, &rndis_inited);
5099 attached |= ATTACHED_RNDIS;
5104 * Make sure capabilities are not changed.
5106 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
5107 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
5108 old_caps, sc->hn_caps);
5114 * Allocate sub-channels for multi-TX/RX rings.
5117 * The # of RX rings that can be used is equivalent to the # of
5118 * channels to be requested.
5120 nsubch = sc->hn_rx_ring_cnt - 1;
5121 error = hn_synth_alloc_subchans(sc, &nsubch);
5124 /* NOTE: _Full_ synthetic parts detach is required now. */
5125 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
5128 * Set the # of TX/RX rings that could be used according to
5129 * the # of channels that NVS offered.
5132 hn_set_ring_inuse(sc, nchan);
5134 /* Only the primary channel can be used; done */
5139 * Attach the sub-channels.
5141 * NOTE: hn_set_ring_inuse() _must_ have been called.
5143 error = hn_attach_subchans(sc);
5148 * Configure RSS key and indirect table _after_ all sub-channels
5151 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
5153 * RSS key is not set yet; set it to the default RSS key.
5156 if_printf(sc->hn_ifp, "setup default RSS key\n");
5157 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
5158 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
5161 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
5163 * RSS indirect table is not set yet; set it up in round-robin fashion.
5167 if_printf(sc->hn_ifp, "setup default RSS indirect "
5170 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
5171 rss->rss_ind[i] = i % nchan;
5172 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
5175 * The # of usable channels may have changed, so we have to
5176 * make sure that all entries in the RSS indirect table are valid.
5179 * NOTE: hn_set_ring_inuse() _must_ have been called.
5181 hn_rss_ind_fixup(sc);
5184 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
5189 * Fix up the transmission aggregation setup.
5192 hn_rndis_init_fixat(sc, nchan);
5196 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
5197 hn_rndis_init_fixat(sc, nchan);
5198 hn_synth_detach(sc);
5200 if (attached & ATTACHED_RNDIS) {
5201 hn_rndis_init_fixat(sc, nchan);
5202 hn_rndis_detach(sc);
5204 if (attached & ATTACHED_NVS)
5206 hn_chan_detach(sc, sc->hn_prichan);
5207 /* Restore old capabilities. */
5208 sc->hn_caps = old_caps;
5212 #undef ATTACHED_RNDIS
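/*
 * Editorial sketch, not driver code: the round-robin indirect table
 * fill used in hn_synth_attach() above.  Writing i % nchan into each
 * entry spreads the hash buckets evenly across the usable channels;
 * hn_rss_ind_fixup() re-applies the same invariant when the channel
 * count shrinks.
 */
static void
ex_rss_ind_roundrobin(uint32_t *ind, int indcnt, int nchan)
{
	int i;

	for (i = 0; i < indcnt; ++i)
		ind[i] = i % nchan;
}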
5218 * The interface must have been suspended through hn_suspend() before
5219 * this function gets called.
5222 hn_synth_detach(struct hn_softc *sc)
5225 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
5226 ("synthetic parts were not attached"));
5228 /* Detach the RNDIS first. */
5229 hn_rndis_detach(sc);
5234 /* Detach all of the channels. */
5235 hn_detach_allchans(sc);
5237 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
5241 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
5243 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
5244 ("invalid ring count %d", ring_cnt));
5246 if (sc->hn_tx_ring_cnt > ring_cnt)
5247 sc->hn_tx_ring_inuse = ring_cnt;
5249 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5250 sc->hn_rx_ring_inuse = ring_cnt;
5253 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
5254 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
5259 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
5264 * The TX bufring will not be drained by the hypervisor
5265 * if the primary channel is revoked.
5267 while (!vmbus_chan_rx_empty(chan) ||
5268 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
5269 !vmbus_chan_tx_empty(chan)))
5271 vmbus_chan_intr_drain(chan);
5275 hn_disable_rx(struct hn_softc *sc)
5279 * Disable RX by clearing RX filter forcefully.
5281 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
5282 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
5285 * Give RNDIS enough time to flush all pending data packets.
5287 pause("waitrx", (200 * hz) / 1000);
5292 * RX/TX _must_ have been suspended/disabled, before this function
5296 hn_drain_rxtx(struct hn_softc *sc, int nchan)
5298 struct vmbus_channel **subch = NULL;
5302 * Drain RX/TX bufrings and interrupts.
5306 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
5308 if (subch != NULL) {
5311 for (i = 0; i < nsubch; ++i)
5312 hn_chan_drain(sc, subch[i]);
5314 hn_chan_drain(sc, sc->hn_prichan);
5317 vmbus_subchan_rel(subch, nsubch);
5321 hn_suspend_data(struct hn_softc *sc)
5323 struct hn_tx_ring *txr;
5331 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5332 txr = &sc->hn_tx_ring[i];
5334 mtx_lock(&txr->hn_tx_lock);
5335 txr->hn_suspended = 1;
5336 mtx_unlock(&txr->hn_tx_lock);
5337 /* No one is able to send more packets now. */
5340 * Wait for all pending sends to finish.
5343 * We will _not_ receive all pending send-dones if the
5344 * primary channel is revoked.
5346 while (hn_tx_ring_pending(txr) &&
5347 !vmbus_chan_is_revoked(sc->hn_prichan))
5348 pause("hnwtx", 1 /* 1 tick */);
5359 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
5362 * Drain any pending TX tasks.
5365 * The above hn_drain_rxtx() can dispatch TX tasks, so the
5366 * TX tasks must be drained _after_ it.
5368 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5369 txr = &sc->hn_tx_ring[i];
5371 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
5372 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
5377 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
5380 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
5384 hn_suspend_mgmt(struct hn_softc *sc)
5391 * Make sure that hn_mgmt_taskq0 can no longer be accessed
5392 * through hn_mgmt_taskq.
5394 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
5395 vmbus_chan_run_task(sc->hn_prichan, &task);
5398 * Make sure that all pending management tasks are completed.
5400 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
5401 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
5402 taskqueue_drain_all(sc->hn_mgmt_taskq0);
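/*
 * Editorial note, not driver code: vmbus_chan_run_task() above runs
 * the task to completion on the primary channel's callback context,
 * so by the time it returns no channel callback can still observe
 * the old hn_mgmt_taskq pointer; draining hn_mgmt_taskq0 afterwards
 * is therefore free of new arrivals.
 */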
5406 hn_suspend(struct hn_softc *sc)
5409 /* Disable polling. */
5412 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5413 (sc->hn_flags & HN_FLAG_RXVF))
5414 hn_suspend_data(sc);
5415 hn_suspend_mgmt(sc);
5419 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
5423 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
5424 ("invalid TX ring count %d", tx_ring_cnt));
5426 for (i = 0; i < tx_ring_cnt; ++i) {
5427 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5429 mtx_lock(&txr->hn_tx_lock);
5430 txr->hn_suspended = 0;
5431 mtx_unlock(&txr->hn_tx_lock);
5436 hn_resume_data(struct hn_softc *sc)
5445 hn_rxfilter_config(sc);
5448 * Make sure to clear suspend status on "all" TX rings,
5449 * since hn_tx_ring_inuse can be changed after
5450 * hn_suspend_data().
5452 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
5454 #ifdef HN_IFSTART_SUPPORT
5455 if (!hn_use_if_start)
5459 * Flush unused drbrs, since hn_tx_ring_inuse may have been changed.
5462 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
5463 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5469 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5470 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5473 * Use the txeof task, so that any pending oactive can be cleared properly.
5476 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5481 hn_resume_mgmt(struct hn_softc *sc)
5484 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
5487 * Kick off network change detection, if it was pending.
5488 * If no network change was pending, start link status
5489 * checks, which are more lightweight than network change detection.
5492 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
5493 hn_change_network(sc);
5494 else
5495 hn_update_link_status(sc);
5499 hn_resume(struct hn_softc *sc)
5502 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5503 (sc->hn_flags & HN_FLAG_RXVF))
5507 * When the VF is activated, the synthetic interface is changed
5508 * to DOWN in hn_rxvf_change(). Here, if the VF is still active,
5509 * we don't call hn_resume_mgmt() until the VF is deactivated in hn_rxvf_change().
5512 if (!(sc->hn_flags & HN_FLAG_RXVF))
5516 * Re-enable polling if this interface is running and
5517 * the polling is requested.
5519 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
5520 hn_polling(sc, sc->hn_pollhz);
5524 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
5526 const struct rndis_status_msg *msg;
5529 if (dlen < sizeof(*msg)) {
5530 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
5535 switch (msg->rm_status) {
5536 case RNDIS_STATUS_MEDIA_CONNECT:
5537 case RNDIS_STATUS_MEDIA_DISCONNECT:
5538 hn_update_link_status(sc);
5541 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
5542 case RNDIS_STATUS_LINK_SPEED_CHANGE:
5543 /* Not really useful; ignore. */
5546 case RNDIS_STATUS_NETWORK_CHANGE:
5547 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
5548 if (dlen < ofs + msg->rm_stbuflen ||
5549 msg->rm_stbuflen < sizeof(uint32_t)) {
5550 if_printf(sc->hn_ifp, "network changed\n");
5554 memcpy(&change, ((const uint8_t *)msg) + ofs, sizeof(change));
5556 if_printf(sc->hn_ifp, "network changed, change %u\n",
5559 hn_change_network(sc);
5563 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
5570 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
5572 const struct rndis_pktinfo *pi = info_data;
5575 while (info_dlen != 0) {
5579 if (__predict_false(info_dlen < sizeof(*pi)))
5581 if (__predict_false(info_dlen < pi->rm_size))
5583 info_dlen -= pi->rm_size;
5585 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
5587 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
5589 dlen = pi->rm_size - pi->rm_pktinfooffset;
5592 switch (pi->rm_type) {
5593 case NDIS_PKTINFO_TYPE_VLAN:
5594 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
5596 info->vlan_info = *((const uint32_t *)data);
5597 mask |= HN_RXINFO_VLAN;
5600 case NDIS_PKTINFO_TYPE_CSUM:
5601 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
5603 info->csum_info = *((const uint32_t *)data);
5604 mask |= HN_RXINFO_CSUM;
5607 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
5608 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
5610 info->hash_value = *((const uint32_t *)data);
5611 mask |= HN_RXINFO_HASHVAL;
5614 case HN_NDIS_PKTINFO_TYPE_HASHINF:
5615 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
5617 info->hash_info = *((const uint32_t *)data);
5618 mask |= HN_RXINFO_HASHINF;
5625 if (mask == HN_RXINFO_ALL) {
5626 /* All found; done */
5630 pi = (const struct rndis_pktinfo *)
5631 ((const uint8_t *)pi + pi->rm_size);
5636 * - If there is no hash value, invalidate the hash info.
5638 if ((mask & HN_RXINFO_HASHVAL) == 0)
5639 info->hash_info = HN_NDIS_HASH_INFO_INVALID;
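/*
 * Editorial sketch, not driver code: the self-sizing record walk
 * used by hn_rndis_rxinfo() above, on a hypothetical record layout.
 * Every length is validated before it is trusted, so a malformed
 * record cannot push the cursor past the buffer or stall the loop.
 */
struct ex_rec {
	uint32_t	er_size;	/* whole record, incl. header */
	uint32_t	er_type;
	uint32_t	er_dataofs;	/* payload offset in record */
};

static int
ex_walk_recs(const void *buf, int buflen)
{
	const struct ex_rec *r = buf;

	while (buflen != 0) {
		if (buflen < (int)sizeof(*r))
			return (EINVAL);	/* no room for a header */
		if (r->er_size < sizeof(*r) || r->er_size > (uint32_t)buflen)
			return (EINVAL);	/* bogus record size */
		if (r->er_dataofs > r->er_size)
			return (EINVAL);	/* payload escapes record */
		/* payload is at (const uint8_t *)r + r->er_dataofs */
		buflen -= r->er_size;
		r = (const struct ex_rec *)((const uint8_t *)r + r->er_size);
	}
	return (0);
}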
5643 static __inline bool
5644 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
5647 if (off < check_off) {
5648 if (__predict_true(off + len <= check_off))
5650 } else if (off > check_off) {
5651 if (__predict_true(check_off + check_len <= off))
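/*
 * Editorial note, not driver code: hn_rndis_check_overlap() returns
 * false only when [off, off + len) and [check_off, check_off +
 * check_len) are disjoint; equal offsets always count as an overlap.
 * For example:
 *
 *	hn_rndis_check_overlap(0, 8, 8, 4) -> false (adjacent ranges)
 *	hn_rndis_check_overlap(0, 9, 8, 4) -> true  (one byte shared)
 *	hn_rndis_check_overlap(8, 4, 8, 4) -> true  (same offset)
 */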
5658 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
5660 const struct rndis_packet_msg *pkt;
5661 struct hn_rxinfo info;
5662 int data_off, pktinfo_off, data_len, pktinfo_len;
5667 if (__predict_false(dlen < sizeof(*pkt))) {
5668 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
5673 if (__predict_false(dlen < pkt->rm_len)) {
5674 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
5675 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
5678 if (__predict_false(pkt->rm_len <
5679 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
5680 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
5681 "msglen %u, data %u, oob %u, pktinfo %u\n",
5682 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
5683 pkt->rm_pktinfolen);
5686 if (__predict_false(pkt->rm_datalen == 0)) {
5687 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
5694 #define IS_OFFSET_INVALID(ofs) \
5695 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
5696 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
5698 /* XXX Hyper-V does not meet data offset alignment requirement */
5699 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
5700 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5701 "data offset %u\n", pkt->rm_dataoffset);
5704 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
5705 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
5706 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5707 "oob offset %u\n", pkt->rm_oobdataoffset);
5710 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
5711 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
5712 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5713 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
5717 #undef IS_OFFSET_INVALID
5719 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
5720 data_len = pkt->rm_datalen;
5721 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
5722 pktinfo_len = pkt->rm_pktinfolen;
5725 * Check OOB coverage.
5727 if (__predict_false(pkt->rm_oobdatalen != 0)) {
5728 int oob_off, oob_len;
5730 if_printf(rxr->hn_ifp, "got oobdata\n");
5731 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
5732 oob_len = pkt->rm_oobdatalen;
5734 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
5735 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5736 "oob overflow, msglen %u, oob abs %d len %d\n",
5737 pkt->rm_len, oob_off, oob_len);
5742 * Check against data.
5744 if (hn_rndis_check_overlap(oob_off, oob_len,
5745 data_off, data_len)) {
5746 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5747 "oob overlaps data, oob abs %d len %d, "
5748 "data abs %d len %d\n",
5749 oob_off, oob_len, data_off, data_len);
5754 * Check against pktinfo.
5756 if (pktinfo_len != 0 &&
5757 hn_rndis_check_overlap(oob_off, oob_len,
5758 pktinfo_off, pktinfo_len)) {
5759 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5760 "oob overlaps pktinfo, oob abs %d len %d, "
5761 "pktinfo abs %d len %d\n",
5762 oob_off, oob_len, pktinfo_off, pktinfo_len);
5768 * Check per-packet-info coverage and find useful per-packet-info.
5770 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
5771 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
5772 info.hash_info = HN_NDIS_HASH_INFO_INVALID;
5773 if (__predict_true(pktinfo_len != 0)) {
5777 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
5778 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5779 "pktinfo overflow, msglen %u, "
5780 "pktinfo abs %d len %d\n",
5781 pkt->rm_len, pktinfo_off, pktinfo_len);
5786 * Check packet info coverage.
5788 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
5789 data_off, data_len);
5790 if (__predict_false(overlap)) {
5791 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5792 "pktinfo overlap data, pktinfo abs %d len %d, "
5793 "data abs %d len %d\n",
5794 pktinfo_off, pktinfo_len, data_off, data_len);
5799 * Find useful per-packet-info.
5801 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
5802 pktinfo_len, &info);
5803 if (__predict_false(error)) {
5804 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
5810 if (__predict_false(data_off + data_len > pkt->rm_len)) {
5811 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5812 "data overflow, msglen %u, data abs %d len %d\n",
5813 pkt->rm_len, data_off, data_len);
5816 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
5819 static __inline void
5820 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
5822 const struct rndis_msghdr *hdr;
5824 if (__predict_false(dlen < sizeof(*hdr))) {
5825 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
5830 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
5831 /* Hot data path. */
5832 hn_rndis_rx_data(rxr, data, dlen);
5837 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
5838 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
5839 else
5840 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
5844 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
5846 const struct hn_nvs_hdr *hdr;
5848 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
5849 if_printf(sc->hn_ifp, "invalid nvs notify\n");
5852 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
5854 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
5855 /* Useless; ignore */
5858 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
5862 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
5863 const struct vmbus_chanpkt_hdr *pkt)
5865 struct hn_nvs_sendctx *sndc;
5867 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
5868 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
5869 VMBUS_CHANPKT_DATALEN(pkt));
5872 * 'sndc' CAN NOT be accessed anymore, since it can be freed by its callback.
5878 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5879 const struct vmbus_chanpkt_hdr *pkthdr)
5881 const struct vmbus_chanpkt_rxbuf *pkt;
5882 const struct hn_nvs_hdr *nvs_hdr;
5885 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
5886 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
5889 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
5891 /* Make sure that this is a RNDIS message. */
5892 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
5893 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
5898 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
5899 if (__predict_false(hlen < sizeof(*pkt))) {
5900 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
5903 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
5905 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
5906 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
5911 count = pkt->cp_rxbuf_cnt;
5912 if (__predict_false(hlen <
5913 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
5914 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
5918 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
5919 for (i = 0; i < count; ++i) {
5922 ofs = pkt->cp_rxbuf[i].rb_ofs;
5923 len = pkt->cp_rxbuf[i].rb_len;
5924 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
5925 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
5926 "ofs %d, len %d\n", i, ofs, len);
5929 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
5933 * Ack the consumed RXBUF associated w/ this channel packet,
5934 * so that this RXBUF can be recycled by the hypervisor.
5936 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
5940 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5943 struct hn_nvs_rndis_ack ack;
5946 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
5947 ack.nvs_status = HN_NVS_STATUS_OK;
5951 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
5952 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
5953 if (__predict_false(error == EAGAIN)) {
5956 * This should _not_ happen in the real world, since the
5957 * consumption of the TX bufring from the TX path is controlled.
5960 if (rxr->hn_ack_failed == 0)
5961 if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
5962 rxr->hn_ack_failed++;
5969 if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
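/*
 * Editorial sketch, not driver code: the flexible-array bounds check
 * used in hn_nvs_handle_rxbuf() above.  __offsetof(type, array[n])
 * yields the header size needed to hold n elements, so a single
 * comparison validates the element count.  The struct here is
 * hypothetical.
 */
struct ex_rxbuf_pkt {
	uint32_t	ep_cnt;
	struct {
		uint32_t	eb_ofs;
		uint32_t	eb_len;
	}		ep_rb[];
};

static bool
ex_rxbuf_pkt_sane(const struct ex_rxbuf_pkt *p, size_t hlen)
{
	return (hlen >= __offsetof(struct ex_rxbuf_pkt, ep_rb[p->ep_cnt]));
}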
5974 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
5976 struct hn_rx_ring *rxr = xrxr;
5977 struct hn_softc *sc = rxr->hn_ifp->if_softc;
5980 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
5983 pktlen = rxr->hn_pktbuf_len;
5984 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
5985 if (__predict_false(error == ENOBUFS)) {
5990 * Expand channel packet buffer.
5993 * Use M_WAITOK here, since allocation failure is not an option.
5996 nlen = rxr->hn_pktbuf_len * 2;
5997 while (nlen < pktlen)
5998 nlen *= 2;
5999 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
6001 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
6002 rxr->hn_pktbuf_len, nlen);
6004 free(rxr->hn_pktbuf, M_DEVBUF);
6005 rxr->hn_pktbuf = nbuf;
6006 rxr->hn_pktbuf_len = nlen;
6009 } else if (__predict_false(error == EAGAIN)) {
6010 /* No more channel packets; done! */
6013 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
6015 switch (pkt->cph_type) {
6016 case VMBUS_CHANPKT_TYPE_COMP:
6017 hn_nvs_handle_comp(sc, chan, pkt);
6020 case VMBUS_CHANPKT_TYPE_RXBUF:
6021 hn_nvs_handle_rxbuf(rxr, chan, pkt);
6024 case VMBUS_CHANPKT_TYPE_INBAND:
6025 hn_nvs_handle_notify(sc, pkt);
6029 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
6034 hn_chan_rollup(rxr, rxr->hn_txr);
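/*
 * Editorial sketch, not driver code: the grow-to-fit doubling used
 * above when vmbus_chan_recv_pkt() returns ENOBUFS.  Doubling keeps
 * the reallocation count logarithmic in the largest packet seen, and
 * the buffer never shrinks, so the steady state allocates nothing.
 */
static int
ex_grow_pktbuf_len(int curlen, int need)
{
	int nlen = curlen * 2;

	while (nlen < need)
		nlen *= 2;
	return (nlen);
}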
6038 hn_sysinit(void *arg __unused)
6043 * Initialize VF map.
6045 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
6046 hn_vfmap_size = HN_VFMAP_SIZE_DEF;
6047 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF, M_WAITOK | M_ZERO);
6051 * Fix the # of TX taskqueues.
6053 if (hn_tx_taskq_cnt <= 0)
6054 hn_tx_taskq_cnt = 1;
6055 else if (hn_tx_taskq_cnt > mp_ncpus)
6056 hn_tx_taskq_cnt = mp_ncpus;
6059 * Fix the TX taskqueue mode.
6061 switch (hn_tx_taskq_mode) {
6062 case HN_TX_TASKQ_M_INDEP:
6063 case HN_TX_TASKQ_M_GLOBAL:
6064 case HN_TX_TASKQ_M_EVTTQ:
6067 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
6071 if (vm_guest != VM_GUEST_HV)
6074 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
6077 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
6078 M_DEVBUF, M_WAITOK);
6079 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
6080 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
6081 taskqueue_thread_enqueue, &hn_tx_taskque[i]);
6082 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
6086 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
6089 hn_sysuninit(void *arg __unused)
6092 if (hn_tx_taskque != NULL) {
6095 for (i = 0; i < hn_tx_taskq_cnt; ++i)
6096 taskqueue_free(hn_tx_taskque[i]);
6097 free(hn_tx_taskque, M_DEVBUF);
6100 if (hn_vfmap != NULL)
6101 free(hn_vfmap, M_DEVBUF);
6102 rm_destroy(&hn_vfmap_lock);
6104 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);