 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/ethernet.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"
#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

#define HN_VFMAP_SIZE_DEF		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512
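/*
 * Worst-case RNDIS packet message size reserved in front of each
 * packet: the fixed header plus the only pktinfo this driver ever
 * appends (hash value, VLAN, LSOv2 and TX checksum).
 */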
#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1
#define HN_LOCK_INIT(sc)					\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
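/*
 * Estimated chimney/aggregation buffer footprint of a packet:
 * payload length plus the worst-case RNDIS packet message, rounded
 * up to the aggregation alignment.  HN_PKTSIZE_MIN() assumes a
 * minimum Ethernet frame (VLAN tagged, without FCS).
 */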
#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004
struct hn_update_vf {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static void			hn_ifnet_event(void *, struct ifnet *, int);
static void			hn_ifaddr_event(void *, struct ifnet *);
static void			hn_ifnet_attevent(void *, struct ifnet *);
static void			hn_ifnet_detevent(void *, struct ifnet *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);
static void			hn_rndis_init_fixat(struct hn_softc *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);

static void			hn_stop(struct hn_softc *, bool);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_disable_rx(struct hn_softc *);
static void			hn_drain_rxtx(struct hn_softc *, int);
static void			hn_polling(struct hn_softc *, u_int);
static void			hn_chan_polling(struct vmbus_channel *, u_int);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);
static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
static int			hn_rxfilter_config(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust tcp segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");
/* Trust udp datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");
/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");

#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");
#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif
/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif
/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vflist_sysctl, "A", "VF list");

SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vfmap_sysctl, "A", "VF mapping");
static u_int hn_cpu_index;		/* next CPU for channel */
static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */

static struct rmlock hn_vfmap_lock;
static int hn_vfmap_size;
static struct ifnet **hn_vfmap;

static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};

static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
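/*
 * Allocate a chimney sending buffer slot.  Free slots are tracked
 * in a lockless bitmap; atomic_testandset_long() both claims the
 * bit and detects races with other CPUs, so no lock is needed.
 */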
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
#if defined(INET6) || defined(INET)
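/*
 * Make sure that at least the first 'len' bytes of the mbuf chain
 * are contiguous, pulling the data up if necessary.  On m_pullup()
 * failure the chain is already freed and the enclosing function
 * returns NULL, which is why this is a macro and not a function.
 */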
#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
}
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct ether_vlan_header *evl;
	const struct tcphdr *th;
	int ehlen;

	*tcpsyn = 0;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TCP) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);
		if (th->th_flags & TH_SYN)
			*tcpsyn = 1;
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP)
			return (m_head);

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));
		if (th->th_flags & TH_SYN)
			*tcpsyn = 1;
	}
#endif
	return (m_head);
}
#endif	/* INET6 || INET */
static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}

static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	if ((ifp->if_flags & IFF_PROMISC) ||
	    (sc->hn_flags & HN_FLAG_VF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}
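/*
 * Compute the per-ring transmission aggregation limits: the minimum
 * of the user settings, what RNDIS offered at attach time, and the
 * chimney sending buffer size, since packets are only aggregated
 * inside chimney sending buffers.
 */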
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}
static void
hn_update_vf_task(void *arg, int pending __unused)
{
	struct hn_update_vf *uv = arg;

	uv->rxr->hn_rxvf_ifp = uv->vf;
}

static void
hn_update_vf(struct hn_softc *sc, struct ifnet *vf)
{
	struct hn_rx_ring *rxr;
	struct hn_update_vf uv;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_update_vf_task, &uv);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			uv.rxr = rxr;
			uv.vf = vf;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_rxvf_ifp = vf;
		}
	}
}
static __inline bool
hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
{
	const struct ifnet *hn_ifp;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		return (false);

	if (ifp->if_alloctype != IFT_ETHER)
		return (false);

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		return (false);

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		return (false);

	return (true);
}
static void
hn_set_vf(struct hn_softc *sc, struct ifnet *ifp, bool vf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	if (!hn_ismyvf(sc, ifp))
		goto out;

	hn_ifp = sc->hn_ifp;

	/* Now we're sure 'ifp' is a real VF device. */
	if (vf) {
		if (sc->hn_flags & HN_FLAG_VF)
			goto out;

		sc->hn_flags |= HN_FLAG_VF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_VF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_VF;
		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    vf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTHETIC);

	hn_update_vf(sc, vf ? ifp : NULL);

	if (vf) {
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp),
	    vf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "Data path is switched %s %s\n",
		    vf ? "to" : "from", if_name(ifp));
	}
out:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
{

	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;
	hn_set_vf(arg, ifp, event == IFNET_EVENT_UP);
}

static void
hn_ifaddr_event(void *arg, struct ifnet *ifp)
{

	hn_set_vf(arg, ifp, ifp->if_flags & IFF_UP);
}

static void
hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    sc->hn_vf_ifp->if_xname);
		goto done;
	}
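	/*
	 * Record the VF's ifindex in the global hn_vfmap, growing the
	 * map as needed; it maps a VF interface back to its hn(4)
	 * parent interface.
	 */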
	rm_wlock(&hn_vfmap_lock);

	if (ifp->if_index >= hn_vfmap_size) {
		struct ifnet **newmap;
		int newsize;

		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);

		memcpy(newmap, hn_vfmap,
		    sizeof(struct ifnet *) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);

		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}
	KASSERT(hn_vfmap[ifp->if_index] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
	hn_vfmap[ifp->if_index] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	sc->hn_vf_ifp = ifp;
done:
	HN_UNLOCK(sc);
}

static void
hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (sc->hn_vf_ifp == NULL)
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	sc->hn_vf_ifp = NULL;

	rm_wlock(&hn_vfmap_lock);

	KASSERT(ifp->if_index < hn_vfmap_size,
	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
	if (hn_vfmap[ifp->if_index] != NULL) {
		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
		    ("%s: ifindex %d was mapped to %s",
		     ifp->if_xname, ifp->if_index,
		     hn_vfmap[ifp->if_index]->if_xname));
		hn_vfmap[ifp->if_index] = NULL;
	}

	rm_wunlock(&hn_vfmap_lock);
done:
	HN_UNLOCK(sc);
}
/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};

static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
	    &g_net_vsc_device_type) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}
static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions, which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if error happened later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;
#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fix up TX configuration after the synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);

	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disable polling");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_vf_sysctl, "A", "Virtual Function's name");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */

#ifdef __LP64__
	ifp->if_baudrate = IF_Gbps(10);
#else
	/* if_baudrate is 32bits on 32bit system. */
	ifp->if_baudrate = IF_Gbps(1);
#endif
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
#ifdef foo
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default, they still can
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
	    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
	sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
	    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);

	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);

	return (0);
failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}
static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;

	if (sc->hn_ifaddr_evthand != NULL)
		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
	if (sc->hn_ifnet_evthand != NULL)
		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
	if (sc->hn_ifnet_atthand != NULL) {
		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
		    sc->hn_ifnet_atthand);
	}
	if (sc->hn_ifnet_dethand != NULL) {
		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
		    sc->hn_ifnet_dethand);
	}

	vf_ifp = sc->hn_vf_ifp;
	__compiler_membar();
	if (vf_ifp != NULL)
		hn_ifnet_detevent(sc, vf_ifp);

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		/*
		 * In case that the vmbus missed the orphan handler
		 * installation.
		 */
		vmbus_xact_ctx_orphan(sc->hn_xact);
	}

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc, true);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * tasks have to be suspended manually here.
			 */
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);
		}
		HN_UNLOCK(sc);
		ether_ifdetach(ifp);
	}

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(sc->hn_tx_taskqs[i]);
		free(sc->hn_tx_taskqs, M_DEVBUF);
	}
	taskqueue_free(sc->hn_mgmt_taskq0);

	if (sc->hn_xact != NULL) {
		/*
		 * Uninstall the orphan handler _before_ the xact is
		 * destroyed.
		 */
		vmbus_chan_unset_orphan(sc->hn_prichan);
		vmbus_xact_ctx_destroy(sc->hn_xact);
	}

	if_free(ifp);

	HN_LOCK_DESTROY(sc);
	return (0);
}
static int
hn_shutdown(device_t dev)
{

	return (0);
}

static void
hn_link_status(struct hn_softc *sc)
{
	uint32_t link_status;
	int error;

	error = hn_rndis_get_linkstatus(sc, &link_status);
	if (error) {
		/* XXX what to do? */
		return;
	}

	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
	else
		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp,
	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
	    LINK_STATE_UP : LINK_STATE_DOWN);
}
static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		return;
	hn_link_status(sc);
}

static void
hn_netchg_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Prevent any link status checks from running. */
	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;

	/*
	 * Fake up a [link down --> link up] state change; 5 seconds
	 * delay is used, which closely simulates miibus reaction
	 * upon link down event.
	 */
	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
	    &sc->hn_netchg_status, 5 * hz);
}

static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Re-allow link status checks. */
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
	hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}
static int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m = *m_head;
	int error;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		struct mbuf *m_new;

		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		if (m_new == NULL)
			return ENOBUFS;
		else
			*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	}
	if (!error) {
		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
		    BUS_DMASYNC_PREWRITE);
		txd->flags |= HN_TXD_FLAG_DMAMAP;
	}
	return error;
}
static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return 0;

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("recursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
	txr->hn_txdesc_avail++;
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else	/* HN_USE_TXDESC_BUFRING */
#ifdef HN_DEBUG
	atomic_add_int(&txr->hn_txdesc_avail, 1);
#endif
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif	/* !HN_USE_TXDESC_BUFRING */

	return 1;
}
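/*
 * txdesc get/put form a simple allocator: a descriptor is either on
 * the per-ring free list (or buf_ring) or in flight with a positive
 * reference count; the final hn_txdesc_put() recycles it.
 */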
static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	txd = SLIST_FIRST(&txr->hn_txlist);
	if (txd != NULL) {
		KASSERT(txr->hn_txdesc_avail > 0,
		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
		txr->hn_txdesc_avail--;
		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
	}
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

	if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
#ifdef HN_DEBUG
		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
#endif	/* HN_USE_TXDESC_BUFRING */
		KASSERT(txd->m == NULL && txd->refs == 0 &&
		    STAILQ_EMPTY(&txd->agg_list) &&
		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
		    txd->chim_size == 0 &&
		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
		txd->flags &= ~HN_TXD_FLAG_ONLIST;
		txd->refs = 1;
	}
	return txd;
}
static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

	/* 0->1 transition will never work */
	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	atomic_add_int(&txd->refs, 1);
}

static __inline void
hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
{

	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("recursive aggregation on aggregating txdesc"));

	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("already aggregated"));
	KASSERT(STAILQ_EMPTY(&txd->agg_list),
	    ("recursive aggregation on to-be-aggregated txdesc"));

	txd->flags |= HN_TXD_FLAG_ONAGG;
	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
}
static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
	bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
		pending = true;
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	if (!buf_ring_full(txr->hn_txdesc_br))
		pending = true;
#endif
	return (pending);
}
static __inline void
hn_txeof(struct hn_tx_ring *txr)
{
	txr->hn_has_txeof = 0;
	txr->hn_txeof(txr);
}
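/*
 * NVS send-completion callback, installed by hn_encap() through
 * hn_nvs_sendctx_init(); it runs in the channel's processing context
 * and drops the send reference on the txdesc.
 */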
static void
hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
	struct hn_txdesc *txd = sndc->hn_cbarg;
	struct hn_tx_ring *txr;

	txr = txd->txr;
	KASSERT(txr->hn_chan == chan,
	    ("channel mismatch, on chan%u, should be chan%u",
	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));

	txr->hn_has_txeof = 1;
	hn_txdesc_put(txr, txd);

	++txr->hn_txdone_cnt;
	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
		txr->hn_txdone_cnt = 0;
		if (txr->hn_oactive)
			hn_txeof(txr);
	}
}
static void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
	struct lro_ctrl *lro = &rxr->hn_lro;
	struct lro_entry *queued;

	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
		SLIST_REMOVE_HEAD(&lro->lro_active, next);
		tcp_lro_flush(lro, queued);
	}
#endif

	/*
	 * NOTE:
	 * 'txr' could be NULL, if multiple channels and
	 * ifnet.if_start method are enabled.
	 */
	if (txr == NULL || !txr->hn_has_txeof)
		return;

	txr->hn_txdone_cnt = 0;
	hn_txeof(txr);
}
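/*
 * Convert an offset counted from the beginning of the RNDIS packet
 * message into the on-wire form, which counts from the rm_dataoffset
 * field instead.
 */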
static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
	    ("invalid RNDIS packet msg offset %u", ofs));
	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}

static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
{
	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
	struct rndis_pktinfo *pi;

	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

	/*
	 * Per-packet-info does not move; it only grows.
	 *
	 * NOTE:
	 * rm_pktinfooffset in this phase counts from the beginning
	 * of rndis_packet_msg.
	 */
	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
	    pkt->rm_pktinfolen);
	pkt->rm_pktinfolen += pi_size;

	pi->rm_size = pi_size;
	pi->rm_type = pi_type;
	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

	return (pi->rm_data);
}
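/*
 * Send the currently aggregating txdesc down to the host and reset
 * the per-ring aggregation state; on failure the pending mbuf is
 * freed and oerrors is charged with all aggregated packets.
 */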
static __inline int
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;
	struct mbuf *m;
	int error, pkts;

	txd = txr->hn_agg_txd;
	KASSERT(txd != NULL, ("no aggregate txdesc"));

	/*
	 * Since hn_txpkt() will reset this temporary stat, save
	 * it now, so that oerrors can be updated properly, if
	 * hn_txpkt() ever fails.
	 */
	pkts = txr->hn_stat_pkts;

	/*
	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
	 * failure, save it for later freeing, if hn_txpkt() ever
	 * fails.
	 */
	m = txd->m;
	error = hn_txpkt(ifp, txr, txd);
	if (__predict_false(error)) {
		/* txd is freed, but m is not. */
		m_freem(m);

		txr->hn_flush_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
	}

	/* Reset all aggregation states. */
	txr->hn_agg_txd = NULL;
	txr->hn_agg_szleft = 0;
	txr->hn_agg_pktleft = 0;
	txr->hn_agg_prevpkt = NULL;

	return (error);
}
static void *
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    int pktsize)
{
	void *chim;

	if (txr->hn_agg_txd != NULL) {
		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
			int olen;

			/*
			 * Update the previous RNDIS packet's total length,
			 * it can be increased due to the mandatory alignment
			 * padding for this RNDIS packet.  And update the
			 * aggregating txdesc's chimney sending buffer size
			 * accordingly.
			 *
			 * NOTE:
			 * Zero-out the padding, as required by the RNDIS spec.
			 */
			olen = pkt->rm_len;
			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
			agg_txd->chim_size += pkt->rm_len - olen;

			/* Link this txdesc to the parent. */
			hn_txdesc_agg(agg_txd, txd);

			chim = (uint8_t *)pkt + pkt->rm_len;
			/* Save the current packet for later fixup. */
			txr->hn_agg_prevpkt = chim;

			txr->hn_agg_pktleft--;
			txr->hn_agg_szleft -= pktsize;
			if (txr->hn_agg_szleft <=
			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
				/*
				 * Probably can't aggregate more packets,
				 * flush this aggregating txdesc proactively.
				 */
				txr->hn_agg_pktleft = 0;
			}
			/* Done! */
			return (chim);
		}
		hn_flush_txagg(ifp, txr);
	}
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	txr->hn_tx_chimney_tried++;
	txd->chim_index = hn_chim_alloc(txr->hn_sc);
	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
		return (NULL);
	txr->hn_tx_chimney++;

	chim = txr->hn_sc->hn_chim +
	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);

	if (txr->hn_agg_pktmax > 1 &&
	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
		txr->hn_agg_txd = txd;
		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
		txr->hn_agg_prevpkt = chim;
	}
	return (chim);
}
2088 * If this function fails, then both txd and m_head0 will be freed.
2091 hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
2092 struct mbuf **m_head0)
2094 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
2095 int error, nsegs, i;
2096 struct mbuf *m_head = *m_head0;
2097 struct rndis_packet_msg *pkt;
2100 int pkt_hlen, pkt_size;
2102 pkt = txd->rndis_pkt;
2103 pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
2104 if (pkt_size < txr->hn_chim_size) {
2105 chim = hn_try_txagg(ifp, txr, txd, pkt_size);
2109 if (txr->hn_agg_txd != NULL)
2110 hn_flush_txagg(ifp, txr);
2113 pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
2114 pkt->rm_len = m_head->m_pkthdr.len;
2115 pkt->rm_dataoffset = 0;
2116 pkt->rm_datalen = m_head->m_pkthdr.len;
2117 pkt->rm_oobdataoffset = 0;
2118 pkt->rm_oobdatalen = 0;
2119 pkt->rm_oobdataelements = 0;
2120 pkt->rm_pktinfooffset = sizeof(*pkt);
2121 pkt->rm_pktinfolen = 0;
2122 pkt->rm_vchandle = 0;
2123 pkt->rm_reserved = 0;
2125 if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
2127 * Set the hash value for this packet, so that the host could
2128 * dispatch the TX done event for this packet back to this TX
2131 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2132 HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
2133 *pi_data = txr->hn_tx_idx;
2136 if (m_head->m_flags & M_VLANTAG) {
2137 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2138 NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
2139 *pi_data = NDIS_VLAN_INFO_MAKE(
2140 EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
2141 EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
2142 EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
2145 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
2146 #if defined(INET6) || defined(INET)
2147 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2148 NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
2150 if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
2151 *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
2152 m_head->m_pkthdr.tso_segsz);
2155 #if defined(INET6) && defined(INET)
2160 *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
2161 m_head->m_pkthdr.tso_segsz);
2164 #endif /* INET6 || INET */
2165 } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
2166 pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
2167 NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
2168 if (m_head->m_pkthdr.csum_flags &
2169 (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
2170 *pi_data = NDIS_TXCSUM_INFO_IPV6;
2172 *pi_data = NDIS_TXCSUM_INFO_IPV4;
2173 if (m_head->m_pkthdr.csum_flags & CSUM_IP)
2174 *pi_data |= NDIS_TXCSUM_INFO_IPCS;
2177 if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
2178 *pi_data |= NDIS_TXCSUM_INFO_TCPCS;
2179 else if (m_head->m_pkthdr.csum_flags &
2180 (CSUM_IP_UDP | CSUM_IP6_UDP))
2181 *pi_data |= NDIS_TXCSUM_INFO_UDPCS;
2184 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
2185 /* Fixup RNDIS packet message total length */
2186 pkt->rm_len += pkt_hlen;
2187 /* Convert RNDIS packet message offsets */
2188 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
2189 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
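/*
 * Note (added for clarity): RNDIS per-packet offsets are not measured
 * from the start of the message; hn_rndis_pktmsg_offset() rebases an
 * absolute in-message offset so that it is relative to the
 * rm_dataoffset field, i.e. it subtracts the leading rm_type/rm_len
 * header bytes.
 */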
2192 * Fast path: Chimney sending.
2195 struct hn_txdesc *tgt_txd = txd;
2197 if (txr->hn_agg_txd != NULL) {
2198 tgt_txd = txr->hn_agg_txd;
2204 KASSERT(pkt == chim,
2205 ("RNDIS pkt not in chimney sending buffer"));
2206 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
2207 ("chimney sending buffer is not used"));
2208 tgt_txd->chim_size += pkt->rm_len;
2210 m_copydata(m_head, 0, m_head->m_pkthdr.len,
2211 ((uint8_t *)chim) + pkt_hlen);
2213 txr->hn_gpa_cnt = 0;
2214 txr->hn_sendpkt = hn_txpkt_chim;
2218 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
2219 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2220 ("chimney buffer is used"));
2221 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
2223 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
2224 if (__predict_false(error)) {
2228 * This mbuf is not linked w/ the txd yet, so free it now.
2233 freed = hn_txdesc_put(txr, txd);
2235 ("fail to free txd upon txdma error"));
2237 txr->hn_txdma_failed++;
2238 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2243 /* +1 RNDIS packet message */
2244 txr->hn_gpa_cnt = nsegs + 1;
2246 /* send packet with page buffer */
2247 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
2248 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
2249 txr->hn_gpa[0].gpa_len = pkt_hlen;
2252 * Fill the page buffers with mbuf info after the page
2253 * buffer for the RNDIS packet message.
2255 for (i = 0; i < nsegs; ++i) {
2256 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
2258 gpa->gpa_page = atop(segs[i].ds_addr);
2259 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
2260 gpa->gpa_len = segs[i].ds_len;
2263 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2265 txr->hn_sendpkt = hn_txpkt_sglist;
2269 /* Set the completion routine */
2270 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
2272 /* Update temporary stats for later use. */
2273 txr->hn_stat_pkts++;
2274 txr->hn_stat_size += m_head->m_pkthdr.len;
2275 if (m_head->m_flags & M_MCAST)
2276 txr->hn_stat_mcasts++;
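/*
 * Note (added for clarity): hn_encap() above takes one of two paths.
 * Sufficiently small packets are copied into a host-shared chimney
 * buffer, possibly aggregated with other small packets; everything
 * else is DMA-mapped and handed to the host as a scatter/gather list
 * of guest physical address ranges in txr->hn_gpa[], with the RNDIS
 * packet message itself occupying the first segment.
 */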
2283 * If this function fails, then txd will be freed, but the mbuf
2284 * associated w/ the txd will _not_ be freed.
2287 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
2289 int error, send_failed = 0, has_bpf;
2292 has_bpf = bpf_peers_present(ifp->if_bpf);
2295 * Make sure that this txd and any aggregated txds are not
2296 * freed before ETHER_BPF_MTAP.
2298 hn_txdesc_hold(txd);
2300 error = txr->hn_sendpkt(txr, txd);
2303 const struct hn_txdesc *tmp_txd;
2305 ETHER_BPF_MTAP(ifp, txd->m);
2306 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
2307 ETHER_BPF_MTAP(ifp, tmp_txd->m);
2310 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
2311 #ifdef HN_IFSTART_SUPPORT
2312 if (!hn_use_if_start)
2315 if_inc_counter(ifp, IFCOUNTER_OBYTES,
2317 if (txr->hn_stat_mcasts != 0) {
2318 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
2319 txr->hn_stat_mcasts);
2322 txr->hn_pkts += txr->hn_stat_pkts;
2326 hn_txdesc_put(txr, txd);
2328 if (__predict_false(error)) {
2332 * This should very rarely happen.
2334 * XXX Too many RX to be acked or too many sideband
2335 * commands to run? Ask netvsc_channel_rollup()
2336 * to kick start later.
2338 txr->hn_has_txeof = 1;
2340 txr->hn_send_failed++;
2343 * Try sending again after setting hn_has_txeof,
2344 * in case we missed the last
2345 * netvsc_channel_rollup().
2349 if_printf(ifp, "send failed\n");
2352 * Caller will perform further processing on the
2353 * associated mbuf, so don't free it in hn_txdesc_put();
2354 * only unload it from the DMA map in hn_txdesc_put(),
2358 freed = hn_txdesc_put(txr, txd);
2360 ("fail to free txd upon send error"));
2362 txr->hn_send_failed++;
2365 /* Reset temporary stats, after this sending is done. */
2366 txr->hn_stat_size = 0;
2367 txr->hn_stat_pkts = 0;
2368 txr->hn_stat_mcasts = 0;
2374 * Append the specified data to the indicated mbuf chain;
2375 * Extend the mbuf chain if the new data does not fit in
2378 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2379 * There should be an equivalent in the kernel mbuf code,
2380 * but there does not appear to be one yet.
2382 * Differs from m_append() in that additional mbufs are
2383 * allocated with cluster size MJUMPAGESIZE, and filled
2386 * Return 1 if able to complete the job; otherwise 0.
2389 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2392 int remainder, space;
2394 for (m = m0; m->m_next != NULL; m = m->m_next)
2397 space = M_TRAILINGSPACE(m);
2400 * Copy into available space.
2402 if (space > remainder)
2404 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2409 while (remainder > 0) {
2411 * Allocate a new mbuf; could check space
2412 * and allocate a cluster instead.
2414 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
2417 n->m_len = min(MJUMPAGESIZE, remainder);
2418 bcopy(cp, mtod(n, caddr_t), n->m_len);
2420 remainder -= n->m_len;
2424 if (m0->m_flags & M_PKTHDR)
2425 m0->m_pkthdr.len += len - remainder;
2427 return (remainder == 0);
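/*
 * Usage sketch (added; hypothetical, not part of the driver): copying
 * a received network buffer into an mbuf chain with hv_m_append(),
 * extending the chain with MJUMPAGESIZE clusters as needed.
 */
static int
hv_m_append_usage_sketch(struct mbuf *m, const void *buf, int buflen)
{
	/* hv_m_append() returns 1 on success; 0 means allocation failed. */
	if (!hv_m_append(m, buflen, buf))
		return (ENOBUFS);
	return (0);
}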
2430 #if defined(INET) || defined(INET6)
2432 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
2434 #if __FreeBSD_version >= 1100095
2435 if (hn_lro_mbufq_depth) {
2436 tcp_lro_queue_mbuf(lc, m);
2440 return tcp_lro_rx(lc, m, 0);
2445 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2446 const struct hn_rxinfo *info)
2450 int size, do_lro = 0, do_csum = 1;
2451 int hash_type = M_HASHTYPE_OPAQUE;
2453 /* If the VF is active, inject the packet through the VF */
2454 ifp = rxr->hn_rxvf_ifp ? rxr->hn_rxvf_ifp : rxr->hn_ifp;
2456 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
2459 * See the NOTE of hn_rndis_init_fixat(). This
2460 * function can be reached immediately after the
2461 * RNDIS is initialized but before the ifnet is
2462 * set up on the hn_attach() path; drop the unexpected
2468 if (dlen <= MHLEN) {
2469 m_new = m_gethdr(M_NOWAIT, MT_DATA);
2470 if (m_new == NULL) {
2471 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2474 memcpy(mtod(m_new, void *), data, dlen);
2475 m_new->m_pkthdr.len = m_new->m_len = dlen;
2476 rxr->hn_small_pkts++;
2479 * Get an mbuf with a cluster. For packets 2K or less,
2480 * get a standard 2K cluster. For anything larger, get a
2481 * 4K cluster. Any buffers larger than 4K can cause problems
2482 * if looped around to the Hyper-V TX channel, so avoid them.
2485 if (dlen > MCLBYTES) {
2487 size = MJUMPAGESIZE;
2490 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2491 if (m_new == NULL) {
2492 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2496 hv_m_append(m_new, dlen, data);
2498 m_new->m_pkthdr.rcvif = ifp;
2500 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2503 /* receive side checksum offload */
2504 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2505 /* IP csum offload */
2506 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2507 m_new->m_pkthdr.csum_flags |=
2508 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2512 /* TCP/UDP csum offload */
2513 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2514 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2515 m_new->m_pkthdr.csum_flags |=
2516 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2517 m_new->m_pkthdr.csum_data = 0xffff;
2518 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2526 * As of this writing (Oct 28th, 2016), the host side will turn
2527 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2528 * the do_lro setting here is actually _not_ accurate. We
2529 * depend on the RSS hash type check to reset do_lro.
2531 if ((info->csum_info &
2532 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2533 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2536 const struct ether_header *eh;
2541 if (m_new->m_len < hoff)
2543 eh = mtod(m_new, struct ether_header *);
2544 etype = ntohs(eh->ether_type);
2545 if (etype == ETHERTYPE_VLAN) {
2546 const struct ether_vlan_header *evl;
2548 hoff = sizeof(*evl);
2549 if (m_new->m_len < hoff)
2551 evl = mtod(m_new, struct ether_vlan_header *);
2552 etype = ntohs(evl->evl_proto);
2555 if (etype == ETHERTYPE_IP) {
2558 pr = hn_check_iplen(m_new, hoff);
2559 if (pr == IPPROTO_TCP) {
2561 (rxr->hn_trust_hcsum &
2562 HN_TRUST_HCSUM_TCP)) {
2563 rxr->hn_csum_trusted++;
2564 m_new->m_pkthdr.csum_flags |=
2565 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2566 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2567 m_new->m_pkthdr.csum_data = 0xffff;
2570 } else if (pr == IPPROTO_UDP) {
2572 (rxr->hn_trust_hcsum &
2573 HN_TRUST_HCSUM_UDP)) {
2574 rxr->hn_csum_trusted++;
2575 m_new->m_pkthdr.csum_flags |=
2576 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2577 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2578 m_new->m_pkthdr.csum_data = 0xffff;
2580 } else if (pr != IPPROTO_DONE && do_csum &&
2581 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2582 rxr->hn_csum_trusted++;
2583 m_new->m_pkthdr.csum_flags |=
2584 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2589 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2590 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2591 NDIS_VLAN_INFO_ID(info->vlan_info),
2592 NDIS_VLAN_INFO_PRI(info->vlan_info),
2593 NDIS_VLAN_INFO_CFI(info->vlan_info));
2594 m_new->m_flags |= M_VLANTAG;
2597 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2599 m_new->m_pkthdr.flowid = info->hash_value;
2600 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2601 NDIS_HASH_FUNCTION_TOEPLITZ) {
2602 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2606 * do_lro is reset if the hash types are not TCP
2607 * related. See the comment in the above csum_flags
2611 case NDIS_HASH_IPV4:
2612 hash_type = M_HASHTYPE_RSS_IPV4;
2616 case NDIS_HASH_TCP_IPV4:
2617 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2620 case NDIS_HASH_IPV6:
2621 hash_type = M_HASHTYPE_RSS_IPV6;
2625 case NDIS_HASH_IPV6_EX:
2626 hash_type = M_HASHTYPE_RSS_IPV6_EX;
2630 case NDIS_HASH_TCP_IPV6:
2631 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2634 case NDIS_HASH_TCP_IPV6_EX:
2635 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2640 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2642 M_HASHTYPE_SET(m_new, hash_type);
2645 * Note: Moved RX completion back to hv_nv_on_receive() so all
2646 * messages (not just data messages) will trigger a response.
2652 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2653 #if defined(INET) || defined(INET6)
2654 struct lro_ctrl *lro = &rxr->hn_lro;
2657 rxr->hn_lro_tried++;
2658 if (hn_lro_rx(lro, m_new) == 0) {
2666 /* We're not holding the lock here, so don't release it */
2667 (*ifp->if_input)(ifp, m_new);
2673 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2675 struct hn_softc *sc = ifp->if_softc;
2676 struct ifreq *ifr = (struct ifreq *)data;
2677 int mask, error = 0;
2681 if (ifr->ifr_mtu > HN_MTU_MAX) {
2688 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2693 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2694 /* Can't change MTU */
2700 if (ifp->if_mtu == ifr->ifr_mtu) {
2706 * Suspend this interface before the synthetic parts
2712 * Detach the synthetic parts, i.e. NVS and RNDIS.
2714 hn_synth_detach(sc);
2717 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2718 * with the new MTU setting.
2720 error = hn_synth_attach(sc, ifr->ifr_mtu);
2727 * Commit the requested MTU, after the synthetic parts
2728 * have been successfully attached.
2730 ifp->if_mtu = ifr->ifr_mtu;
2733 * Make sure that various parameters based on MTU are
2734 * still valid, after the MTU change.
2736 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2737 hn_set_chim_size(sc, sc->hn_chim_szmax);
2738 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2739 #if __FreeBSD_version >= 1100099
2740 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2741 HN_LRO_LENLIM_MIN(ifp))
2742 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2746 * All done! Resume the interface now.
2756 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2761 if (ifp->if_flags & IFF_UP) {
2762 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2764 * Caller might hold a mutex, e.g.
2765 * bpf; use busy-wait for the RNDIS
2769 hn_rxfilter_config(sc);
2775 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2778 sc->hn_if_flags = ifp->if_flags;
2785 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2787 if (mask & IFCAP_TXCSUM) {
2788 ifp->if_capenable ^= IFCAP_TXCSUM;
2789 if (ifp->if_capenable & IFCAP_TXCSUM)
2790 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2792 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2794 if (mask & IFCAP_TXCSUM_IPV6) {
2795 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2796 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2797 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2799 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2802 /* TODO: flip RNDIS offload parameters for RXCSUM. */
2803 if (mask & IFCAP_RXCSUM)
2804 ifp->if_capenable ^= IFCAP_RXCSUM;
2806 /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2807 if (mask & IFCAP_RXCSUM_IPV6)
2808 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2811 if (mask & IFCAP_LRO)
2812 ifp->if_capenable ^= IFCAP_LRO;
2814 if (mask & IFCAP_TSO4) {
2815 ifp->if_capenable ^= IFCAP_TSO4;
2816 if (ifp->if_capenable & IFCAP_TSO4)
2817 ifp->if_hwassist |= CSUM_IP_TSO;
2819 ifp->if_hwassist &= ~CSUM_IP_TSO;
2821 if (mask & IFCAP_TSO6) {
2822 ifp->if_capenable ^= IFCAP_TSO6;
2823 if (ifp->if_capenable & IFCAP_TSO6)
2824 ifp->if_hwassist |= CSUM_IP6_TSO;
2826 ifp->if_hwassist &= ~CSUM_IP6_TSO;
2836 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2840 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2842 * Multicast uses mutex; use busy-wait for
2846 hn_rxfilter_config(sc);
2855 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2859 error = ether_ioctl(ifp, cmd, data);
2866 hn_stop(struct hn_softc *sc, bool detaching)
2868 struct ifnet *ifp = sc->hn_ifp;
2873 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2874 ("synthetic parts were not attached"));
2876 /* Disable polling. */
2879 /* Clear RUNNING bit _before_ hn_suspend_data() */
2880 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2881 hn_suspend_data(sc);
2883 /* Clear OACTIVE bit. */
2884 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2885 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2886 sc->hn_tx_ring[i].hn_oactive = 0;
2889 * If the VF is active, make sure the filter is not 0, even if
2890 * the synthetic NIC is down.
2892 if (!detaching && (sc->hn_flags & HN_FLAG_VF))
2893 hn_rxfilter_config(sc);
2897 hn_init_locked(struct hn_softc *sc)
2899 struct ifnet *ifp = sc->hn_ifp;
2904 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2907 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2910 /* Configure RX filter */
2911 hn_rxfilter_config(sc);
2913 /* Clear OACTIVE bit. */
2914 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2915 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2916 sc->hn_tx_ring[i].hn_oactive = 0;
2918 /* Clear TX 'suspended' bit. */
2919 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2921 /* Everything is ready; unleash! */
2922 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2924 /* Re-enable polling if requested. */
2925 if (sc->hn_pollhz > 0)
2926 hn_polling(sc, sc->hn_pollhz);
2932 struct hn_softc *sc = xsc;
2939 #if __FreeBSD_version >= 1100099
2942 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2944 struct hn_softc *sc = arg1;
2945 unsigned int lenlim;
2948 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2949 error = sysctl_handle_int(oidp, &lenlim, 0, req);
2950 if (error || req->newptr == NULL)
2954 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2955 lenlim > TCP_LRO_LENGTH_MAX) {
2959 hn_set_lro_lenlim(sc, lenlim);
2966 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2968 struct hn_softc *sc = arg1;
2969 int ackcnt, error, i;
2972 * lro_ackcnt_lim is the append count limit;
2973 * +1 turns it into the aggregation limit.
2975 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2976 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2977 if (error || req->newptr == NULL)
2980 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2984 * Convert aggregation limit back to append
2989 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2990 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
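/*
 * Worked example (added for clarity): with lro_ackcnt_lim == 1, one
 * ACK may be appended to a held ACK, i.e. up to 2 ACKs are aggregated;
 * hence the +1 above when exporting the value, and the matching -1
 * (elided here) before storing the user-supplied aggregation limit.
 */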
2998 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
3000 struct hn_softc *sc = arg1;
3005 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
3008 error = sysctl_handle_int(oidp, &on, 0, req);
3009 if (error || req->newptr == NULL)
3013 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3014 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3017 rxr->hn_trust_hcsum |= hcsum;
3019 rxr->hn_trust_hcsum &= ~hcsum;
3026 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
3028 struct hn_softc *sc = arg1;
3029 int chim_size, error;
3031 chim_size = sc->hn_tx_ring[0].hn_chim_size;
3032 error = sysctl_handle_int(oidp, &chim_size, 0, req);
3033 if (error || req->newptr == NULL)
3036 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
3040 hn_set_chim_size(sc, chim_size);
3045 #if __FreeBSD_version < 1100095
3047 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
3049 struct hn_softc *sc = arg1;
3050 int ofs = arg2, i, error;
3051 struct hn_rx_ring *rxr;
3055 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
3056 rxr = &sc->hn_rx_ring[i];
3057 stat += *((int *)((uint8_t *)rxr + ofs));
3060 error = sysctl_handle_64(oidp, &stat, 0, req);
3061 if (error || req->newptr == NULL)
3064 /* Zero out this stat. */
3065 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
3066 rxr = &sc->hn_rx_ring[i];
3067 *((int *)((uint8_t *)rxr + ofs)) = 0;
3073 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
3075 struct hn_softc *sc = arg1;
3076 int ofs = arg2, i, error;
3077 struct hn_rx_ring *rxr;
3081 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3082 rxr = &sc->hn_rx_ring[i];
3083 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
3086 error = sysctl_handle_64(oidp, &stat, 0, req);
3087 if (error || req->newptr == NULL)
3090 /* Zero out this stat. */
3091 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3092 rxr = &sc->hn_rx_ring[i];
3093 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
3101 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3103 struct hn_softc *sc = arg1;
3104 int ofs = arg2, i, error;
3105 struct hn_rx_ring *rxr;
3109 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3110 rxr = &sc->hn_rx_ring[i];
3111 stat += *((u_long *)((uint8_t *)rxr + ofs));
3114 error = sysctl_handle_long(oidp, &stat, 0, req);
3115 if (error || req->newptr == NULL)
3118 /* Zero out this stat. */
3119 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3120 rxr = &sc->hn_rx_ring[i];
3121 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
3127 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3129 struct hn_softc *sc = arg1;
3130 int ofs = arg2, i, error;
3131 struct hn_tx_ring *txr;
3135 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3136 txr = &sc->hn_tx_ring[i];
3137 stat += *((u_long *)((uint8_t *)txr + ofs));
3140 error = sysctl_handle_long(oidp, &stat, 0, req);
3141 if (error || req->newptr == NULL)
3144 /* Zero out this stat. */
3145 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3146 txr = &sc->hn_tx_ring[i];
3147 *((u_long *)((uint8_t *)txr + ofs)) = 0;
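/*
 * Illustrative sketch (added; not part of the driver): the generic
 * byte-offset accessor pattern shared by the stat sysctl handlers
 * above.  One handler serves many fields because arg2 carries
 * __offsetof(struct hn_tx_ring, FIELD) and the field is addressed
 * through the ring base pointer at runtime.  sum_ring_ulong_sketch()
 * is a hypothetical helper, not a driver function.
 */
static u_long
sum_ring_ulong_sketch(struct hn_tx_ring *rings, int nrings, int ofs)
{
	u_long total = 0;
	int i;

	for (i = 0; i < nrings; ++i)
		total += *((u_long *)((uint8_t *)&rings[i] + ofs));
	return (total);
}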
3153 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
3155 struct hn_softc *sc = arg1;
3156 int ofs = arg2, i, error, conf;
3157 struct hn_tx_ring *txr;
3159 txr = &sc->hn_tx_ring[0];
3160 conf = *((int *)((uint8_t *)txr + ofs));
3162 error = sysctl_handle_int(oidp, &conf, 0, req);
3163 if (error || req->newptr == NULL)
3167 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3168 txr = &sc->hn_tx_ring[i];
3169 *((int *)((uint8_t *)txr + ofs)) = conf;
3177 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
3179 struct hn_softc *sc = arg1;
3182 size = sc->hn_agg_size;
3183 error = sysctl_handle_int(oidp, &size, 0, req);
3184 if (error || req->newptr == NULL)
3188 sc->hn_agg_size = size;
3196 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
3198 struct hn_softc *sc = arg1;
3201 pkts = sc->hn_agg_pkts;
3202 error = sysctl_handle_int(oidp, &pkts, 0, req);
3203 if (error || req->newptr == NULL)
3207 sc->hn_agg_pkts = pkts;
3215 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
3217 struct hn_softc *sc = arg1;
3220 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
3221 return (sysctl_handle_int(oidp, &pkts, 0, req));
3225 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
3227 struct hn_softc *sc = arg1;
3230 align = sc->hn_tx_ring[0].hn_agg_align;
3231 return (sysctl_handle_int(oidp, &align, 0, req));
3235 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
3238 vmbus_chan_poll_disable(chan);
3240 vmbus_chan_poll_enable(chan, pollhz);
3244 hn_polling(struct hn_softc *sc, u_int pollhz)
3246 int nsubch = sc->hn_rx_ring_inuse - 1;
3251 struct vmbus_channel **subch;
3254 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
3255 for (i = 0; i < nsubch; ++i)
3256 hn_chan_polling(subch[i], pollhz);
3257 vmbus_subchan_rel(subch, nsubch);
3259 hn_chan_polling(sc->hn_prichan, pollhz);
3263 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
3265 struct hn_softc *sc = arg1;
3268 pollhz = sc->hn_pollhz;
3269 error = sysctl_handle_int(oidp, &pollhz, 0, req);
3270 if (error || req->newptr == NULL)
3274 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
3278 if (sc->hn_pollhz != pollhz) {
3279 sc->hn_pollhz = pollhz;
3280 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
3281 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
3282 hn_polling(sc, sc->hn_pollhz);
3290 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
3292 struct hn_softc *sc = arg1;
3295 snprintf(verstr, sizeof(verstr), "%u.%u",
3296 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
3297 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
3298 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
3302 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
3304 struct hn_softc *sc = arg1;
3311 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
3312 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
3316 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
3318 struct hn_softc *sc = arg1;
3319 char assist_str[128];
3323 hwassist = sc->hn_ifp->if_hwassist;
3325 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
3326 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
3330 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
3332 struct hn_softc *sc = arg1;
3333 char filter_str[128];
3337 filter = sc->hn_rx_filter;
3339 snprintf(filter_str, sizeof(filter_str), "%b", filter,
3341 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
3345 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
3347 struct hn_softc *sc = arg1;
3352 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3353 if (error || req->newptr == NULL)
3356 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3359 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3361 if (sc->hn_rx_ring_inuse > 1) {
3362 error = hn_rss_reconfig(sc);
3364 /* Not RSS capable, at least for now; just save the RSS key. */
3373 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
3375 struct hn_softc *sc = arg1;
3380 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3381 if (error || req->newptr == NULL)
3385 * Don't allow RSS indirect table changes if this interface is not
3386 * currently RSS capable.
3388 if (sc->hn_rx_ring_inuse == 1) {
3393 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3396 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3398 hn_rss_ind_fixup(sc);
3399 error = hn_rss_reconfig(sc);
3406 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
3408 struct hn_softc *sc = arg1;
3413 hash = sc->hn_rss_hash;
3415 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
3416 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
3420 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
3422 struct hn_softc *sc = arg1;
3423 char vf_name[IFNAMSIZ + 1];
3430 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf));
3432 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
3436 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
3438 struct hn_softc *sc = arg1;
3439 char vf_name[IFNAMSIZ + 1];
3444 vf = sc->hn_rx_ring[0].hn_rxvf_ifp;
3446 snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf));
3448 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
3452 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
3454 struct rm_priotracker pt;
3459 error = sysctl_wire_old_buffer(req, 0);
3463 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
3467 rm_rlock(&hn_vfmap_lock, &pt);
3470 for (i = 0; i < hn_vfmap_size; ++i) {
3473 if (hn_vfmap[i] == NULL)
3476 ifp = ifnet_byindex(i);
3479 sbuf_printf(sb, "%s", ifp->if_xname);
3481 sbuf_printf(sb, " %s", ifp->if_xname);
3486 rm_runlock(&hn_vfmap_lock, &pt);
3488 error = sbuf_finish(sb);
3494 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
3496 struct rm_priotracker pt;
3501 error = sysctl_wire_old_buffer(req, 0);
3505 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
3509 rm_rlock(&hn_vfmap_lock, &pt);
3512 for (i = 0; i < hn_vfmap_size; ++i) {
3513 struct ifnet *ifp, *hn_ifp;
3515 hn_ifp = hn_vfmap[i];
3519 ifp = ifnet_byindex(i);
3522 sbuf_printf(sb, "%s:%s", ifp->if_xname,
3525 sbuf_printf(sb, " %s:%s", ifp->if_xname,
3532 rm_runlock(&hn_vfmap_lock, &pt);
3534 error = sbuf_finish(sb);
3540 hn_check_iplen(const struct mbuf *m, int hoff)
3542 const struct ip *ip;
3543 int len, iphlen, iplen;
3544 const struct tcphdr *th;
3545 int thoff; /* TCP data offset */
3547 len = hoff + sizeof(struct ip);
3549 /* The packet must be at least the size of an IP header. */
3550 if (m->m_pkthdr.len < len)
3551 return IPPROTO_DONE;
3553 /* The fixed IP header must reside completely in the first mbuf. */
3555 return IPPROTO_DONE;
3557 ip = mtodo(m, hoff);
3559 /* Bound check the packet's stated IP header length. */
3560 iphlen = ip->ip_hl << 2;
3561 if (iphlen < sizeof(struct ip)) /* minimum header length */
3562 return IPPROTO_DONE;
3564 /* The full IP header must reside completely in the first mbuf. */
3565 if (m->m_len < hoff + iphlen)
3566 return IPPROTO_DONE;
3568 iplen = ntohs(ip->ip_len);
3571 * Check that the amount of data in the buffers is at
3572 * least as much as the IP header would have us expect.
3574 if (m->m_pkthdr.len < hoff + iplen)
3575 return IPPROTO_DONE;
3578 * Ignore IP fragments.
3580 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3581 return IPPROTO_DONE;
3584 * The TCP/IP or UDP/IP header must be entirely contained within
3585 * the first fragment of a packet.
3589 if (iplen < iphlen + sizeof(struct tcphdr))
3590 return IPPROTO_DONE;
3591 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3592 return IPPROTO_DONE;
3593 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3594 thoff = th->th_off << 2;
3595 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3596 return IPPROTO_DONE;
3597 if (m->m_len < hoff + iphlen + thoff)
3598 return IPPROTO_DONE;
3601 if (iplen < iphlen + sizeof(struct udphdr))
3602 return IPPROTO_DONE;
3603 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3604 return IPPROTO_DONE;
3608 return IPPROTO_DONE;
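/*
 * Illustrative sketch (added; not part of the driver): the length
 * consistency checks of hn_check_iplen() restated over plain counters
 * for the TCP case.  first_len models m->m_len and total_len models
 * m->m_pkthdr.len; a zero result means the host checksum cannot be
 * trusted and LRO must be skipped.  iplen_ok_sketch() is hypothetical.
 */
static int
iplen_ok_sketch(int first_len, int total_len, int hoff, int iphlen,
    int iplen, int thoff)
{
	if (iphlen < 20 || thoff < 20)		/* minimal IP/TCP headers */
		return (0);
	if (iplen < iphlen + thoff)		/* stated lengths consistent */
		return (0);
	if (first_len < hoff + iphlen + thoff)	/* headers contiguous */
		return (0);
	if (total_len < hoff + iplen)		/* enough data present */
		return (0);
	return (1);
}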
3615 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3617 struct sysctl_oid_list *child;
3618 struct sysctl_ctx_list *ctx;
3619 device_t dev = sc->hn_dev;
3620 #if defined(INET) || defined(INET6)
3621 #if __FreeBSD_version >= 1100095
3628 * Create RXBUF for reception.
3631 * - It is shared by all channels.
3632 * - A large enough buffer is allocated; certain versions of NVS
3633 * may further limit the usable space.
3635 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3636 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3637 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3638 if (sc->hn_rxbuf == NULL) {
3639 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3643 sc->hn_rx_ring_cnt = ring_cnt;
3644 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3646 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3647 M_DEVBUF, M_WAITOK | M_ZERO);
3649 #if defined(INET) || defined(INET6)
3650 #if __FreeBSD_version >= 1100095
3651 lroent_cnt = hn_lro_entry_count;
3652 if (lroent_cnt < TCP_LRO_ENTRIES)
3653 lroent_cnt = TCP_LRO_ENTRIES;
3655 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3657 #endif /* INET || INET6 */
3659 ctx = device_get_sysctl_ctx(dev);
3660 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3662 /* Create dev.hn.UNIT.rx sysctl tree */
3663 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3664 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3666 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3667 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3669 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3670 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3671 &rxr->hn_br_dma, BUS_DMA_WAITOK);
3672 if (rxr->hn_br == NULL) {
3673 device_printf(dev, "allocate bufring failed\n");
3677 if (hn_trust_hosttcp)
3678 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3679 if (hn_trust_hostudp)
3680 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3681 if (hn_trust_hostip)
3682 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3683 rxr->hn_ifp = sc->hn_ifp;
3684 if (i < sc->hn_tx_ring_cnt)
3685 rxr->hn_txr = &sc->hn_tx_ring[i];
3686 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3687 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3689 rxr->hn_rxbuf = sc->hn_rxbuf;
3694 #if defined(INET) || defined(INET6)
3695 #if __FreeBSD_version >= 1100095
3696 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3697 hn_lro_mbufq_depth);
3699 tcp_lro_init(&rxr->hn_lro);
3700 rxr->hn_lro.ifp = sc->hn_ifp;
3702 #if __FreeBSD_version >= 1100099
3703 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3704 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3706 #endif /* INET || INET6 */
3708 if (sc->hn_rx_sysctl_tree != NULL) {
3712 * Create per RX ring sysctl tree:
3713 * dev.hn.UNIT.rx.RINGID
3715 snprintf(name, sizeof(name), "%d", i);
3716 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3717 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3718 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3720 if (rxr->hn_rx_sysctl_tree != NULL) {
3721 SYSCTL_ADD_ULONG(ctx,
3722 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3723 OID_AUTO, "packets", CTLFLAG_RW,
3724 &rxr->hn_pkts, "# of packets received");
3725 SYSCTL_ADD_ULONG(ctx,
3726 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3727 OID_AUTO, "rss_pkts", CTLFLAG_RW,
3729 "# of packets w/ RSS info received");
3731 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3732 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3733 &rxr->hn_pktbuf_len, 0,
3734 "Temporary channel packet buffer length");
3739 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3740 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3741 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3742 #if __FreeBSD_version < 1100095
3743 hn_rx_stat_int_sysctl,
3745 hn_rx_stat_u64_sysctl,
3747 "LU", "LRO queued");
3748 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3749 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3750 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3751 #if __FreeBSD_version < 1100095
3752 hn_rx_stat_int_sysctl,
3754 hn_rx_stat_u64_sysctl,
3756 "LU", "LRO flushed");
3757 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3758 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3759 __offsetof(struct hn_rx_ring, hn_lro_tried),
3760 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3761 #if __FreeBSD_version >= 1100099
3762 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3763 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3764 hn_lro_lenlim_sysctl, "IU",
3765 "Max # of data bytes to be aggregated by LRO");
3766 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3767 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3768 hn_lro_ackcnt_sysctl, "I",
3769 "Max # of ACKs to be aggregated by LRO");
3771 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3772 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3773 hn_trust_hcsum_sysctl, "I",
3774 "Trust tcp segement verification on host side, "
3775 "when csum info is missing");
3776 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3777 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3778 hn_trust_hcsum_sysctl, "I",
3779 "Trust udp datagram verification on host side, "
3780 "when csum info is missing");
3781 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3782 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3783 hn_trust_hcsum_sysctl, "I",
3784 "Trust ip packet verification on host side, "
3785 "when csum info is missing");
3786 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3787 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3788 __offsetof(struct hn_rx_ring, hn_csum_ip),
3789 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3790 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3791 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3792 __offsetof(struct hn_rx_ring, hn_csum_tcp),
3793 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3794 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3795 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3796 __offsetof(struct hn_rx_ring, hn_csum_udp),
3797 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3798 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3799 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3800 __offsetof(struct hn_rx_ring, hn_csum_trusted),
3801 hn_rx_stat_ulong_sysctl, "LU",
3802 "# of packets that we trust host's csum verification");
3803 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3804 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3805 __offsetof(struct hn_rx_ring, hn_small_pkts),
3806 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3807 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3808 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3809 __offsetof(struct hn_rx_ring, hn_ack_failed),
3810 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3811 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3812 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3813 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3814 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3820 hn_destroy_rx_data(struct hn_softc *sc)
3824 if (sc->hn_rxbuf != NULL) {
3825 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3826 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3828 device_printf(sc->hn_dev, "RXBUF is referenced\n");
3829 sc->hn_rxbuf = NULL;
3832 if (sc->hn_rx_ring_cnt == 0)
3835 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3836 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3838 if (rxr->hn_br == NULL)
3840 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3841 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3843 device_printf(sc->hn_dev,
3844 "%dth channel bufring is referenced", i);
3848 #if defined(INET) || defined(INET6)
3849 tcp_lro_free(&rxr->hn_lro);
3851 free(rxr->hn_pktbuf, M_DEVBUF);
3853 free(sc->hn_rx_ring, M_DEVBUF);
3854 sc->hn_rx_ring = NULL;
3856 sc->hn_rx_ring_cnt = 0;
3857 sc->hn_rx_ring_inuse = 0;
3861 hn_tx_ring_create(struct hn_softc *sc, int id)
3863 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3864 device_t dev = sc->hn_dev;
3865 bus_dma_tag_t parent_dtag;
3869 txr->hn_tx_idx = id;
3871 #ifndef HN_USE_TXDESC_BUFRING
3872 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3874 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3876 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3877 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3878 M_DEVBUF, M_WAITOK | M_ZERO);
3879 #ifndef HN_USE_TXDESC_BUFRING
3880 SLIST_INIT(&txr->hn_txlist);
3882 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3883 M_WAITOK, &txr->hn_tx_lock);
3886 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3887 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3888 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3890 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3893 #ifdef HN_IFSTART_SUPPORT
3894 if (hn_use_if_start) {
3895 txr->hn_txeof = hn_start_txeof;
3896 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3897 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3903 txr->hn_txeof = hn_xmit_txeof;
3904 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3905 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3907 br_depth = hn_get_txswq_depth(txr);
3908 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3909 M_WAITOK, &txr->hn_tx_lock);
3912 txr->hn_direct_tx_size = hn_direct_tx_size;
3915 * Always schedule transmission instead of trying to do direct
3916 * transmission. This gives the best performance so far.
3918 txr->hn_sched_tx = 1;
3920 parent_dtag = bus_get_dma_tag(dev);
3922 /* DMA tag for RNDIS packet messages. */
3923 error = bus_dma_tag_create(parent_dtag, /* parent */
3924 HN_RNDIS_PKT_ALIGN, /* alignment */
3925 HN_RNDIS_PKT_BOUNDARY, /* boundary */
3926 BUS_SPACE_MAXADDR, /* lowaddr */
3927 BUS_SPACE_MAXADDR, /* highaddr */
3928 NULL, NULL, /* filter, filterarg */
3929 HN_RNDIS_PKT_LEN, /* maxsize */
3931 HN_RNDIS_PKT_LEN, /* maxsegsize */
3933 NULL, /* lockfunc */
3934 NULL, /* lockfuncarg */
3935 &txr->hn_tx_rndis_dtag);
3937 device_printf(dev, "failed to create rndis dmatag\n");
3941 /* DMA tag for data. */
3942 error = bus_dma_tag_create(parent_dtag, /* parent */
3944 HN_TX_DATA_BOUNDARY, /* boundary */
3945 BUS_SPACE_MAXADDR, /* lowaddr */
3946 BUS_SPACE_MAXADDR, /* highaddr */
3947 NULL, NULL, /* filter, filterarg */
3948 HN_TX_DATA_MAXSIZE, /* maxsize */
3949 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
3950 HN_TX_DATA_SEGSIZE, /* maxsegsize */
3952 NULL, /* lockfunc */
3953 NULL, /* lockfuncarg */
3954 &txr->hn_tx_data_dtag);
3956 device_printf(dev, "failed to create data dmatag\n");
3960 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3961 struct hn_txdesc *txd = &txr->hn_txdesc[i];
3964 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3965 STAILQ_INIT(&txd->agg_list);
3968 * Allocate and load RNDIS packet message.
3970 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3971 (void **)&txd->rndis_pkt,
3972 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3973 &txd->rndis_pkt_dmap);
3976 "failed to allocate rndis_packet_msg, %d\n", i);
3980 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3981 txd->rndis_pkt_dmap,
3982 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3983 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3987 "failed to load rndis_packet_msg, %d\n", i);
3988 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3989 txd->rndis_pkt, txd->rndis_pkt_dmap);
3993 /* DMA map for TX data. */
3994 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3998 "failed to allocate tx data dmamap\n");
3999 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
4000 txd->rndis_pkt_dmap);
4001 bus_dmamem_free(txr->hn_tx_rndis_dtag,
4002 txd->rndis_pkt, txd->rndis_pkt_dmap);
4006 /* All set; put it on the list. */
4007 txd->flags |= HN_TXD_FLAG_ONLIST;
4008 #ifndef HN_USE_TXDESC_BUFRING
4009 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
4011 buf_ring_enqueue(txr->hn_txdesc_br, txd);
4014 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
4016 if (sc->hn_tx_sysctl_tree != NULL) {
4017 struct sysctl_oid_list *child;
4018 struct sysctl_ctx_list *ctx;
4022 * Create per TX ring sysctl tree:
4023 * dev.hn.UNIT.tx.RINGID
4025 ctx = device_get_sysctl_ctx(dev);
4026 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
4028 snprintf(name, sizeof(name), "%d", id);
4029 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
4030 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4032 if (txr->hn_tx_sysctl_tree != NULL) {
4033 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
4036 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
4037 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
4038 "# of available TX descs");
4040 #ifdef HN_IFSTART_SUPPORT
4041 if (!hn_use_if_start)
4044 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
4045 CTLFLAG_RD, &txr->hn_oactive, 0,
4048 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
4049 CTLFLAG_RW, &txr->hn_pkts,
4050 "# of packets transmitted");
4051 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
4052 CTLFLAG_RW, &txr->hn_sends, "# of sends");
4060 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
4062 struct hn_tx_ring *txr = txd->txr;
4064 KASSERT(txd->m == NULL, ("still has mbuf installed"));
4065 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
4067 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
4068 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
4069 txd->rndis_pkt_dmap);
4070 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
4074 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
4077 KASSERT(txd->refs == 0 || txd->refs == 1,
4078 ("invalid txd refs %d", txd->refs));
4080 /* Aggregated txds will be freed by their aggregating txd. */
4081 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
4084 freed = hn_txdesc_put(txr, txd);
4085 KASSERT(freed, ("can't free txdesc"));
4090 hn_tx_ring_destroy(struct hn_tx_ring *txr)
4094 if (txr->hn_txdesc == NULL)
4099 * Because the freeing of aggregated txds will be deferred
4100 * to the aggregating txd, two passes are used here:
4101 * - The first pass GCes any pending txds. This GC is necessary,
4102 * since if the channels are revoked, the hypervisor will not
4103 * deliver send-done for all pending txds.
4104 * - The second pass frees the busdma resources, i.e. after all txds
4107 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
4108 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
4109 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
4110 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
4112 if (txr->hn_tx_data_dtag != NULL)
4113 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
4114 if (txr->hn_tx_rndis_dtag != NULL)
4115 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
4117 #ifdef HN_USE_TXDESC_BUFRING
4118 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
4121 free(txr->hn_txdesc, M_DEVBUF);
4122 txr->hn_txdesc = NULL;
4124 if (txr->hn_mbuf_br != NULL)
4125 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
4127 #ifndef HN_USE_TXDESC_BUFRING
4128 mtx_destroy(&txr->hn_txlist_spin);
4130 mtx_destroy(&txr->hn_tx_lock);
4134 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
4136 struct sysctl_oid_list *child;
4137 struct sysctl_ctx_list *ctx;
4141 * Create TXBUF for chimney sending.
4143 * NOTE: It is shared by all channels.
4145 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
4146 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
4147 BUS_DMA_WAITOK | BUS_DMA_ZERO);
4148 if (sc->hn_chim == NULL) {
4149 device_printf(sc->hn_dev, "allocate txbuf failed\n");
4153 sc->hn_tx_ring_cnt = ring_cnt;
4154 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4156 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
4157 M_DEVBUF, M_WAITOK | M_ZERO);
4159 ctx = device_get_sysctl_ctx(sc->hn_dev);
4160 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
4162 /* Create dev.hn.UNIT.tx sysctl tree */
4163 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
4164 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4166 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4169 error = hn_tx_ring_create(sc, i);
4174 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
4175 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4176 __offsetof(struct hn_tx_ring, hn_no_txdescs),
4177 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
4178 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
4179 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4180 __offsetof(struct hn_tx_ring, hn_send_failed),
4181 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure");
4182 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
4183 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4184 __offsetof(struct hn_tx_ring, hn_txdma_failed),
4185 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure");
4186 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
4187 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4188 __offsetof(struct hn_tx_ring, hn_flush_failed),
4189 hn_tx_stat_ulong_sysctl, "LU",
4190 "# of packet transmission aggregation flush failure");
4191 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
4192 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4193 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
4194 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
4195 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
4196 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4197 __offsetof(struct hn_tx_ring, hn_tx_chimney),
4198 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
4199 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
4200 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4201 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
4202 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
4203 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
4204 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
4205 "# of total TX descs");
4206 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
4207 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
4208 "Chimney send packet size upper boundary");
4209 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
4210 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4211 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
4212 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
4213 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4214 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
4215 hn_tx_conf_int_sysctl, "I",
4216 "Size of the packet for direct transmission");
4217 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
4218 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4219 __offsetof(struct hn_tx_ring, hn_sched_tx),
4220 hn_tx_conf_int_sysctl, "I",
4221 "Always schedule transmission "
4222 "instead of doing direct transmission");
4223 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
4224 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
4225 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
4226 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
4227 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
4228 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
4229 "Applied packet transmission aggregation size");
4230 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
4231 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
4232 hn_txagg_pktmax_sysctl, "I",
4233 "Applied packet transmission aggregation packets");
4234 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
4235 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
4236 hn_txagg_align_sysctl, "I",
4237 "Applied packet transmission aggregation alignment");
4243 hn_set_chim_size(struct hn_softc *sc, int chim_size)
4247 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4248 sc->hn_tx_ring[i].hn_chim_size = chim_size;
4252 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
4254 struct ifnet *ifp = sc->hn_ifp;
4257 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
4260 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
4261 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
4262 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
4264 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
4265 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
4266 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
4268 if (tso_maxlen < tso_minlen)
4269 tso_maxlen = tso_minlen;
4270 else if (tso_maxlen > IP_MAXPACKET)
4271 tso_maxlen = IP_MAXPACKET;
4272 if (tso_maxlen > sc->hn_ndis_tso_szmax)
4273 tso_maxlen = sc->hn_ndis_tso_szmax;
4274 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
4276 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
4280 hn_fixup_tx_data(struct hn_softc *sc)
4282 uint64_t csum_assist;
4285 hn_set_chim_size(sc, sc->hn_chim_szmax);
4286 if (hn_tx_chimney_size > 0 &&
4287 hn_tx_chimney_size < sc->hn_chim_szmax)
4288 hn_set_chim_size(sc, hn_tx_chimney_size);
4291 if (sc->hn_caps & HN_CAP_IPCS)
4292 csum_assist |= CSUM_IP;
4293 if (sc->hn_caps & HN_CAP_TCP4CS)
4294 csum_assist |= CSUM_IP_TCP;
4295 if (sc->hn_caps & HN_CAP_UDP4CS)
4296 csum_assist |= CSUM_IP_UDP;
4297 if (sc->hn_caps & HN_CAP_TCP6CS)
4298 csum_assist |= CSUM_IP6_TCP;
4299 if (sc->hn_caps & HN_CAP_UDP6CS)
4300 csum_assist |= CSUM_IP6_UDP;
4301 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4302 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
4304 if (sc->hn_caps & HN_CAP_HASHVAL) {
4306 * Support HASHVAL pktinfo on TX path.
4309 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
4310 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4311 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
4316 hn_destroy_tx_data(struct hn_softc *sc)
4320 if (sc->hn_chim != NULL) {
4321 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
4322 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
4324 device_printf(sc->hn_dev,
4325 "chimney sending buffer is referenced");
4330 if (sc->hn_tx_ring_cnt == 0)
4333 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4334 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
4336 free(sc->hn_tx_ring, M_DEVBUF);
4337 sc->hn_tx_ring = NULL;
4339 sc->hn_tx_ring_cnt = 0;
4340 sc->hn_tx_ring_inuse = 0;
4343 #ifdef HN_IFSTART_SUPPORT
4346 hn_start_taskfunc(void *xtxr, int pending __unused)
4348 struct hn_tx_ring *txr = xtxr;
4350 mtx_lock(&txr->hn_tx_lock);
4351 hn_start_locked(txr, 0);
4352 mtx_unlock(&txr->hn_tx_lock);
4356 hn_start_locked(struct hn_tx_ring *txr, int len)
4358 struct hn_softc *sc = txr->hn_sc;
4359 struct ifnet *ifp = sc->hn_ifp;
4362 KASSERT(hn_use_if_start,
4363 ("hn_start_locked is called, when if_start is disabled"));
4364 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4365 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4366 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4368 if (__predict_false(txr->hn_suspended))
4371 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
4375 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
4376 struct hn_txdesc *txd;
4377 struct mbuf *m_head;
4380 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
4384 if (len > 0 && m_head->m_pkthdr.len > len) {
4386 * This sending could be time consuming; let callers
4387 * dispatch this packet sending (and the sending of any
4388 * follow-up packets) to the tx taskqueue.
4390 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4395 #if defined(INET6) || defined(INET)
4396 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
4397 m_head = hn_tso_fixup(m_head);
4398 if (__predict_false(m_head == NULL)) {
4399 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4405 txd = hn_txdesc_get(txr);
4407 txr->hn_no_txdescs++;
4408 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4409 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4413 error = hn_encap(ifp, txr, txd, &m_head);
4415 /* Both txd and m_head are freed */
4416 KASSERT(txr->hn_agg_txd == NULL,
4417 ("encap failed w/ pending aggregating txdesc"));
4421 if (txr->hn_agg_pktleft == 0) {
4422 if (txr->hn_agg_txd != NULL) {
4423 KASSERT(m_head == NULL,
4424 ("pending mbuf for aggregating txdesc"));
4425 error = hn_flush_txagg(ifp, txr);
4426 if (__predict_false(error)) {
4427 atomic_set_int(&ifp->if_drv_flags,
4432 KASSERT(m_head != NULL, ("mbuf was freed"));
4433 error = hn_txpkt(ifp, txr, txd);
4434 if (__predict_false(error)) {
4435 /* txd is freed, but m_head is not */
4436 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4437 atomic_set_int(&ifp->if_drv_flags,
4445 KASSERT(txr->hn_agg_txd != NULL,
4446 ("no aggregating txdesc"));
4447 KASSERT(m_head == NULL,
4448 ("pending mbuf for aggregating txdesc"));
4453 /* Flush pending aggregated transmission. */
4454 if (txr->hn_agg_txd != NULL)
4455 hn_flush_txagg(ifp, txr);
4460 hn_start(struct ifnet *ifp)
4462 struct hn_softc *sc = ifp->if_softc;
4463 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
4465 if (txr->hn_sched_tx)
4468 if (mtx_trylock(&txr->hn_tx_lock)) {
4471 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4472 mtx_unlock(&txr->hn_tx_lock);
4477 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4481 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
4483 struct hn_tx_ring *txr = xtxr;
4485 mtx_lock(&txr->hn_tx_lock);
4486 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
4487 hn_start_locked(txr, 0);
4488 mtx_unlock(&txr->hn_tx_lock);
4492 hn_start_txeof(struct hn_tx_ring *txr)
4494 struct hn_softc *sc = txr->hn_sc;
4495 struct ifnet *ifp = sc->hn_ifp;
4497 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4499 if (txr->hn_sched_tx)
4502 if (mtx_trylock(&txr->hn_tx_lock)) {
4505 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4506 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4507 mtx_unlock(&txr->hn_tx_lock);
4509 taskqueue_enqueue(txr->hn_tx_taskq,
4515 * Release OACTIVE earlier, in the hope that
4516 * others can catch up. The task will clear the
4517 * flag again with the hn_tx_lock to avoid possible
4520 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4521 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4525 #endif /* HN_IFSTART_SUPPORT */
4528 hn_xmit(struct hn_tx_ring *txr, int len)
4530 struct hn_softc *sc = txr->hn_sc;
4531 struct ifnet *ifp = sc->hn_ifp;
4532 struct mbuf *m_head;
4535 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4536 #ifdef HN_IFSTART_SUPPORT
4537 KASSERT(hn_use_if_start == 0,
4538 ("hn_xmit is called, when if_start is enabled"));
4540 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4542 if (__predict_false(txr->hn_suspended))
4545 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4548 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4549 struct hn_txdesc *txd;
4552 if (len > 0 && m_head->m_pkthdr.len > len) {
4554 * This sending could be time consuming; let callers
4555 * dispatch this packet sending (and the sending of any
4556 * follow-up packets) to the tx taskqueue.
4558 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4563 txd = hn_txdesc_get(txr);
4565 txr->hn_no_txdescs++;
4566 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4567 txr->hn_oactive = 1;
4571 error = hn_encap(ifp, txr, txd, &m_head);
4573 /* Both txd and m_head are freed; discard */
4574 KASSERT(txr->hn_agg_txd == NULL,
4575 ("encap failed w/ pending aggregating txdesc"));
4576 drbr_advance(ifp, txr->hn_mbuf_br);
4580 if (txr->hn_agg_pktleft == 0) {
4581 if (txr->hn_agg_txd != NULL) {
4582 KASSERT(m_head == NULL,
4583 ("pending mbuf for aggregating txdesc"));
4584 error = hn_flush_txagg(ifp, txr);
4585 if (__predict_false(error)) {
4586 txr->hn_oactive = 1;
4590 KASSERT(m_head != NULL, ("mbuf was freed"));
4591 error = hn_txpkt(ifp, txr, txd);
4592 if (__predict_false(error)) {
4593 /* txd is freed, but m_head is not */
4594 drbr_putback(ifp, txr->hn_mbuf_br,
4596 txr->hn_oactive = 1;
4603 KASSERT(txr->hn_agg_txd != NULL,
4604 ("no aggregating txdesc"));
4605 KASSERT(m_head == NULL,
4606 ("pending mbuf for aggregating txdesc"));
4611 drbr_advance(ifp, txr->hn_mbuf_br);
4614 /* Flush pending aggregated transmission. */
4615 if (txr->hn_agg_txd != NULL)
4616 hn_flush_txagg(ifp, txr);
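/*
 * Note (added for clarity): the loop above relies on the buf_ring
 * peek/advance contract: drbr_peek() returns the head without
 * consuming it, drbr_putback() restores a (possibly modified) mbuf
 * after a transient failure, and drbr_advance() consumes the head
 * only once the packet has been handed off, so no packet is lost
 * across errors.
 */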
4621 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4623 struct hn_softc *sc = ifp->if_softc;
4624 struct hn_tx_ring *txr;
4627 #if defined(INET6) || defined(INET)
4629 * Perform TSO packet header fixup now, since the TSO
4630 * packet header should be cache-hot.
4632 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4633 m = hn_tso_fixup(m);
4634 if (__predict_false(m == NULL)) {
4635 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4642 * Select the TX ring based on flowid
4644 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
4645 #if defined(INET6) || defined(INET)
4648 if (m->m_pkthdr.len < 128 &&
4649 (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
4650 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
4651 m = hn_check_tcpsyn(m, &tcpsyn);
4652 if (__predict_false(m == NULL)) {
4654 IFCOUNTER_OERRORS, 1);
4659 const int tcpsyn = 0;
4664 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4666 txr = &sc->hn_tx_ring[idx];
4668 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4670 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4674 if (txr->hn_oactive)
4677 if (txr->hn_sched_tx)
4680 if (mtx_trylock(&txr->hn_tx_lock)) {
4683 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4684 mtx_unlock(&txr->hn_tx_lock);
4689 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4694 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4698 mtx_lock(&txr->hn_tx_lock);
4699 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4701 mtx_unlock(&txr->hn_tx_lock);
4705 hn_xmit_qflush(struct ifnet *ifp)
4707 struct hn_softc *sc = ifp->if_softc;
4710 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4711 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4716 hn_xmit_txeof(struct hn_tx_ring *txr)
4719 if (txr->hn_sched_tx)
4722 if (mtx_trylock(&txr->hn_tx_lock)) {
4725 txr->hn_oactive = 0;
4726 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4727 mtx_unlock(&txr->hn_tx_lock);
4729 taskqueue_enqueue(txr->hn_tx_taskq,
4735 * Release oactive earlier, in the hope that others can
4736 * catch up.  The task will clear oactive again, with the
4737 * hn_tx_lock held, to avoid possible races.
4740 txr->hn_oactive = 0;
4741 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4746 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4748 struct hn_tx_ring *txr = xtxr;
4750 mtx_lock(&txr->hn_tx_lock);
4752 mtx_unlock(&txr->hn_tx_lock);
4756 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4758 struct hn_tx_ring *txr = xtxr;
4760 mtx_lock(&txr->hn_tx_lock);
4761 txr->hn_oactive = 0;
4763 mtx_unlock(&txr->hn_tx_lock);
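/*
 * Link a channel to its RX ring (and TX ring, if any), bind it to
 * the proper CPU, and open it on top of the ring's bufring.
 */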
4767 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4769 struct vmbus_chan_br cbr;
4770 struct hn_rx_ring *rxr;
4771 struct hn_tx_ring *txr = NULL;
4774 idx = vmbus_chan_subidx(chan);
4777 * Link this channel to RX/TX ring.
4779 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4780 ("invalid channel index %d, should > 0 && < %d",
4781 idx, sc->hn_rx_ring_inuse));
4782 rxr = &sc->hn_rx_ring[idx];
4783 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4784 ("RX ring %d already attached", idx));
4785 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4786 rxr->hn_chan = chan;
4789 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4790 idx, vmbus_chan_id(chan));
4793 if (idx < sc->hn_tx_ring_inuse) {
4794 txr = &sc->hn_tx_ring[idx];
4795 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4796 ("TX ring %d already attached", idx));
4797 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4799 txr->hn_chan = chan;
4801 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4802 idx, vmbus_chan_id(chan));
4806 /* Bind this channel to a proper CPU. */
4807 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4812 cbr.cbr = rxr->hn_br;
4813 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4814 cbr.cbr_txsz = HN_TXBR_SIZE;
4815 cbr.cbr_rxsz = HN_RXBR_SIZE;
4816 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4818 if (error == EISCONN) {
4819 if_printf(sc->hn_ifp, "bufring is connected after "
4820 "chan%u open failure\n", vmbus_chan_id(chan));
4821 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4823 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4824 vmbus_chan_id(chan), error);
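/*
 * Unlink a channel from its RX/TX ring and close it.  If the bufring
 * is still connected after the close, mark the RX ring with
 * HN_RX_FLAG_BR_REF, so that a later synthetic attach is refused.
 */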
4831 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4833 struct hn_rx_ring *rxr;
4836 idx = vmbus_chan_subidx(chan);
4839 * Unlink this channel from the RX/TX ring.
4841 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4842 ("invalid channel index %d, should > 0 && < %d",
4843 idx, sc->hn_rx_ring_inuse));
4844 rxr = &sc->hn_rx_ring[idx];
4845 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4846 ("RX ring %d is not attached", idx));
4847 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4849 if (idx < sc->hn_tx_ring_inuse) {
4850 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4852 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4853 ("TX ring %d is not attached attached", idx));
4854 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4858 * Close this channel.
4861 * Channel closing does _not_ destroy the target channel.
4863 error = vmbus_chan_close_direct(chan);
4864 if (error == EISCONN) {
4865 if_printf(sc->hn_ifp, "chan%u bufring is connected "
4866 "after being closed\n", vmbus_chan_id(chan));
4867 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4869 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4870 vmbus_chan_id(chan), error);
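/*
 * Attach all sub-channels offered by the primary channel.  The first
 * attach error is recorded, but the loop keeps going, since all
 * channels will be detached on failure anyway.
 */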
4875 hn_attach_subchans(struct hn_softc *sc)
4877 struct vmbus_channel **subchans;
4878 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4881 KASSERT(subchan_cnt > 0, ("no sub-channels"));
4883 /* Attach the sub-channels. */
4884 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4885 for (i = 0; i < subchan_cnt; ++i) {
4888 error1 = hn_chan_attach(sc, subchans[i]);
4891 /* Move on; all channels will be detached later. */
4894 vmbus_subchan_rel(subchans, subchan_cnt);
4897 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4900 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4908 hn_detach_allchans(struct hn_softc *sc)
4910 struct vmbus_channel **subchans;
4911 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4914 if (subchan_cnt == 0)
4917 /* Detach the sub-channels. */
4918 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4919 for (i = 0; i < subchan_cnt; ++i)
4920 hn_chan_detach(sc, subchans[i]);
4921 vmbus_subchan_rel(subchans, subchan_cnt);
4925 * Detach the primary channel, _after_ all sub-channels are detached.
4928 hn_chan_detach(sc, sc->hn_prichan);
4930 /* Wait for sub-channels to be destroyed, if any. */
4931 vmbus_subchan_drain(sc->hn_prichan);
4934 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4935 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4936 HN_RX_FLAG_ATTACHED) == 0,
4937 ("%dth RX ring is still attached", i));
4939 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4940 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4941 HN_TX_FLAG_ATTACHED) == 0,
4942 ("%dth TX ring is still attached", i));
4948 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4950 struct vmbus_channel **subchans;
4951 int nchan, rxr_cnt, error;
4953 nchan = *nsubch + 1;
4956 * Multiple RX/TX rings are not requested.
4963 * Query RSS capabilities, e.g. # of RX rings, and # of indirect table entries.
4966 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4968 /* No RSS; this is benign. */
4973 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4977 if (nchan > rxr_cnt)
4980 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4986 * Allocate sub-channels from NVS.
4988 *nsubch = nchan - 1;
4989 error = hn_nvs_alloc_subchans(sc, nsubch);
4990 if (error || *nsubch == 0) {
4991 /* Failed to allocate sub-channels. */
4997 * Wait for all sub-channels to become ready before moving on.
4999 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
5000 vmbus_subchan_rel(subchans, *nsubch);
5005 hn_synth_attachable(const struct hn_softc *sc)
5009 if (sc->hn_flags & HN_FLAG_ERRORS)
5012 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5013 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5015 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
5022 * Make sure that the RX filter is zero after the successful
5023 * RNDIS initialization.
5026 * Under certain conditions on certain versions of Hyper-V,
5027 * the RNDIS rxfilter is _not_ zero on the hypervisor side
5028 * after the successful RNDIS initialization, which breaks
5029 * the assumptions of any code that follows (in fact, it breaks the
5030 * RNDIS API contract). Clear the RNDIS rxfilter
5031 * explicitly, drain packets sneaking through, and drain the
5032 * interrupt taskqueues scheduled due to the stealth packets.
5035 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
5039 hn_drain_rxtx(sc, nchan);
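/*
 * Attach the synthetic parts, i.e. NVS and RNDIS, in order: the
 * primary channel first, then NVS, RNDIS, the sub-channels, and
 * finally the RSS key and indirect table.  On failure, everything
 * that has been attached so far is torn down again.
 */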
5043 hn_synth_attach(struct hn_softc *sc, int mtu)
5045 #define ATTACHED_NVS 0x0002
5046 #define ATTACHED_RNDIS 0x0004
5048 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
5049 int error, nsubch, nchan = 1, i, rndis_inited;
5050 uint32_t old_caps, attached = 0;
5052 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
5053 ("synthetic parts were attached"));
5055 if (!hn_synth_attachable(sc))
5058 /* Save capabilities for later verification. */
5059 old_caps = sc->hn_caps;
5062 /* Clear RSS state. */
5063 sc->hn_rss_ind_size = 0;
5064 sc->hn_rss_hash = 0;
5067 * Attach the primary channel _before_ attaching NVS and RNDIS.
5069 error = hn_chan_attach(sc, sc->hn_prichan);
5076 error = hn_nvs_attach(sc, mtu);
5079 attached |= ATTACHED_NVS;
5082 * Attach RNDIS _after_ NVS is attached.
5084 error = hn_rndis_attach(sc, mtu, &rndis_inited);
5086 attached |= ATTACHED_RNDIS;
5091 * Make sure capabilities are not changed.
5093 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
5094 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
5095 old_caps, sc->hn_caps);
5101 * Allocate sub-channels for multi-TX/RX rings.
5104 * The # of RX rings that can be used is equal to the # of
5105 * channels to be requested.
5107 nsubch = sc->hn_rx_ring_cnt - 1;
5108 error = hn_synth_alloc_subchans(sc, &nsubch);
5111 /* NOTE: _Full_ synthetic parts detach is required now. */
5112 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
5115 * Set the # of TX/RX rings that could be used according to
5116 * the # of channels that NVS offered.
5119 hn_set_ring_inuse(sc, nchan);
5121 /* Only the primary channel can be used; done */
5126 * Attach the sub-channels.
5128 * NOTE: hn_set_ring_inuse() _must_ have been called.
5130 error = hn_attach_subchans(sc);
5135 * Configure RSS key and indirect table _after_ all sub-channels
5138 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
5140 * RSS key is not set yet; set it to the default RSS key.
5143 if_printf(sc->hn_ifp, "setup default RSS key\n");
5144 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
5145 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
5148 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
5150 * RSS indirect table is not set yet; set it up in round-robin fashion.
5154 if_printf(sc->hn_ifp, "setup default RSS indirect "
5157 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
5158 rss->rss_ind[i] = i % nchan;
5159 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
5162 * # of usable channels may be changed, so we have to
5163 * make sure that all entries in the RSS indirect table are valid.
5166 * NOTE: hn_set_ring_inuse() _must_ have been called.
5168 hn_rss_ind_fixup(sc);
5171 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
5176 * Fixup transmission aggregation setup.
5179 hn_rndis_init_fixat(sc, nchan);
5183 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
5184 hn_rndis_init_fixat(sc, nchan);
5185 hn_synth_detach(sc);
5187 if (attached & ATTACHED_RNDIS) {
5188 hn_rndis_init_fixat(sc, nchan);
5189 hn_rndis_detach(sc);
5191 if (attached & ATTACHED_NVS)
5193 hn_chan_detach(sc, sc->hn_prichan);
5194 /* Restore old capabilities. */
5195 sc->hn_caps = old_caps;
5199 #undef ATTACHED_RNDIS
5205 * The interface must have been suspended through hn_suspend() before
5206 * this function gets called.
5209 hn_synth_detach(struct hn_softc *sc)
5212 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
5213 ("synthetic parts were not attached"));
5215 /* Detach the RNDIS first. */
5216 hn_rndis_detach(sc);
5221 /* Detach all of the channels. */
5222 hn_detach_allchans(sc);
5224 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
5228 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
5230 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
5231 ("invalid ring count %d", ring_cnt));
5233 if (sc->hn_tx_ring_cnt > ring_cnt)
5234 sc->hn_tx_ring_inuse = ring_cnt;
5236 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5237 sc->hn_rx_ring_inuse = ring_cnt;
5240 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
5241 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
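/*
 * Wait until the channel's RX bufring (and, unless the primary
 * channel has been revoked, its TX bufring) is empty, then drain
 * the channel's interrupt task.
 */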
5246 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
5251 * The TX bufring will not be drained by the hypervisor if the
5252 * primary channel is revoked.
5254 while (!vmbus_chan_rx_empty(chan) ||
5255 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
5256 !vmbus_chan_tx_empty(chan)))
5258 vmbus_chan_intr_drain(chan);
5262 hn_disable_rx(struct hn_softc *sc)
5266 * Disable RX by forcefully clearing the RX filter.
5268 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
5269 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
5272 * Give RNDIS enough time to flush all pending data packets.
5274 pause("waitrx", (200 * hz) / 1000);
5279 * RX/TX _must_ have been suspended/disabled before this function is called.
5283 hn_drain_rxtx(struct hn_softc *sc, int nchan)
5285 struct vmbus_channel **subch = NULL;
5289 * Drain RX/TX bufrings and interrupts.
5293 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
5295 if (subch != NULL) {
5298 for (i = 0; i < nsubch; ++i)
5299 hn_chan_drain(sc, subch[i]);
5301 hn_chan_drain(sc, sc->hn_prichan);
5304 vmbus_subchan_rel(subch, nsubch);
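/*
 * Stop transmission on all TX rings, wait for pending sends to
 * complete, drain the bufrings and interrupt tasks, and finally
 * drain the TX tasks that the draining itself may have scheduled.
 */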
5308 hn_suspend_data(struct hn_softc *sc)
5310 struct hn_tx_ring *txr;
5318 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5319 txr = &sc->hn_tx_ring[i];
5321 mtx_lock(&txr->hn_tx_lock);
5322 txr->hn_suspended = 1;
5323 mtx_unlock(&txr->hn_tx_lock);
5324 /* No one can send more packets now. */
5327 * Wait for all pending sends to finish.
5330 * We will _not_ receive all pending send-dones if the
5331 * primary channel is revoked.
5333 while (hn_tx_ring_pending(txr) &&
5334 !vmbus_chan_is_revoked(sc->hn_prichan))
5335 pause("hnwtx", 1 /* 1 tick */);
5346 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
5349 * Drain any pending TX tasks.
5352 * The above hn_drain_rxtx() can dispatch TX tasks, so the
5353 * TX tasks must be drained _after_ it.
5355 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5356 txr = &sc->hn_tx_ring[i];
5358 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
5359 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
5364 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
5367 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
5371 hn_suspend_mgmt(struct hn_softc *sc)
5378 * Make sure that hn_mgmt_taskq0 can no longer be accessed
5379 * through hn_mgmt_taskq.
5381 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
5382 vmbus_chan_run_task(sc->hn_prichan, &task);
5385 * Make sure that all pending management tasks are completed.
5387 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
5388 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
5389 taskqueue_drain_all(sc->hn_mgmt_taskq0);
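/*
 * Suspend the data paths, if the interface is running or a VF is
 * active, and then the management tasks.
 */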
5393 hn_suspend(struct hn_softc *sc)
5396 /* Disable polling. */
5399 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5400 (sc->hn_flags & HN_FLAG_VF))
5401 hn_suspend_data(sc);
5402 hn_suspend_mgmt(sc);
5406 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
5410 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
5411 ("invalid TX ring count %d", tx_ring_cnt));
5413 for (i = 0; i < tx_ring_cnt; ++i) {
5414 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5416 mtx_lock(&txr->hn_tx_lock);
5417 txr->hn_suspended = 0;
5418 mtx_unlock(&txr->hn_tx_lock);
5423 hn_resume_data(struct hn_softc *sc)
5432 hn_rxfilter_config(sc);
5435 * Make sure to clear suspend status on "all" TX rings,
5436 * since hn_tx_ring_inuse can be changed after
5437 * hn_suspend_data().
5439 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
5441 #ifdef HN_IFSTART_SUPPORT
5442 if (!hn_use_if_start)
5446 * Flush unused drbrs, since hn_tx_ring_inuse may have changed.
5449 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
5450 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5456 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5457 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5460 * Use the txeof task, so that any pending oactive can be cleared.
5463 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
5468 hn_resume_mgmt(struct hn_softc *sc)
5471 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
5474 * Kick off network change detection, if it was pending.
5475 * If no network change was pending, start link status
5476 * checks, which are more lightweight than network change detection.
5479 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
5480 hn_change_network(sc);
5482 hn_update_link_status(sc);
5486 hn_resume(struct hn_softc *sc)
5489 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5490 (sc->hn_flags & HN_FLAG_VF))
5494 * When the VF is activated, the synthetic interface is changed
5495 * to DOWN in hn_set_vf(). Here, if the VF is still active, we
5496 * don't call hn_resume_mgmt() until the VF is deactivated in
5499 if (!(sc->hn_flags & HN_FLAG_VF))
5503 * Re-enable polling if this interface is running and
5504 * polling was requested.
5506 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
5507 hn_polling(sc, sc->hn_pollhz);
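/*
 * Handle a RNDIS status message: media connect/disconnect updates
 * the link status, while NETWORK_CHANGE kicks off network change
 * detection.
 */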
5511 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
5513 const struct rndis_status_msg *msg;
5516 if (dlen < sizeof(*msg)) {
5517 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
5522 switch (msg->rm_status) {
5523 case RNDIS_STATUS_MEDIA_CONNECT:
5524 case RNDIS_STATUS_MEDIA_DISCONNECT:
5525 hn_update_link_status(sc);
5528 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
5529 /* Not really useful; ignore. */
5532 case RNDIS_STATUS_NETWORK_CHANGE:
5533 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
5534 if (dlen < ofs + msg->rm_stbuflen ||
5535 msg->rm_stbuflen < sizeof(uint32_t)) {
5536 if_printf(sc->hn_ifp, "network changed\n");
5540 memcpy(&change, ((const uint8_t *)msg) + ofs,
5542 if_printf(sc->hn_ifp, "network changed, change %u\n",
5545 hn_change_network(sc);
5549 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
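/*
 * Walk the RNDIS per-packet-info elements, validating the size and
 * alignment of each one, and extract the VLAN, checksum and hash
 * fields; stop early once all interesting fields have been found.
 */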
5556 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
5558 const struct rndis_pktinfo *pi = info_data;
5561 while (info_dlen != 0) {
5565 if (__predict_false(info_dlen < sizeof(*pi)))
5567 if (__predict_false(info_dlen < pi->rm_size))
5569 info_dlen -= pi->rm_size;
5571 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
5573 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
5575 dlen = pi->rm_size - pi->rm_pktinfooffset;
5578 switch (pi->rm_type) {
5579 case NDIS_PKTINFO_TYPE_VLAN:
5580 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
5582 info->vlan_info = *((const uint32_t *)data);
5583 mask |= HN_RXINFO_VLAN;
5586 case NDIS_PKTINFO_TYPE_CSUM:
5587 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
5589 info->csum_info = *((const uint32_t *)data);
5590 mask |= HN_RXINFO_CSUM;
5593 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
5594 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
5596 info->hash_value = *((const uint32_t *)data);
5597 mask |= HN_RXINFO_HASHVAL;
5600 case HN_NDIS_PKTINFO_TYPE_HASHINF:
5601 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
5603 info->hash_info = *((const uint32_t *)data);
5604 mask |= HN_RXINFO_HASHINF;
5611 if (mask == HN_RXINFO_ALL) {
5612 /* All found; done */
5616 pi = (const struct rndis_pktinfo *)
5617 ((const uint8_t *)pi + pi->rm_size);
5622 * - If there is no hash value, invalidate the hash info.
5624 if ((mask & HN_RXINFO_HASHVAL) == 0)
5625 info->hash_info = HN_NDIS_HASH_INFO_INVALID;
5629 static __inline bool
5630 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
5633 if (off < check_off) {
5634 if (__predict_true(off + len <= check_off))
5636 } else if (off > check_off) {
5637 if (__predict_true(check_off + check_len <= off))
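/*
 * Validate a RNDIS data message: check the length fields, and check
 * the data/OOB/pktinfo regions for overflow, misalignment and mutual
 * overlap, before handing the payload to hn_rxpkt().
 */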
5644 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
5646 const struct rndis_packet_msg *pkt;
5647 struct hn_rxinfo info;
5648 int data_off, pktinfo_off, data_len, pktinfo_len;
5653 if (__predict_false(dlen < sizeof(*pkt))) {
5654 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
5659 if (__predict_false(dlen < pkt->rm_len)) {
5660 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
5661 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
5664 if (__predict_false(pkt->rm_len <
5665 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
5666 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
5667 "msglen %u, data %u, oob %u, pktinfo %u\n",
5668 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
5669 pkt->rm_pktinfolen);
5672 if (__predict_false(pkt->rm_datalen == 0)) {
5673 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
5680 #define IS_OFFSET_INVALID(ofs) \
5681 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
5682 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
5684 /* XXX Hyper-V does not meet data offset alignment requirement */
5685 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
5686 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5687 "data offset %u\n", pkt->rm_dataoffset);
5690 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
5691 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
5692 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5693 "oob offset %u\n", pkt->rm_oobdataoffset);
5696 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
5697 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
5698 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5699 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
5703 #undef IS_OFFSET_INVALID
5705 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
5706 data_len = pkt->rm_datalen;
5707 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
5708 pktinfo_len = pkt->rm_pktinfolen;
5711 * Check OOB coverage.
5713 if (__predict_false(pkt->rm_oobdatalen != 0)) {
5714 int oob_off, oob_len;
5716 if_printf(rxr->hn_ifp, "got oobdata\n");
5717 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
5718 oob_len = pkt->rm_oobdatalen;
5720 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
5721 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5722 "oob overflow, msglen %u, oob abs %d len %d\n",
5723 pkt->rm_len, oob_off, oob_len);
5728 * Check against data.
5730 if (hn_rndis_check_overlap(oob_off, oob_len,
5731 data_off, data_len)) {
5732 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5733 "oob overlaps data, oob abs %d len %d, "
5734 "data abs %d len %d\n",
5735 oob_off, oob_len, data_off, data_len);
5740 * Check against pktinfo.
5742 if (pktinfo_len != 0 &&
5743 hn_rndis_check_overlap(oob_off, oob_len,
5744 pktinfo_off, pktinfo_len)) {
5745 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5746 "oob overlaps pktinfo, oob abs %d len %d, "
5747 "pktinfo abs %d len %d\n",
5748 oob_off, oob_len, pktinfo_off, pktinfo_len);
5754 * Check per-packet-info coverage and find useful per-packet-info.
5756 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
5757 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
5758 info.hash_info = HN_NDIS_HASH_INFO_INVALID;
5759 if (__predict_true(pktinfo_len != 0)) {
5763 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
5764 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5765 "pktinfo overflow, msglen %u, "
5766 "pktinfo abs %d len %d\n",
5767 pkt->rm_len, pktinfo_off, pktinfo_len);
5772 * Check packet info coverage.
5774 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
5775 data_off, data_len);
5776 if (__predict_false(overlap)) {
5777 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5778 "pktinfo overlap data, pktinfo abs %d len %d, "
5779 "data abs %d len %d\n",
5780 pktinfo_off, pktinfo_len, data_off, data_len);
5785 * Find useful per-packet-info.
5787 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
5788 pktinfo_len, &info);
5789 if (__predict_false(error)) {
5790 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
5796 if (__predict_false(data_off + data_len > pkt->rm_len)) {
5797 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5798 "data overflow, msglen %u, data abs %d len %d\n",
5799 pkt->rm_len, data_off, data_len);
5802 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
5805 static __inline void
5806 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
5808 const struct rndis_msghdr *hdr;
5810 if (__predict_false(dlen < sizeof(*hdr))) {
5811 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
5816 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
5817 /* Hot data path. */
5818 hn_rndis_rx_data(rxr, data, dlen);
5823 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
5824 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
5826 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
5830 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
5832 const struct hn_nvs_hdr *hdr;
5834 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
5835 if_printf(sc->hn_ifp, "invalid nvs notify\n");
5838 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
5840 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
5841 /* Useless; ignore */
5844 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
5848 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
5849 const struct vmbus_chanpkt_hdr *pkt)
5851 struct hn_nvs_sendctx *sndc;
5853 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
5854 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
5855 VMBUS_CHANPKT_DATALEN(pkt));
5858 * 'sndc' CAN NOT be accessed anymore, since it can be freed by its callback.
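/*
 * An RXBUF channel packet carries one or more RNDIS messages in the
 * shared RX buffer; validate the packet, feed each range to the
 * RNDIS layer, and then ack the RXBUF back to the host.
 */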
5864 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5865 const struct vmbus_chanpkt_hdr *pkthdr)
5867 const struct vmbus_chanpkt_rxbuf *pkt;
5868 const struct hn_nvs_hdr *nvs_hdr;
5871 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
5872 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
5875 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
5877 /* Make sure that this is a RNDIS message. */
5878 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
5879 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
5884 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
5885 if (__predict_false(hlen < sizeof(*pkt))) {
5886 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
5889 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
5891 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
5892 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
5897 count = pkt->cp_rxbuf_cnt;
5898 if (__predict_false(hlen <
5899 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
5900 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
5904 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
5905 for (i = 0; i < count; ++i) {
5908 ofs = pkt->cp_rxbuf[i].rb_ofs;
5909 len = pkt->cp_rxbuf[i].rb_len;
5910 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
5911 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
5912 "ofs %d, len %d\n", i, ofs, len);
5915 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
5919 * Ack the consumed RXBUF associated w/ this channel packet,
5920 * so that this RXBUF can be recycled by the hypervisor.
5922 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
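/*
 * Send a RNDIS ack completion back to the host, so that the RXBUF
 * region consumed by the just-processed channel packet can be
 * recycled; EAGAIN, i.e. a full TX bufring, is retried.
 */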
5926 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5929 struct hn_nvs_rndis_ack ack;
5932 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
5933 ack.nvs_status = HN_NVS_STATUS_OK;
5937 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
5938 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
5939 if (__predict_false(error == EAGAIN)) {
5942 * This should _not_ happen in the real world, since the
5943 * consumption of the TX bufring from the TX path is controlled.
5946 if (rxr->hn_ack_failed == 0)
5947 if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
5948 rxr->hn_ack_failed++;
5955 if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
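/*
 * Per-channel interrupt callback: pull channel packets out of the
 * bufring, expanding the packet buffer on ENOBUFS, and dispatch them
 * by type (completion, RXBUF, or inband notify).
 */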
5960 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
5962 struct hn_rx_ring *rxr = xrxr;
5963 struct hn_softc *sc = rxr->hn_ifp->if_softc;
5966 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
5969 pktlen = rxr->hn_pktbuf_len;
5970 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
5971 if (__predict_false(error == ENOBUFS)) {
5976 * Expand channel packet buffer.
5979 * Use M_WAITOK here, since allocation failure
5982 nlen = rxr->hn_pktbuf_len * 2;
5983 while (nlen < pktlen)
5985 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
5987 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
5988 rxr->hn_pktbuf_len, nlen);
5990 free(rxr->hn_pktbuf, M_DEVBUF);
5991 rxr->hn_pktbuf = nbuf;
5992 rxr->hn_pktbuf_len = nlen;
5995 } else if (__predict_false(error == EAGAIN)) {
5996 /* No more channel packets; done! */
5999 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
6001 switch (pkt->cph_type) {
6002 case VMBUS_CHANPKT_TYPE_COMP:
6003 hn_nvs_handle_comp(sc, chan, pkt);
6006 case VMBUS_CHANPKT_TYPE_RXBUF:
6007 hn_nvs_handle_rxbuf(rxr, chan, pkt);
6010 case VMBUS_CHANPKT_TYPE_INBAND:
6011 hn_nvs_handle_notify(sc, pkt);
6015 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
6020 hn_chan_rollup(rxr, rxr->hn_txr);
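/*
 * Module initialization: set up the VF map, sanitize the TX
 * taskqueue tunables, and create the shared TX taskqueues when
 * running on Hyper-V in the global taskqueue mode.
 */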
6024 hn_sysinit(void *arg __unused)
6029 * Initialize VF map.
6031 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
6032 hn_vfmap_size = HN_VFMAP_SIZE_DEF;
6033 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
6037 * Fix the # of TX taskqueues.
6039 if (hn_tx_taskq_cnt <= 0)
6040 hn_tx_taskq_cnt = 1;
6041 else if (hn_tx_taskq_cnt > mp_ncpus)
6042 hn_tx_taskq_cnt = mp_ncpus;
6045 * Fix the TX taskqueue mode.
6047 switch (hn_tx_taskq_mode) {
6048 case HN_TX_TASKQ_M_INDEP:
6049 case HN_TX_TASKQ_M_GLOBAL:
6050 case HN_TX_TASKQ_M_EVTTQ:
6053 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
6057 if (vm_guest != VM_GUEST_HV)
6060 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
6063 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
6064 M_DEVBUF, M_WAITOK);
6065 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
6066 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
6067 taskqueue_thread_enqueue, &hn_tx_taskque[i]);
6068 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
6072 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
6075 hn_sysuninit(void *arg __unused)
6078 if (hn_tx_taskque != NULL) {
6081 for (i = 0; i < hn_tx_taskq_cnt; ++i)
6082 taskqueue_free(hn_tx_taskque[i]);
6083 free(hn_tx_taskque, M_DEVBUF);
6086 if (hn_vfmap != NULL)
6087 free(hn_vfmap, M_DEVBUF);
6088 rm_destroy(&hn_vfmap_lock);
6090 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);