/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/queue.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>
#include <sys/eventhandler.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/rndis.h>
#include <net/rss_config.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"
#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

#define HN_VFMAP_SIZE_DEF		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512
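
/*
 * Worst-case RNDIS packet message length: the fixed message header
 * plus one per-packet-info each for the hash value, VLAN tag, LSOv2
 * and TX checksum offload.
 */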
#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1
#define HN_LOCK_INIT(sc)		\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)
#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)
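
/*
 * Rounded-up packet size, including the leading RNDIS packet message;
 * used for chimney sending buffer and aggregation space accounting.
 */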
#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))

#ifdef RSS
#define HN_RING_IDX2CPU(sc, idx)	rss_getcpu((idx) % rss_getnumbuckets())
#else
#define HN_RING_IDX2CPU(sc, idx)	(((sc)->hn_cpu + (idx)) % mp_ncpus)
#endif
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004
struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};

struct hn_rxvf_setarg {
	struct hn_rx_ring	*rxr;
	struct ifnet		*vf_ifp;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static void			hn_ifnet_event(void *, struct ifnet *, int);
static void			hn_ifaddr_event(void *, struct ifnet *);
static void			hn_ifnet_attevent(void *, struct ifnet *);
static void			hn_ifnet_detevent(void *, struct ifnet *);

static bool			hn_ismyvf(const struct hn_softc *,
				    const struct ifnet *);
static void			hn_rxvf_change(struct hn_softc *,
				    struct ifnet *, bool);
static void			hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void			hn_rxvf_set_task(void *, int);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);
static void			hn_rndis_init_fixat(struct hn_softc *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
#ifndef RSS
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_polling_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);

static void			hn_stop(struct hn_softc *, bool);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static bool			hn_synth_attachable(const struct hn_softc *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_disable_rx(struct hn_softc *);
static void			hn_drain_rxtx(struct hn_softc *, int);
static void			hn_polling(struct hn_softc *, u_int);
static void			hn_chan_polling(struct vmbus_channel *, u_int);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static int			hn_set_rxfilter(struct hn_softc *, uint32_t);
static int			hn_rxfilter_config(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");
/* Trust tcp segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust udp datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");
/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");
/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_tx_taskq_cnt = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN,
    &hn_tx_taskq_cnt, 0, "# of TX taskqueues");
#define HN_TX_TASKQ_M_INDEP	0
#define HN_TX_TASKQ_M_GLOBAL	1
#define HN_TX_TASKQ_M_EVTTQ	2

static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN,
    &hn_tx_taskq_mode, 0, "TX taskqueue modes: "
    "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif
/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");
/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 512;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");
SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vflist_sysctl, "A", "VF list");

SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
    0, 0, hn_vfmap_sysctl, "A", "VF mapping");

static u_int			hn_cpu_index;	/* next CPU for channel */
static struct taskqueue		**hn_tx_taskque;	/* shared TX taskqueues */

static struct rmlock		hn_vfmap_lock;
static int			hn_vfmap_size;
static struct ifnet		**hn_vfmap;
#ifndef RSS
static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
#endif	/* !RSS */
static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),
	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);
#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif
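
/*
 * Send an RNDIS packet via the scatter/gather list of guest physical
 * addresses that hn_encap() set up in the TX ring (txr->hn_gpa).
 */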
static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}
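
/*
 * Send an RNDIS packet that has already been copied into the chimney
 * (pre-posted send) buffer; only the chimney index and size travel
 * with the NVS message.
 */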
static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}
static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;
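
	/*
	 * Scan the chimney bitmap for a free slot: locate the first
	 * clear bit in each word with ffsl() and claim it with an
	 * atomic test-and-set, so no lock is needed even though
	 * multiple TX rings may allocate slots concurrently.
	 */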
	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		/* Lost the race to another thread; try the next bit. */
		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}
static void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
#if defined(INET6) || defined(INET)

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);
}
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn)
{
	const struct ether_vlan_header *evl;
	const struct tcphdr *th;
	int ehlen;

	*tcpsyn = 0;

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, const struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TCP) {
		const struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);
		if (th->th_flags & TH_SYN)
			*tcpsyn = 1;
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		const struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP)
			return (m_head);

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));
		if (th->th_flags & TH_SYN)
			*tcpsyn = 1;
	}
#endif
	return (m_head);
}

#undef PULLUP_HDR

#endif	/* INET6 || INET */
static int
hn_set_rxfilter(struct hn_softc *sc, uint32_t filter)
{
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}
static int
hn_rxfilter_config(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;

	HN_LOCK_ASSERT(sc);

	if ((ifp->if_flags & IFF_PROMISC) ||
	    (sc->hn_flags & HN_FLAG_RXVF)) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}
	return (hn_set_rxfilter(sc, filter));
}
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
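
/*
 * Software TX queue depth: the hn_tx_swq_depth tunable, but never
 * less than the number of TX descriptors on the ring.
 */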
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}
static void
hn_rss_ind_fixup(struct hn_softc *sc)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i, nchan;

	nchan = sc->hn_rx_ring_inuse;
	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}
static void
hn_rxvf_set_task(void *xarg, int pending __unused)
{
	struct hn_rxvf_setarg *arg = xarg;

	arg->rxr->hn_rxvf_ifp = arg->vf_ifp;
}

static void
hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp)
{
	struct hn_rx_ring *rxr;
	struct hn_rxvf_setarg arg;
	struct task task;
	int i;

	HN_LOCK_ASSERT(sc);

	TASK_INIT(&task, 0, hn_rxvf_set_task, &arg);

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
		rxr = &sc->hn_rx_ring[i];

		if (i < sc->hn_rx_ring_inuse) {
			arg.rxr = rxr;
			arg.vf_ifp = vf_ifp;
			vmbus_chan_run_task(rxr->hn_chan, &task);
		} else {
			rxr->hn_rxvf_ifp = vf_ifp;
		}
	}
}
static bool
hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp)
{
	const struct ifnet *hn_ifp;

	hn_ifp = sc->hn_ifp;

	if (ifp == hn_ifp)
		return (false);

	if (ifp->if_alloctype != IFT_ETHER)
		return (false);

	/* Ignore lagg/vlan interfaces */
	if (strcmp(ifp->if_dname, "lagg") == 0 ||
	    strcmp(ifp->if_dname, "vlan") == 0)
		return (false);

	if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0)
		return (false);

	return (true);
}
static void
hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
{
	struct ifnet *hn_ifp;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto out;

	if (!hn_ismyvf(sc, ifp))
		goto out;
	hn_ifp = sc->hn_ifp;

	if (rxvf) {
		if (sc->hn_flags & HN_FLAG_RXVF)
			goto out;

		sc->hn_flags |= HN_FLAG_RXVF;
		hn_rxfilter_config(sc);
	} else {
		if (!(sc->hn_flags & HN_FLAG_RXVF))
			goto out;

		sc->hn_flags &= ~HN_FLAG_RXVF;
		if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
			hn_rxfilter_config(sc);
		else
			hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE);
	}

	hn_nvs_set_datapath(sc,
	    rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTHETIC);

	hn_rxvf_set(sc, rxvf ? ifp : NULL);

	if (rxvf) {
		hn_suspend_mgmt(sc);
		sc->hn_link_flags &=
		    ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG);
		if_link_state_change(hn_ifp, LINK_STATE_DOWN);
	} else {
		hn_resume_mgmt(sc);
	}

	devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname,
	    rxvf ? "VF_UP" : "VF_DOWN", NULL);

	if (bootverbose) {
		if_printf(hn_ifp, "datapath is switched %s %s\n",
		    rxvf ? "to" : "from", ifp->if_xname);
	}
out:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_event(void *arg, struct ifnet *ifp, int event)
{

	if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN)
		return;
	hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP);
}

static void
hn_ifaddr_event(void *arg, struct ifnet *ifp)
{

	hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
}
static void
hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	if (sc->hn_vf_ifp != NULL) {
		if_printf(sc->hn_ifp, "%s was attached as VF\n",
		    sc->hn_vf_ifp->if_xname);
		goto done;
	}

	rm_wlock(&hn_vfmap_lock);

	if (ifp->if_index >= hn_vfmap_size) {
		struct ifnet **newmap;
		int newsize;

		newsize = ifp->if_index + HN_VFMAP_SIZE_DEF;
		newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF,
		    M_WAITOK | M_ZERO);

		memcpy(newmap, hn_vfmap,
		    sizeof(struct ifnet *) * hn_vfmap_size);
		free(hn_vfmap, M_DEVBUF);

		hn_vfmap = newmap;
		hn_vfmap_size = newsize;
	}

	KASSERT(hn_vfmap[ifp->if_index] == NULL,
	    ("%s: ifindex %d was mapped to %s",
	     ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname));
	hn_vfmap[ifp->if_index] = sc->hn_ifp;

	rm_wunlock(&hn_vfmap_lock);

	sc->hn_vf_ifp = ifp;
done:
	HN_UNLOCK(sc);
}
static void
hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
{
	struct hn_softc *sc = xsc;

	HN_LOCK(sc);

	if (sc->hn_vf_ifp == NULL)
		goto done;

	if (!hn_ismyvf(sc, ifp))
		goto done;

	sc->hn_vf_ifp = NULL;

	rm_wlock(&hn_vfmap_lock);

	KASSERT(ifp->if_index < hn_vfmap_size,
	    ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size));
	if (hn_vfmap[ifp->if_index] != NULL) {
		KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp,
		    ("%s: ifindex %d was mapped to %s",
		     ifp->if_xname, ifp->if_index,
		     hn_vfmap[ifp->if_index]->if_xname));
		hn_vfmap[ifp->if_index] = NULL;
	}

	rm_wunlock(&hn_vfmap_lock);
done:
	HN_UNLOCK(sc);
}
/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
		0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};

static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
	    &g_net_vsc_device_type) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}
static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);
	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;
	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) {
		int i;

		sc->hn_tx_taskqs =
		    malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
		    M_DEVBUF, M_WAITOK);
		for (i = 0; i < hn_tx_taskq_cnt; ++i) {
			sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx",
			    M_WAITOK, taskqueue_thread_enqueue,
			    &sc->hn_tx_taskqs[i]);
			taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET,
			    "%s tx%d", device_get_nameunit(dev), i);
		}
	} else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) {
		sc->hn_tx_taskqs = hn_tx_taskque;
	}
	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);
	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions, which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if error happened later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}
#ifdef RSS
	if (ring_cnt > rss_getnumbuckets())
		ring_cnt = rss_getnumbuckets();
#endif

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif
	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
	/*
	 * Create enough TX/RX rings, even if only limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;
	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}
	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}
	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;
#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif
	/*
	 * Fixup TX stuffs after synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);
	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
#ifndef RSS
	/*
	 * Don't allow RSS key/indirect table changes, if RSS is defined.
	 */
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
#endif
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling",
	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_polling_sysctl, "I",
	    "Polling frequency: [100,1000000], 0 disable polling");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_vf_sysctl, "A", "Virtual Function's name");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxvf_sysctl, "A", "activated Virtual Function's name");
	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
	/*
	 * Setup the ifnet for this interface.
	 */
	ifp->if_baudrate = IF_Gbps(10);
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}
	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
#ifdef foo
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif

	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}
	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default, they still can
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}
	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
	    hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
	sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
	    hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);

	/*
	 * NOTE:
	 * Subscribe ether_ifattach event, instead of ifnet_arrival event,
	 * since interface's LLADDR is needed; interface LLADDR is not
	 * available when ifnet_arrival event is triggered.
	 */
	sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event,
	    hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY);
	sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event,
	    hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY);

	return (0);
failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}
static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp, *vf_ifp;

	if (sc->hn_ifaddr_evthand != NULL)
		EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
	if (sc->hn_ifnet_evthand != NULL)
		EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand);
	if (sc->hn_ifnet_atthand != NULL) {
		EVENTHANDLER_DEREGISTER(ether_ifattach_event,
		    sc->hn_ifnet_atthand);
	}
	if (sc->hn_ifnet_dethand != NULL) {
		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
		    sc->hn_ifnet_dethand);
	}

	vf_ifp = sc->hn_vf_ifp;
	__compiler_membar();
	if (vf_ifp != NULL)
		hn_ifnet_detevent(sc, vf_ifp);

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		/*
		 * In case that the vmbus missed the orphan handler
		 * installation.
		 */
		vmbus_xact_ctx_orphan(sc->hn_xact);
	}

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc, true);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * stuffs have to be suspended manually here.
			 */
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);
		}
		HN_UNLOCK(sc);
		ether_ifdetach(ifp);
	}

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) {
		int i;

		for (i = 0; i < hn_tx_taskq_cnt; ++i)
			taskqueue_free(sc->hn_tx_taskqs[i]);
		free(sc->hn_tx_taskqs, M_DEVBUF);
	}
	taskqueue_free(sc->hn_mgmt_taskq0);

	if (sc->hn_xact != NULL) {
		/*
		 * Uninstall the orphan handler _before_ the xact is
		 * destructed.
		 */
		vmbus_chan_unset_orphan(sc->hn_prichan);
		vmbus_xact_ctx_destroy(sc->hn_xact);
	}

	if_free(ifp);

	HN_LOCK_DESTROY(sc);
	return (0);
}
static int
hn_shutdown(device_t dev)
{

	return (0);
}
static void
hn_link_status(struct hn_softc *sc)
{
	uint32_t link_status;
	int error;

	error = hn_rndis_get_linkstatus(sc, &link_status);
	if (error) {
		/* XXX what to do? */
		return;
	}

	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
	else
		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp,
	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
	    LINK_STATE_UP : LINK_STATE_DOWN);
}
static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		return;
	hn_link_status(sc);
}

static void
hn_netchg_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Prevent any link status checks from running. */
	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;

	/*
	 * Fake up a [link down --> link up] state change; 5 seconds
	 * delay is used, which closely simulates miibus reaction
	 * upon link down event.
	 */
	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
	    &sc->hn_netchg_status, 5 * hz);
}

static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Re-allow link status checks. */
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
	hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}
static __inline int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m = *m_head;
	int error;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		struct mbuf *m_new;

		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		if (m_new == NULL)
			return ENOBUFS;
		*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	}
	if (!error) {
		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
		    BUS_DMASYNC_PREWRITE);
		txd->flags |= HN_TXD_FLAG_DMAMAP;
	}
	return error;
}
static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return 0;

	/* Last reference is gone; free any txdescs aggregated under this one. */
	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("recursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
	txr->hn_txdesc_avail++;
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else	/* HN_USE_TXDESC_BUFRING */
#ifdef HN_DEBUG
	atomic_add_int(&txr->hn_txdesc_avail, 1);
#endif
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif	/* !HN_USE_TXDESC_BUFRING */

	return 1;
}
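
/*
 * Grab a free txdesc: from the lock-free buf_ring when
 * HN_USE_TXDESC_BUFRING is set, else from the spinlock-protected
 * SLIST freelist.
 */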
static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	txd = SLIST_FIRST(&txr->hn_txlist);
	if (txd != NULL) {
		KASSERT(txr->hn_txdesc_avail > 0,
		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
		txr->hn_txdesc_avail--;
		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
	}
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

	if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
#ifdef HN_DEBUG
		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
#endif	/* HN_USE_TXDESC_BUFRING */
		KASSERT(txd->m == NULL && txd->refs == 0 &&
		    STAILQ_EMPTY(&txd->agg_list) &&
		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
		    txd->chim_size == 0 &&
		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
		txd->flags &= ~HN_TXD_FLAG_ONLIST;
		txd->refs = 1;
	}
	return txd;
}
static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

	/* 0->1 transition will never work */
	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	atomic_add_int(&txd->refs, 1);
}

static __inline void
hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
{

	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("recursive aggregation on aggregating txdesc"));

	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("already aggregated"));
	KASSERT(STAILQ_EMPTY(&txd->agg_list),
	    ("recursive aggregation on to-be-aggregated txdesc"));

	txd->flags |= HN_TXD_FLAG_ONAGG;
	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
}
static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
	bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
		pending = true;
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	if (!buf_ring_full(txr->hn_txdesc_br))
		pending = true;
#endif
	return (pending);
}
static __inline void
hn_txeof(struct hn_tx_ring *txr)
{
	txr->hn_has_txeof = 0;
	txr->hn_txeof(txr);
}
static void
hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
	struct hn_txdesc *txd = sndc->hn_cbarg;
	struct hn_tx_ring *txr;

	txr = txd->txr;
	KASSERT(txr->hn_chan == chan,
	    ("channel mismatch, on chan%u, should be chan%u",
	     vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan)));

	txr->hn_has_txeof = 1;
	hn_txdesc_put(txr, txd);

	++txr->hn_txdone_cnt;
	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
		txr->hn_txdone_cnt = 0;
		if (txr->hn_oactive)
			hn_txeof(txr);
	}
}
static void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
	tcp_lro_flush_all(&rxr->hn_lro);
#endif

	/*
	 * NOTE:
	 * 'txr' could be NULL, if multiple channels and
	 * ifnet.if_start method are enabled.
	 */
	if (txr == NULL || !txr->hn_has_txeof)
		return;

	txr->hn_txdone_cnt = 0;
	hn_txeof(txr);
}
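
/*
 * NOTE:
 * RNDIS packet message offsets count from the rm_dataoffset field,
 * not from the beginning of the message; convert an in-message
 * offset accordingly.
 */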
static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
	    ("invalid RNDIS packet msg offset %u", ofs));
	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}
static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
{
	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
	struct rndis_pktinfo *pi;

	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

	/*
	 * Per-packet-info does not move; it only grows.
	 *
	 * NOTE:
	 * rm_pktinfooffset in this phase counts from the beginning
	 * of rndis_packet_msg.
	 */
	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
	    pkt->rm_pktinfolen);
	pkt->rm_pktinfolen += pi_size;

	pi->rm_size = pi_size;
	pi->rm_type = pi_type;
	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

	return (pi->rm_data);
}
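
/*
 * Send the pending aggregated txdesc down to the NVS layer and reset
 * the per-ring aggregation state.
 */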
static __inline int
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;
	struct mbuf *m;
	int error, pkts;

	txd = txr->hn_agg_txd;
	KASSERT(txd != NULL, ("no aggregate txdesc"));

	/*
	 * Since hn_txpkt() will reset this temporary stat, save
	 * it now, so that oerrors can be updated properly, if
	 * hn_txpkt() ever fails.
	 */
	pkts = txr->hn_stat_pkts;

	/*
	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
	 * failure, save it for later freeing, if hn_txpkt() ever
	 * fails.
	 */
	m = txd->m;
	error = hn_txpkt(ifp, txr, txd);
	if (__predict_false(error)) {
		/* txd is freed, but m is not. */
		m_freem(m);

		txr->hn_flush_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
	}

	/* Reset all aggregation states. */
	txr->hn_agg_txd = NULL;
	txr->hn_agg_szleft = 0;
	txr->hn_agg_pktleft = 0;
	txr->hn_agg_prevpkt = NULL;

	return (error);
}
static void *
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    int pktsize)
{
	void *chim;

	if (txr->hn_agg_txd != NULL) {
		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
			int olen;

			/*
			 * Update the previous RNDIS packet's total length,
			 * it can be increased due to the mandatory alignment
			 * padding for this RNDIS packet.  And update the
			 * aggregating txdesc's chimney sending buffer size
			 * accordingly.
			 *
			 * XXX
			 * Zero-out the padding, as required by the RNDIS spec.
			 */
			olen = pkt->rm_len;
			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
			agg_txd->chim_size += pkt->rm_len - olen;

			/* Link this txdesc to the parent. */
			hn_txdesc_agg(agg_txd, txd);

			chim = (uint8_t *)pkt + pkt->rm_len;
			/* Save the current packet for later fixup. */
			txr->hn_agg_prevpkt = chim;

			txr->hn_agg_pktleft--;
			txr->hn_agg_szleft -= pktsize;
			if (txr->hn_agg_szleft <=
			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
				/*
				 * Probably can't aggregate more packets,
				 * flush this aggregating txdesc proactively.
				 */
				txr->hn_agg_pktleft = 0;
			}
			/* Done! */
			return (chim);
		}
		hn_flush_txagg(ifp, txr);
	}
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	txr->hn_tx_chimney_tried++;
	txd->chim_index = hn_chim_alloc(txr->hn_sc);
	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
		return (NULL);
	txr->hn_tx_chimney++;

	chim = txr->hn_sc->hn_chim +
	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);
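
	/*
	 * No aggregation is pending; if aggregation is enabled and the
	 * chimney buffer leaves room for at least one more minimum sized
	 * packet, start a new aggregation run with this txdesc as the
	 * parent.
	 */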
	if (txr->hn_agg_pktmax > 1 &&
	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
		txr->hn_agg_txd = txd;
		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
		txr->hn_agg_prevpkt = chim;
	}
	return (chim);
}
/*
 * NOTE:
 * If this function fails, then both txd and m_head0 will be freed.
 */
static int
hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head0)
{
	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
	int error, nsegs, i;
	struct mbuf *m_head = *m_head0;
	struct rndis_packet_msg *pkt;
	uint32_t *pi_data;
	void *chim = NULL;
	int pkt_hlen, pkt_size;

	pkt = txd->rndis_pkt;
	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
	if (pkt_size < txr->hn_chim_size) {
		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
		if (chim != NULL)
			pkt = chim;
	} else {
		if (txr->hn_agg_txd != NULL)
			hn_flush_txagg(ifp, txr);
	}
	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
	pkt->rm_len = m_head->m_pkthdr.len;
	pkt->rm_dataoffset = 0;
	pkt->rm_datalen = m_head->m_pkthdr.len;
	pkt->rm_oobdataoffset = 0;
	pkt->rm_oobdatalen = 0;
	pkt->rm_oobdataelements = 0;
	pkt->rm_pktinfooffset = sizeof(*pkt);
	pkt->rm_pktinfolen = 0;
	pkt->rm_vchandle = 0;
	pkt->rm_reserved = 0;
	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
		/*
		 * Set the hash value for this packet, so that the host could
		 * dispatch the TX done event for this packet back to this TX
		 * ring's channel.
		 */
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
		*pi_data = txr->hn_tx_idx;
	}

	if (m_head->m_flags & M_VLANTAG) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
		*pi_data = NDIS_VLAN_INFO_MAKE(
		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
	}
	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
#ifdef INET
		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET6
		{
			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#endif	/* INET6 || INET */
	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
		if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
			*pi_data = NDIS_TXCSUM_INFO_IPV6;
		} else {
			*pi_data = NDIS_TXCSUM_INFO_IPV4;
			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
		}

		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
		else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP6_UDP))
			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
	}
2208 pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
2209 /* Fixup RNDIS packet message total length */
2210 pkt->rm_len += pkt_hlen;
2211 /* Convert RNDIS packet message offsets */
2212 pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen);
2213 pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);
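/*
 * Editor's note, a sketch under the assumption that
 * hn_rndis_pktmsg_offset() subtracts the 8-byte RNDIS message
 * header (rm_type + rm_len): per the RNDIS spec, the per-packet
 * offsets are relative to the rm_dataoffset field rather than to
 * the start of the message.
 */
#if 0
uint32_t hlen = 48;	/* hypothetical offset from message start */
uint32_t rndis_ofs = hlen -
    __offsetof(struct rndis_packet_msg, rm_dataoffset);	/* 40 */
#endif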
2216 * Fast path: Chimney sending.
2219 struct hn_txdesc *tgt_txd = txd;
2221 if (txr->hn_agg_txd != NULL) {
2222 tgt_txd = txr->hn_agg_txd;
2228 KASSERT(pkt == chim,
2229 ("RNDIS pkt not in chimney sending buffer"));
2230 KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
2231 ("chimney sending buffer is not used"));
2232 tgt_txd->chim_size += pkt->rm_len;
2234 m_copydata(m_head, 0, m_head->m_pkthdr.len,
2235 ((uint8_t *)chim) + pkt_hlen);
2237 txr->hn_gpa_cnt = 0;
2238 txr->hn_sendpkt = hn_txpkt_chim;
2242 KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
2243 KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
2244 ("chimney buffer is used"));
2245 KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));
2247 error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
2248 if (__predict_false(error)) {
2252 * This mbuf is not linked w/ the txd yet, so free it now.
2257 freed = hn_txdesc_put(txr, txd);
2259 ("fail to free txd upon txdma error"));
2261 txr->hn_txdma_failed++;
2262 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2267 /* +1 RNDIS packet message */
2268 txr->hn_gpa_cnt = nsegs + 1;
2270 /* send packet with page buffer */
2271 txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
2272 txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
2273 txr->hn_gpa[0].gpa_len = pkt_hlen;
2276 * Fill the page buffers with mbuf info after the page
2277 * buffer for the RNDIS packet message.
2279 for (i = 0; i < nsegs; ++i) {
2280 struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];
2282 gpa->gpa_page = atop(segs[i].ds_addr);
2283 gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
2284 gpa->gpa_len = segs[i].ds_len;
2287 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
2289 txr->hn_sendpkt = hn_txpkt_sglist;
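/*
 * Editor's sketch (hypothetical address, assuming 4KB pages): how a
 * physical address is split into the vmbus_gpa page/offset pair
 * above.
 */
#if 0
bus_addr_t paddr = 0x12345f00;
uint64_t page = atop(paddr);		/* 0x12345 */
uint64_t ofs = paddr & PAGE_MASK;	/* 0xf00 */
#endif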
2293 /* Set the completion routine */
2294 hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);
2296 /* Update temporary stats for later use. */
2297 txr->hn_stat_pkts++;
2298 txr->hn_stat_size += m_head->m_pkthdr.len;
2299 if (m_head->m_flags & M_MCAST)
2300 txr->hn_stat_mcasts++;
2307 * If this function fails, then txd will be freed, but the mbuf
2308 * associated w/ the txd will _not_ be freed.
2311 hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
2313 int error, send_failed = 0, has_bpf;
2316 has_bpf = bpf_peers_present(ifp->if_bpf);
2319 * Make sure that this txd and any aggregated txds are not
2320 * freed before ETHER_BPF_MTAP.
2322 hn_txdesc_hold(txd);
2324 error = txr->hn_sendpkt(txr, txd);
2327 const struct hn_txdesc *tmp_txd;
2329 ETHER_BPF_MTAP(ifp, txd->m);
2330 STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
2331 ETHER_BPF_MTAP(ifp, tmp_txd->m);
2334 if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
2335 #ifdef HN_IFSTART_SUPPORT
2336 if (!hn_use_if_start)
2339 if_inc_counter(ifp, IFCOUNTER_OBYTES,
2341 if (txr->hn_stat_mcasts != 0) {
2342 if_inc_counter(ifp, IFCOUNTER_OMCASTS,
2343 txr->hn_stat_mcasts);
2346 txr->hn_pkts += txr->hn_stat_pkts;
2350 hn_txdesc_put(txr, txd);
2352 if (__predict_false(error)) {
2356 * This should happen only very rarely.
2358 * XXX Too many RX packets to be acked, or too many sideband
2359 * commands to run? Ask netvsc_channel_rollup()
2360 * to kick-start it later.
2362 txr->hn_has_txeof = 1;
2364 txr->hn_send_failed++;
2367 * Try sending again after setting hn_has_txeof,
2368 * in case we missed the last
2369 * netvsc_channel_rollup().
2373 if_printf(ifp, "send failed\n");
2376 * Caller will perform further processing on the
2377 * associated mbuf, so don't free it in hn_txdesc_put();
2378 * only unload it from the DMA map in hn_txdesc_put(), if it was loaded.
2382 freed = hn_txdesc_put(txr, txd);
2384 ("fail to free txd upon send error"));
2386 txr->hn_send_failed++;
2389 /* Reset temporary stats after this sending is done. */
2390 txr->hn_stat_size = 0;
2391 txr->hn_stat_pkts = 0;
2392 txr->hn_stat_mcasts = 0;
2398 * Append the specified data to the indicated mbuf chain;
2399 * extend the mbuf chain if the new data does not fit in the existing space.
2402 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
2403 * There should be an equivalent in the kernel mbuf code,
2404 * but there does not appear to be one yet.
2406 * Differs from m_append() in that additional mbufs are
2407 * allocated with cluster size MJUMPAGESIZE, and filled accordingly.
2410 * Return 1 if able to complete the job; otherwise 0.
2413 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
2416 int remainder, space;
2418 for (m = m0; m->m_next != NULL; m = m->m_next)
2421 space = M_TRAILINGSPACE(m);
2424 * Copy into available space.
2426 if (space > remainder)
2428 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
2433 while (remainder > 0) {
2435 * Allocate a new mbuf; could check space
2436 * and allocate a cluster instead.
2438 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
2441 n->m_len = min(MJUMPAGESIZE, remainder);
2442 bcopy(cp, mtod(n, caddr_t), n->m_len);
2444 remainder -= n->m_len;
2448 if (m0->m_flags & M_PKTHDR)
2449 m0->m_pkthdr.len += len - remainder;
2451 return (remainder == 0);
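/*
 * Editor's usage sketch for hv_m_append() (hypothetical buffer):
 * the trailing space of the chain is filled first, then
 * MJUMPAGESIZE clusters are allocated for the remainder.
 */
#if 0
char payload[3000];
struct mbuf *m = m_gethdr(M_NOWAIT, MT_DATA);
if (m != NULL && !hv_m_append(m, sizeof(payload), payload))
	m_freem(m);	/* cluster allocation failed */
#endif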
2454 #if defined(INET) || defined(INET6)
2456 hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
2458 #if __FreeBSD_version >= 1100095
2459 if (hn_lro_mbufq_depth) {
2460 tcp_lro_queue_mbuf(lc, m);
2464 return tcp_lro_rx(lc, m, 0);
2469 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2470 const struct hn_rxinfo *info)
2474 int size, do_lro = 0, do_csum = 1;
2477 /* If the VF is active, inject the packet through the VF */
2478 ifp = rxr->hn_rxvf_ifp ? rxr->hn_rxvf_ifp : rxr->hn_ifp;
2480 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
2483 * See the NOTE of hn_rndis_init_fixat(). This
2484 * function can be reached immediately after the
2485 * RNDIS is initialized but before the ifnet is
2486 * set up on the hn_attach() path; drop the unexpected data packet.
2492 if (dlen <= MHLEN) {
2493 m_new = m_gethdr(M_NOWAIT, MT_DATA);
2494 if (m_new == NULL) {
2495 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2498 memcpy(mtod(m_new, void *), data, dlen);
2499 m_new->m_pkthdr.len = m_new->m_len = dlen;
2500 rxr->hn_small_pkts++;
2503 * Get an mbuf with a cluster. For packets 2K or less,
2504 * get a standard 2K cluster. For anything larger, get a
2505 * 4K cluster. Any buffers larger than 4K can cause problems
2506 * if looped around to the Hyper-V TX channel, so avoid them.
2509 if (dlen > MCLBYTES) {
2511 size = MJUMPAGESIZE;
2514 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2515 if (m_new == NULL) {
2516 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2520 hv_m_append(m_new, dlen, data);
2522 m_new->m_pkthdr.rcvif = ifp;
2524 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2527 /* receive side checksum offload */
2528 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2529 /* IP csum offload */
2530 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2531 m_new->m_pkthdr.csum_flags |=
2532 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2536 /* TCP/UDP csum offload */
2537 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2538 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2539 m_new->m_pkthdr.csum_flags |=
2540 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2541 m_new->m_pkthdr.csum_data = 0xffff;
2542 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2550 * As of this writing (Oct 28th, 2016), the host side will turn
2551 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2552 * the do_lro setting here is actually _not_ accurate. We
2553 * depend on the RSS hash type check to reset do_lro.
2555 if ((info->csum_info &
2556 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2557 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2560 const struct ether_header *eh;
2565 if (m_new->m_len < hoff)
2567 eh = mtod(m_new, struct ether_header *);
2568 etype = ntohs(eh->ether_type);
2569 if (etype == ETHERTYPE_VLAN) {
2570 const struct ether_vlan_header *evl;
2572 hoff = sizeof(*evl);
2573 if (m_new->m_len < hoff)
2575 evl = mtod(m_new, struct ether_vlan_header *);
2576 etype = ntohs(evl->evl_proto);
2579 if (etype == ETHERTYPE_IP) {
2582 pr = hn_check_iplen(m_new, hoff);
2583 if (pr == IPPROTO_TCP) {
2585 (rxr->hn_trust_hcsum &
2586 HN_TRUST_HCSUM_TCP)) {
2587 rxr->hn_csum_trusted++;
2588 m_new->m_pkthdr.csum_flags |=
2589 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2590 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2591 m_new->m_pkthdr.csum_data = 0xffff;
2594 } else if (pr == IPPROTO_UDP) {
2596 (rxr->hn_trust_hcsum &
2597 HN_TRUST_HCSUM_UDP)) {
2598 rxr->hn_csum_trusted++;
2599 m_new->m_pkthdr.csum_flags |=
2600 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2601 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2602 m_new->m_pkthdr.csum_data = 0xffff;
2604 } else if (pr != IPPROTO_DONE && do_csum &&
2605 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2606 rxr->hn_csum_trusted++;
2607 m_new->m_pkthdr.csum_flags |=
2608 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2613 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2614 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2615 NDIS_VLAN_INFO_ID(info->vlan_info),
2616 NDIS_VLAN_INFO_PRI(info->vlan_info),
2617 NDIS_VLAN_INFO_CFI(info->vlan_info));
2618 m_new->m_flags |= M_VLANTAG;
2621 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2623 m_new->m_pkthdr.flowid = info->hash_value;
2624 hash_type = M_HASHTYPE_OPAQUE_HASH;
2625 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2626 NDIS_HASH_FUNCTION_TOEPLITZ) {
2627 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2631 * do_lro is reset if the hash types are not TCP
2632 * related. See the comment in the csum_flags update section above.
2636 case NDIS_HASH_IPV4:
2637 hash_type = M_HASHTYPE_RSS_IPV4;
2641 case NDIS_HASH_TCP_IPV4:
2642 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2645 case NDIS_HASH_IPV6:
2646 hash_type = M_HASHTYPE_RSS_IPV6;
2650 case NDIS_HASH_IPV6_EX:
2651 hash_type = M_HASHTYPE_RSS_IPV6_EX;
2655 case NDIS_HASH_TCP_IPV6:
2656 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2659 case NDIS_HASH_TCP_IPV6_EX:
2660 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2665 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2666 hash_type = M_HASHTYPE_OPAQUE;
2668 M_HASHTYPE_SET(m_new, hash_type);
2671 * Note: Moved RX completion back to hv_nv_on_receive() so all
2672 * messages (not just data messages) will trigger a response.
2675 if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
2678 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2679 #if defined(INET) || defined(INET6)
2680 struct lro_ctrl *lro = &rxr->hn_lro;
2683 rxr->hn_lro_tried++;
2684 if (hn_lro_rx(lro, m_new) == 0) {
2692 /* We're not holding the lock here, so don't release it */
2693 (*ifp->if_input)(ifp, m_new);
2699 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2701 struct hn_softc *sc = ifp->if_softc;
2702 struct ifreq *ifr = (struct ifreq *)data;
2703 int mask, error = 0;
2707 if (ifr->ifr_mtu > HN_MTU_MAX) {
2714 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2719 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2720 /* Can't change MTU */
2726 if (ifp->if_mtu == ifr->ifr_mtu) {
2732 * Suspend this interface before the synthetic parts are deactivated.
2738 * Detach the synthetic parts, i.e. NVS and RNDIS.
2740 hn_synth_detach(sc);
2743 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2744 * with the new MTU setting.
2746 error = hn_synth_attach(sc, ifr->ifr_mtu);
2753 * Commit the requested MTU after the synthetic parts
2754 * have been successfully attached.
2756 ifp->if_mtu = ifr->ifr_mtu;
2759 * Make sure that various parameters based on MTU are
2760 * still valid after the MTU change.
2762 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2763 hn_set_chim_size(sc, sc->hn_chim_szmax);
2764 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2765 #if __FreeBSD_version >= 1100099
2766 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2767 HN_LRO_LENLIM_MIN(ifp))
2768 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2772 * All done! Resume the interface now.
2782 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2787 if (ifp->if_flags & IFF_UP) {
2788 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2790 * Caller might hold a mutex, e.g.
2791 * bpf; use busy-wait for the RNDIS filter update.
2795 hn_rxfilter_config(sc);
2801 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2804 sc->hn_if_flags = ifp->if_flags;
2811 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2813 if (mask & IFCAP_TXCSUM) {
2814 ifp->if_capenable ^= IFCAP_TXCSUM;
2815 if (ifp->if_capenable & IFCAP_TXCSUM)
2816 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2818 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2820 if (mask & IFCAP_TXCSUM_IPV6) {
2821 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2822 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2823 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2825 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2828 /* TODO: flip RNDIS offload parameters for RXCSUM. */
2829 if (mask & IFCAP_RXCSUM)
2830 ifp->if_capenable ^= IFCAP_RXCSUM;
2832 /* We can't distinguish IPv6 packets from IPv4 packets on the RX path. */
2833 if (mask & IFCAP_RXCSUM_IPV6)
2834 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2837 if (mask & IFCAP_LRO)
2838 ifp->if_capenable ^= IFCAP_LRO;
2840 if (mask & IFCAP_TSO4) {
2841 ifp->if_capenable ^= IFCAP_TSO4;
2842 if (ifp->if_capenable & IFCAP_TSO4)
2843 ifp->if_hwassist |= CSUM_IP_TSO;
2845 ifp->if_hwassist &= ~CSUM_IP_TSO;
2847 if (mask & IFCAP_TSO6) {
2848 ifp->if_capenable ^= IFCAP_TSO6;
2849 if (ifp->if_capenable & IFCAP_TSO6)
2850 ifp->if_hwassist |= CSUM_IP6_TSO;
2852 ifp->if_hwassist &= ~CSUM_IP6_TSO;
2862 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2866 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2868 * Multicast handling uses a mutex; use busy-wait for the RNDIS filter update.
2872 hn_rxfilter_config(sc);
2881 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2885 error = ether_ioctl(ifp, cmd, data);
2892 hn_stop(struct hn_softc *sc, bool detaching)
2894 struct ifnet *ifp = sc->hn_ifp;
2899 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2900 ("synthetic parts were not attached"));
2902 /* Disable polling. */
2905 /* Clear RUNNING bit _before_ hn_suspend_data() */
2906 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2907 hn_suspend_data(sc);
2909 /* Clear OACTIVE bit. */
2910 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2911 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2912 sc->hn_tx_ring[i].hn_oactive = 0;
2915 * If the VF is active, make sure the filter is not 0, even if
2916 * the synthetic NIC is down.
2918 if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
2919 hn_rxfilter_config(sc);
2923 hn_init_locked(struct hn_softc *sc)
2925 struct ifnet *ifp = sc->hn_ifp;
2930 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2933 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2936 /* Configure RX filter */
2937 hn_rxfilter_config(sc);
2939 /* Clear OACTIVE bit. */
2940 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2941 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2942 sc->hn_tx_ring[i].hn_oactive = 0;
2944 /* Clear TX 'suspended' bit. */
2945 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2947 /* Everything is ready; unleash! */
2948 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2950 /* Re-enable polling if requested. */
2951 if (sc->hn_pollhz > 0)
2952 hn_polling(sc, sc->hn_pollhz);
2958 struct hn_softc *sc = xsc;
2965 #if __FreeBSD_version >= 1100099
2968 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2970 struct hn_softc *sc = arg1;
2971 unsigned int lenlim;
2974 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2975 error = sysctl_handle_int(oidp, &lenlim, 0, req);
2976 if (error || req->newptr == NULL)
2980 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2981 lenlim > TCP_LRO_LENGTH_MAX) {
2985 hn_set_lro_lenlim(sc, lenlim);
2992 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2994 struct hn_softc *sc = arg1;
2995 int ackcnt, error, i;
2998 * lro_ackcnt_lim is the append count limit;
2999 * add 1 to turn it into the aggregation limit.
3001 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
3002 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
3003 if (error || req->newptr == NULL)
3006 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
3010 * Convert the aggregation limit back to the append count limit.
3015 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
3016 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
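/*
 * Editor's usage sketch: through the "lro_ackcnt_lim" sysctl
 * created below, e.g.
 *   # sysctl dev.hn.0.lro_ackcnt_lim=3
 * LRO aggregates up to 3 pure ACKs, which is stored internally as
 * an append count limit of 2.
 */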
3024 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
3026 struct hn_softc *sc = arg1;
3031 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
3034 error = sysctl_handle_int(oidp, &on, 0, req);
3035 if (error || req->newptr == NULL)
3039 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3040 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3043 rxr->hn_trust_hcsum |= hcsum;
3045 rxr->hn_trust_hcsum &= ~hcsum;
3052 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
3054 struct hn_softc *sc = arg1;
3055 int chim_size, error;
3057 chim_size = sc->hn_tx_ring[0].hn_chim_size;
3058 error = sysctl_handle_int(oidp, &chim_size, 0, req);
3059 if (error || req->newptr == NULL)
3062 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
3066 hn_set_chim_size(sc, chim_size);
3071 #if __FreeBSD_version < 1100095
3073 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
3075 struct hn_softc *sc = arg1;
3076 int ofs = arg2, i, error;
3077 struct hn_rx_ring *rxr;
3081 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3082 rxr = &sc->hn_rx_ring[i];
3083 stat += *((int *)((uint8_t *)rxr + ofs));
3086 error = sysctl_handle_64(oidp, &stat, 0, req);
3087 if (error || req->newptr == NULL)
3090 /* Zero out this stat. */
3091 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3092 rxr = &sc->hn_rx_ring[i];
3093 *((int *)((uint8_t *)rxr + ofs)) = 0;
3099 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
3101 struct hn_softc *sc = arg1;
3102 int ofs = arg2, i, error;
3103 struct hn_rx_ring *rxr;
3107 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3108 rxr = &sc->hn_rx_ring[i];
3109 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
3112 error = sysctl_handle_64(oidp, &stat, 0, req);
3113 if (error || req->newptr == NULL)
3116 /* Zero out this stat. */
3117 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3118 rxr = &sc->hn_rx_ring[i];
3119 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
3127 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3129 struct hn_softc *sc = arg1;
3130 int ofs = arg2, i, error;
3131 struct hn_rx_ring *rxr;
3135 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3136 rxr = &sc->hn_rx_ring[i];
3137 stat += *((u_long *)((uint8_t *)rxr + ofs));
3140 error = sysctl_handle_long(oidp, &stat, 0, req);
3141 if (error || req->newptr == NULL)
3144 /* Zero out this stat. */
3145 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3146 rxr = &sc->hn_rx_ring[i];
3147 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
3153 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
3155 struct hn_softc *sc = arg1;
3156 int ofs = arg2, i, error;
3157 struct hn_tx_ring *txr;
3161 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3162 txr = &sc->hn_tx_ring[i];
3163 stat += *((u_long *)((uint8_t *)txr + ofs));
3166 error = sysctl_handle_long(oidp, &stat, 0, req);
3167 if (error || req->newptr == NULL)
3170 /* Zero out this stat. */
3171 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3172 txr = &sc->hn_tx_ring[i];
3173 *((u_long *)((uint8_t *)txr + ofs)) = 0;
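/*
 * Editor's usage sketch: these stat handlers aggregate the per-ring
 * counters on read and zero them on any write, e.g. (sysctl names
 * from the trees created in hn_create_tx_data() below):
 *   # sysctl dev.hn.0.send_failed	<- read the aggregated counter
 *   # sysctl dev.hn.0.send_failed=0	<- any write zeroes it
 */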
3179 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
3181 struct hn_softc *sc = arg1;
3182 int ofs = arg2, i, error, conf;
3183 struct hn_tx_ring *txr;
3185 txr = &sc->hn_tx_ring[0];
3186 conf = *((int *)((uint8_t *)txr + ofs));
3188 error = sysctl_handle_int(oidp, &conf, 0, req);
3189 if (error || req->newptr == NULL)
3193 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3194 txr = &sc->hn_tx_ring[i];
3195 *((int *)((uint8_t *)txr + ofs)) = conf;
3203 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
3205 struct hn_softc *sc = arg1;
3208 size = sc->hn_agg_size;
3209 error = sysctl_handle_int(oidp, &size, 0, req);
3210 if (error || req->newptr == NULL)
3214 sc->hn_agg_size = size;
3222 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
3224 struct hn_softc *sc = arg1;
3227 pkts = sc->hn_agg_pkts;
3228 error = sysctl_handle_int(oidp, &pkts, 0, req);
3229 if (error || req->newptr == NULL)
3233 sc->hn_agg_pkts = pkts;
3241 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
3243 struct hn_softc *sc = arg1;
3246 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
3247 return (sysctl_handle_int(oidp, &pkts, 0, req));
3251 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
3253 struct hn_softc *sc = arg1;
3256 align = sc->hn_tx_ring[0].hn_agg_align;
3257 return (sysctl_handle_int(oidp, &align, 0, req));
3261 hn_chan_polling(struct vmbus_channel *chan, u_int pollhz)
3264 vmbus_chan_poll_disable(chan);
3266 vmbus_chan_poll_enable(chan, pollhz);
3270 hn_polling(struct hn_softc *sc, u_int pollhz)
3272 int nsubch = sc->hn_rx_ring_inuse - 1;
3277 struct vmbus_channel **subch;
3280 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
3281 for (i = 0; i < nsubch; ++i)
3282 hn_chan_polling(subch[i], pollhz);
3283 vmbus_subchan_rel(subch, nsubch);
3285 hn_chan_polling(sc->hn_prichan, pollhz);
3289 hn_polling_sysctl(SYSCTL_HANDLER_ARGS)
3291 struct hn_softc *sc = arg1;
3294 pollhz = sc->hn_pollhz;
3295 error = sysctl_handle_int(oidp, &pollhz, 0, req);
3296 if (error || req->newptr == NULL)
3300 (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX))
3304 if (sc->hn_pollhz != pollhz) {
3305 sc->hn_pollhz = pollhz;
3306 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) &&
3307 (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED))
3308 hn_polling(sc, sc->hn_pollhz);
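/*
 * Editor's note (assuming this handler is wired to a dev.hn.<unit>
 * node such as "polling"): writing 0 disables channel polling, a
 * value within [VMBUS_CHAN_POLLHZ_MIN, VMBUS_CHAN_POLLHZ_MAX]
 * enables it at that frequency, and the new setting takes effect
 * immediately only while the interface is running with its
 * synthetic parts attached.
 */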
3316 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
3318 struct hn_softc *sc = arg1;
3321 snprintf(verstr, sizeof(verstr), "%u.%u",
3322 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
3323 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
3324 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
3328 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
3330 struct hn_softc *sc = arg1;
3337 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
3338 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
3342 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
3344 struct hn_softc *sc = arg1;
3345 char assist_str[128];
3349 hwassist = sc->hn_ifp->if_hwassist;
3351 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
3352 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
3356 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
3358 struct hn_softc *sc = arg1;
3359 char filter_str[128];
3363 filter = sc->hn_rx_filter;
3365 snprintf(filter_str, sizeof(filter_str), "%b", filter,
3367 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
3373 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
3375 struct hn_softc *sc = arg1;
3380 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3381 if (error || req->newptr == NULL)
3384 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
3387 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
3389 if (sc->hn_rx_ring_inuse > 1) {
3390 error = hn_rss_reconfig(sc);
3392 /* Not RSS capable, at least for now; just save the RSS key. */
3401 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
3403 struct hn_softc *sc = arg1;
3408 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3409 if (error || req->newptr == NULL)
3413 * Don't allow RSS indirect table changes if this interface is not
3414 * currently RSS capable.
3416 if (sc->hn_rx_ring_inuse == 1) {
3421 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
3424 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
3426 hn_rss_ind_fixup(sc);
3427 error = hn_rss_reconfig(sc);
3436 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
3438 struct hn_softc *sc = arg1;
3443 hash = sc->hn_rss_hash;
3445 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
3446 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
3450 hn_vf_sysctl(SYSCTL_HANDLER_ARGS)
3452 struct hn_softc *sc = arg1;
3453 char vf_name[IFNAMSIZ + 1];
3454 struct ifnet *vf_ifp;
3458 vf_ifp = sc->hn_vf_ifp;
3460 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
3462 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
3466 hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS)
3468 struct hn_softc *sc = arg1;
3469 char vf_name[IFNAMSIZ + 1];
3470 struct ifnet *vf_ifp;
3474 vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp;
3476 snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname);
3478 return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req);
3482 hn_vflist_sysctl(SYSCTL_HANDLER_ARGS)
3484 struct rm_priotracker pt;
3489 error = sysctl_wire_old_buffer(req, 0);
3493 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
3497 rm_rlock(&hn_vfmap_lock, &pt);
3500 for (i = 0; i < hn_vfmap_size; ++i) {
3503 if (hn_vfmap[i] == NULL)
3506 ifp = ifnet_byindex(i);
3509 sbuf_printf(sb, "%s", ifp->if_xname);
3511 sbuf_printf(sb, " %s", ifp->if_xname);
3516 rm_runlock(&hn_vfmap_lock, &pt);
3518 error = sbuf_finish(sb);
3524 hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
3526 struct rm_priotracker pt;
3531 error = sysctl_wire_old_buffer(req, 0);
3535 sb = sbuf_new_for_sysctl(NULL, NULL, 128, req);
3539 rm_rlock(&hn_vfmap_lock, &pt);
3542 for (i = 0; i < hn_vfmap_size; ++i) {
3543 struct ifnet *ifp, *hn_ifp;
3545 hn_ifp = hn_vfmap[i];
3549 ifp = ifnet_byindex(i);
3552 sbuf_printf(sb, "%s:%s", ifp->if_xname,
3555 sbuf_printf(sb, " %s:%s", ifp->if_xname,
3562 rm_runlock(&hn_vfmap_lock, &pt);
3564 error = sbuf_finish(sb);
3570 hn_check_iplen(const struct mbuf *m, int hoff)
3572 const struct ip *ip;
3573 int len, iphlen, iplen;
3574 const struct tcphdr *th;
3575 int thoff; /* TCP data offset */
3577 len = hoff + sizeof(struct ip);
3579 /* The packet must be at least the size of an IP header. */
3580 if (m->m_pkthdr.len < len)
3581 return IPPROTO_DONE;
3583 /* The fixed IP header must reside completely in the first mbuf. */
3585 return IPPROTO_DONE;
3587 ip = mtodo(m, hoff);
3589 /* Bound check the packet's stated IP header length. */
3590 iphlen = ip->ip_hl << 2;
3591 if (iphlen < sizeof(struct ip)) /* minimum header length */
3592 return IPPROTO_DONE;
3594 * The full IP header must reside completely in the first mbuf.
3595 if (m->m_len < hoff + iphlen)
3596 return IPPROTO_DONE;
3598 iplen = ntohs(ip->ip_len);
3601 * Check that the amount of data in the buffers is at
3602 * least as much as the IP header would have us expect.
3604 if (m->m_pkthdr.len < hoff + iplen)
3605 return IPPROTO_DONE;
3608 * Ignore IP fragments.
3610 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3611 return IPPROTO_DONE;
3614 * The TCP/IP or UDP/IP header must be entirely contained within
3615 * the first fragment of a packet.
3619 if (iplen < iphlen + sizeof(struct tcphdr))
3620 return IPPROTO_DONE;
3621 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3622 return IPPROTO_DONE;
3623 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3624 thoff = th->th_off << 2;
3625 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3626 return IPPROTO_DONE;
3627 if (m->m_len < hoff + iphlen + thoff)
3628 return IPPROTO_DONE;
3631 if (iplen < iphlen + sizeof(struct udphdr))
3632 return IPPROTO_DONE;
3633 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3634 return IPPROTO_DONE;
3638 return IPPROTO_DONE;
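/*
 * Editor's usage sketch (hypothetical call site; m is a received
 * packet): only complete, unfragmented TCP/IP or UDP/IP packets
 * make it past this check, so the host-side checksum verification
 * may be trusted for them.
 */
#if 0
int pr = hn_check_iplen(m, ETHER_HDR_LEN);
if (pr == IPPROTO_TCP || pr == IPPROTO_UDP)
	; /* safe to apply the hn_trust_hcsum policy */
#endif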
3645 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3647 struct sysctl_oid_list *child;
3648 struct sysctl_ctx_list *ctx;
3649 device_t dev = sc->hn_dev;
3650 #if defined(INET) || defined(INET6)
3651 #if __FreeBSD_version >= 1100095
3658 * Create RXBUF for reception.
3661 * - It is shared by all channels.
3662 * - A large enough buffer is allocated; certain versions of the NVS
3663 * may further limit the usable space.
3665 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3666 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3667 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3668 if (sc->hn_rxbuf == NULL) {
3669 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3673 sc->hn_rx_ring_cnt = ring_cnt;
3674 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3676 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3677 M_DEVBUF, M_WAITOK | M_ZERO);
3679 #if defined(INET) || defined(INET6)
3680 #if __FreeBSD_version >= 1100095
3681 lroent_cnt = hn_lro_entry_count;
3682 if (lroent_cnt < TCP_LRO_ENTRIES)
3683 lroent_cnt = TCP_LRO_ENTRIES;
3685 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3687 #endif /* INET || INET6 */
3689 ctx = device_get_sysctl_ctx(dev);
3690 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3692 /* Create dev.hn.UNIT.rx sysctl tree */
3693 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3694 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3696 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3697 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3699 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3700 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3701 &rxr->hn_br_dma, BUS_DMA_WAITOK);
3702 if (rxr->hn_br == NULL) {
3703 device_printf(dev, "allocate bufring failed\n");
3707 if (hn_trust_hosttcp)
3708 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3709 if (hn_trust_hostudp)
3710 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3711 if (hn_trust_hostip)
3712 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3713 rxr->hn_ifp = sc->hn_ifp;
3714 if (i < sc->hn_tx_ring_cnt)
3715 rxr->hn_txr = &sc->hn_tx_ring[i];
3716 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3717 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3719 rxr->hn_rxbuf = sc->hn_rxbuf;
3724 #if defined(INET) || defined(INET6)
3725 #if __FreeBSD_version >= 1100095
3726 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3727 hn_lro_mbufq_depth);
3729 tcp_lro_init(&rxr->hn_lro);
3730 rxr->hn_lro.ifp = sc->hn_ifp;
3732 #if __FreeBSD_version >= 1100099
3733 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3734 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3736 #endif /* INET || INET6 */
3738 if (sc->hn_rx_sysctl_tree != NULL) {
3742 * Create per RX ring sysctl tree:
3743 * dev.hn.UNIT.rx.RINGID
3745 snprintf(name, sizeof(name), "%d", i);
3746 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3747 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3748 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3750 if (rxr->hn_rx_sysctl_tree != NULL) {
3751 SYSCTL_ADD_ULONG(ctx,
3752 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3753 OID_AUTO, "packets", CTLFLAG_RW,
3754 &rxr->hn_pkts, "# of packets received");
3755 SYSCTL_ADD_ULONG(ctx,
3756 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3757 OID_AUTO, "rss_pkts", CTLFLAG_RW,
3759 "# of packets w/ RSS info received");
3761 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3762 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3763 &rxr->hn_pktbuf_len, 0,
3764 "Temporary channel packet buffer length");
3769 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3770 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3771 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3772 #if __FreeBSD_version < 1100095
3773 hn_rx_stat_int_sysctl,
3775 hn_rx_stat_u64_sysctl,
3777 "LU", "LRO queued");
3778 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3779 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3780 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3781 #if __FreeBSD_version < 1100095
3782 hn_rx_stat_int_sysctl,
3784 hn_rx_stat_u64_sysctl,
3786 "LU", "LRO flushed");
3787 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3788 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3789 __offsetof(struct hn_rx_ring, hn_lro_tried),
3790 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3791 #if __FreeBSD_version >= 1100099
3792 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3793 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3794 hn_lro_lenlim_sysctl, "IU",
3795 "Max # of data bytes to be aggregated by LRO");
3796 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3797 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3798 hn_lro_ackcnt_sysctl, "I",
3799 "Max # of ACKs to be aggregated by LRO");
3801 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3802 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3803 hn_trust_hcsum_sysctl, "I",
3804 "Trust tcp segement verification on host side, "
3805 "when csum info is missing");
3806 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3807 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3808 hn_trust_hcsum_sysctl, "I",
3809 "Trust udp datagram verification on host side, "
3810 "when csum info is missing");
3811 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3812 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3813 hn_trust_hcsum_sysctl, "I",
3814 "Trust ip packet verification on host side, "
3815 "when csum info is missing");
3816 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3817 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3818 __offsetof(struct hn_rx_ring, hn_csum_ip),
3819 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3820 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3821 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3822 __offsetof(struct hn_rx_ring, hn_csum_tcp),
3823 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3824 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3825 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3826 __offsetof(struct hn_rx_ring, hn_csum_udp),
3827 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3828 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3829 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3830 __offsetof(struct hn_rx_ring, hn_csum_trusted),
3831 hn_rx_stat_ulong_sysctl, "LU",
3832 "# of packets that we trust host's csum verification");
3833 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3834 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3835 __offsetof(struct hn_rx_ring, hn_small_pkts),
3836 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3837 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3838 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3839 __offsetof(struct hn_rx_ring, hn_ack_failed),
3840 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3841 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3842 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3843 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3844 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
3850 hn_destroy_rx_data(struct hn_softc *sc)
3854 if (sc->hn_rxbuf != NULL) {
3855 if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0)
3856 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3858 device_printf(sc->hn_dev, "RXBUF is referenced\n");
3859 sc->hn_rxbuf = NULL;
3862 if (sc->hn_rx_ring_cnt == 0)
3865 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3866 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3868 if (rxr->hn_br == NULL)
3870 if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) {
3871 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3873 device_printf(sc->hn_dev,
3874 "%dth channel bufring is referenced", i);
3878 #if defined(INET) || defined(INET6)
3879 tcp_lro_free(&rxr->hn_lro);
3881 free(rxr->hn_pktbuf, M_DEVBUF);
3883 free(sc->hn_rx_ring, M_DEVBUF);
3884 sc->hn_rx_ring = NULL;
3886 sc->hn_rx_ring_cnt = 0;
3887 sc->hn_rx_ring_inuse = 0;
3891 hn_tx_ring_create(struct hn_softc *sc, int id)
3893 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3894 device_t dev = sc->hn_dev;
3895 bus_dma_tag_t parent_dtag;
3899 txr->hn_tx_idx = id;
3901 #ifndef HN_USE_TXDESC_BUFRING
3902 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3904 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3906 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3907 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3908 M_DEVBUF, M_WAITOK | M_ZERO);
3909 #ifndef HN_USE_TXDESC_BUFRING
3910 SLIST_INIT(&txr->hn_txlist);
3912 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3913 M_WAITOK, &txr->hn_tx_lock);
3916 if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) {
3917 txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ(
3918 device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id));
3920 txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt];
3923 #ifdef HN_IFSTART_SUPPORT
3924 if (hn_use_if_start) {
3925 txr->hn_txeof = hn_start_txeof;
3926 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3927 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3933 txr->hn_txeof = hn_xmit_txeof;
3934 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3935 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3937 br_depth = hn_get_txswq_depth(txr);
3938 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3939 M_WAITOK, &txr->hn_tx_lock);
3942 txr->hn_direct_tx_size = hn_direct_tx_size;
3945 * Always schedule transmission instead of trying to do direct
3946 * transmission; this gives the best performance so far.
3948 txr->hn_sched_tx = 1;
3950 parent_dtag = bus_get_dma_tag(dev);
3952 /* DMA tag for RNDIS packet messages. */
3953 error = bus_dma_tag_create(parent_dtag, /* parent */
3954 HN_RNDIS_PKT_ALIGN, /* alignment */
3955 HN_RNDIS_PKT_BOUNDARY, /* boundary */
3956 BUS_SPACE_MAXADDR, /* lowaddr */
3957 BUS_SPACE_MAXADDR, /* highaddr */
3958 NULL, NULL, /* filter, filterarg */
3959 HN_RNDIS_PKT_LEN, /* maxsize */
3961 HN_RNDIS_PKT_LEN, /* maxsegsize */
3963 NULL, /* lockfunc */
3964 NULL, /* lockfuncarg */
3965 &txr->hn_tx_rndis_dtag);
3967 device_printf(dev, "failed to create rndis dmatag\n");
3971 /* DMA tag for data. */
3972 error = bus_dma_tag_create(parent_dtag, /* parent */
3974 HN_TX_DATA_BOUNDARY, /* boundary */
3975 BUS_SPACE_MAXADDR, /* lowaddr */
3976 BUS_SPACE_MAXADDR, /* highaddr */
3977 NULL, NULL, /* filter, filterarg */
3978 HN_TX_DATA_MAXSIZE, /* maxsize */
3979 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
3980 HN_TX_DATA_SEGSIZE, /* maxsegsize */
3982 NULL, /* lockfunc */
3983 NULL, /* lockfuncarg */
3984 &txr->hn_tx_data_dtag);
3986 device_printf(dev, "failed to create data dmatag\n");
3990 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3991 struct hn_txdesc *txd = &txr->hn_txdesc[i];
3994 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3995 STAILQ_INIT(&txd->agg_list);
3998 * Allocate and load RNDIS packet message.
4000 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
4001 (void **)&txd->rndis_pkt,
4002 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
4003 &txd->rndis_pkt_dmap);
4006 "failed to allocate rndis_packet_msg, %d\n", i);
4010 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
4011 txd->rndis_pkt_dmap,
4012 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
4013 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
4017 "failed to load rndis_packet_msg, %d\n", i);
4018 bus_dmamem_free(txr->hn_tx_rndis_dtag,
4019 txd->rndis_pkt, txd->rndis_pkt_dmap);
4023 /* DMA map for TX data. */
4024 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
4028 "failed to allocate tx data dmamap\n");
4029 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
4030 txd->rndis_pkt_dmap);
4031 bus_dmamem_free(txr->hn_tx_rndis_dtag,
4032 txd->rndis_pkt, txd->rndis_pkt_dmap);
4036 /* All set; put it on the list. */
4037 txd->flags |= HN_TXD_FLAG_ONLIST;
4038 #ifndef HN_USE_TXDESC_BUFRING
4039 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
4041 buf_ring_enqueue(txr->hn_txdesc_br, txd);
4044 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
4046 if (sc->hn_tx_sysctl_tree != NULL) {
4047 struct sysctl_oid_list *child;
4048 struct sysctl_ctx_list *ctx;
4052 * Create per TX ring sysctl tree:
4053 * dev.hn.UNIT.tx.RINGID
4055 ctx = device_get_sysctl_ctx(dev);
4056 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
4058 snprintf(name, sizeof(name), "%d", id);
4059 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
4060 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4062 if (txr->hn_tx_sysctl_tree != NULL) {
4063 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
4066 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
4067 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
4068 "# of available TX descs");
4070 #ifdef HN_IFSTART_SUPPORT
4071 if (!hn_use_if_start)
4074 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
4075 CTLFLAG_RD, &txr->hn_oactive, 0,
4078 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
4079 CTLFLAG_RW, &txr->hn_pkts,
4080 "# of packets transmitted");
4081 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
4082 CTLFLAG_RW, &txr->hn_sends, "# of sends");
4090 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
4092 struct hn_tx_ring *txr = txd->txr;
4094 KASSERT(txd->m == NULL, ("still has mbuf installed"));
4095 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
4097 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
4098 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
4099 txd->rndis_pkt_dmap);
4100 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
4104 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
4107 KASSERT(txd->refs == 0 || txd->refs == 1,
4108 ("invalid txd refs %d", txd->refs));
4110 /* Aggregated txds will be freed by their aggregating txd. */
4111 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
4114 freed = hn_txdesc_put(txr, txd);
4115 KASSERT(freed, ("can't free txdesc"));
4120 hn_tx_ring_destroy(struct hn_tx_ring *txr)
4124 if (txr->hn_txdesc == NULL)
4129 * Because the freeing of aggregated txds will be deferred
4130 * to the aggregating txd, two passes are used here:
4131 * - The first pass GCes any pending txds. This GC is necessary,
4132 * since if the channels are revoked, the hypervisor will not
4133 * deliver send-done for all pending txds.
4134 * - The second pass frees the busdma resources, i.e. after all txds have been freed.
4137 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
4138 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
4139 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
4140 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
4142 if (txr->hn_tx_data_dtag != NULL)
4143 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
4144 if (txr->hn_tx_rndis_dtag != NULL)
4145 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
4147 #ifdef HN_USE_TXDESC_BUFRING
4148 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
4151 free(txr->hn_txdesc, M_DEVBUF);
4152 txr->hn_txdesc = NULL;
4154 if (txr->hn_mbuf_br != NULL)
4155 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
4157 #ifndef HN_USE_TXDESC_BUFRING
4158 mtx_destroy(&txr->hn_txlist_spin);
4160 mtx_destroy(&txr->hn_tx_lock);
4164 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
4166 struct sysctl_oid_list *child;
4167 struct sysctl_ctx_list *ctx;
4171 * Create TXBUF for chimney sending.
4173 * NOTE: It is shared by all channels.
4175 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
4176 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
4177 BUS_DMA_WAITOK | BUS_DMA_ZERO);
4178 if (sc->hn_chim == NULL) {
4179 device_printf(sc->hn_dev, "allocate txbuf failed\n");
4183 sc->hn_tx_ring_cnt = ring_cnt;
4184 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4186 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
4187 M_DEVBUF, M_WAITOK | M_ZERO);
4189 ctx = device_get_sysctl_ctx(sc->hn_dev);
4190 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
4192 /* Create dev.hn.UNIT.tx sysctl tree */
4193 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
4194 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
4196 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4199 error = hn_tx_ring_create(sc, i);
4204 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
4205 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4206 __offsetof(struct hn_tx_ring, hn_no_txdescs),
4207 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
4208 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
4209 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4210 __offsetof(struct hn_tx_ring, hn_send_failed),
4211 hn_tx_stat_ulong_sysctl, "LU", "# of Hyper-V send failures");
4212 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
4213 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4214 __offsetof(struct hn_tx_ring, hn_txdma_failed),
4215 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
4216 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
4217 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4218 __offsetof(struct hn_tx_ring, hn_flush_failed),
4219 hn_tx_stat_ulong_sysctl, "LU",
4220 "# of packet transmission aggregation flush failure");
4221 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
4222 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4223 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
4224 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbufs collapsed");
4225 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
4226 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4227 __offsetof(struct hn_tx_ring, hn_tx_chimney),
4228 hn_tx_stat_ulong_sysctl, "LU", "# of chimney sends");
4229 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
4230 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4231 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
4232 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
4233 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
4234 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
4235 "# of total TX descs");
4236 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
4237 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
4238 "Chimney send packet size upper boundary");
4239 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
4240 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
4241 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
4242 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
4243 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4244 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
4245 hn_tx_conf_int_sysctl, "I",
4246 "Size of the packet for direct transmission");
4247 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
4248 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
4249 __offsetof(struct hn_tx_ring, hn_sched_tx),
4250 hn_tx_conf_int_sysctl, "I",
4251 "Always schedule transmission "
4252 "instead of doing direct transmission");
4253 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
4254 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
4255 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
4256 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
4257 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
4258 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
4259 "Applied packet transmission aggregation size");
4260 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
4261 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
4262 hn_txagg_pktmax_sysctl, "I",
4263 "Applied packet transmission aggregation packets");
4264 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
4265 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
4266 hn_txagg_align_sysctl, "I",
4267 "Applied packet transmission aggregation alignment");
4273 hn_set_chim_size(struct hn_softc *sc, int chim_size)
4277 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4278 sc->hn_tx_ring[i].hn_chim_size = chim_size;
4282 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
4284 struct ifnet *ifp = sc->hn_ifp;
4287 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
4290 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
4291 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
4292 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
4294 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
4295 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
4296 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
4298 if (tso_maxlen < tso_minlen)
4299 tso_maxlen = tso_minlen;
4300 else if (tso_maxlen > IP_MAXPACKET)
4301 tso_maxlen = IP_MAXPACKET;
4302 if (tso_maxlen > sc->hn_ndis_tso_szmax)
4303 tso_maxlen = sc->hn_ndis_tso_szmax;
4304 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
4306 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
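/*
 * Editor's worked example (hypothetical values): with mtu = 1500
 * and hn_ndis_tso_sgmin = 2, tso_minlen = 3000; if tso_maxlen =
 * 65535 and hn_ndis_tso_szmax = 62780, the clamps above yield
 * tso_maxlen = 62780, so if_hw_tsomax becomes
 * 62780 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN) = 62762.
 */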
4310 hn_fixup_tx_data(struct hn_softc *sc)
4312 uint64_t csum_assist;
4315 hn_set_chim_size(sc, sc->hn_chim_szmax);
4316 if (hn_tx_chimney_size > 0 &&
4317 hn_tx_chimney_size < sc->hn_chim_szmax)
4318 hn_set_chim_size(sc, hn_tx_chimney_size);
4321 if (sc->hn_caps & HN_CAP_IPCS)
4322 csum_assist |= CSUM_IP;
4323 if (sc->hn_caps & HN_CAP_TCP4CS)
4324 csum_assist |= CSUM_IP_TCP;
4325 if (sc->hn_caps & HN_CAP_UDP4CS)
4326 csum_assist |= CSUM_IP_UDP;
4327 if (sc->hn_caps & HN_CAP_TCP6CS)
4328 csum_assist |= CSUM_IP6_TCP;
4329 if (sc->hn_caps & HN_CAP_UDP6CS)
4330 csum_assist |= CSUM_IP6_UDP;
4331 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4332 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
4334 if (sc->hn_caps & HN_CAP_HASHVAL) {
4336 * Support HASHVAL pktinfo on TX path.
4339 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
4340 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4341 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
4346 hn_destroy_tx_data(struct hn_softc *sc)
4350 if (sc->hn_chim != NULL) {
4351 if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) {
4352 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
4354 device_printf(sc->hn_dev,
4355 "chimney sending buffer is referenced");
4360 if (sc->hn_tx_ring_cnt == 0)
4363 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
4364 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
4366 free(sc->hn_tx_ring, M_DEVBUF);
4367 sc->hn_tx_ring = NULL;
4369 sc->hn_tx_ring_cnt = 0;
4370 sc->hn_tx_ring_inuse = 0;
4373 #ifdef HN_IFSTART_SUPPORT
4376 hn_start_taskfunc(void *xtxr, int pending __unused)
4378 struct hn_tx_ring *txr = xtxr;
4380 mtx_lock(&txr->hn_tx_lock);
4381 hn_start_locked(txr, 0);
4382 mtx_unlock(&txr->hn_tx_lock);
4386 hn_start_locked(struct hn_tx_ring *txr, int len)
4388 struct hn_softc *sc = txr->hn_sc;
4389 struct ifnet *ifp = sc->hn_ifp;
4392 KASSERT(hn_use_if_start,
4393 ("hn_start_locked is called, when if_start is disabled"));
4394 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4395 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4396 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4398 if (__predict_false(txr->hn_suspended))
4401 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
4405 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
4406 struct hn_txdesc *txd;
4407 struct mbuf *m_head;
4410 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
4414 if (len > 0 && m_head->m_pkthdr.len > len) {
4416 * This sending could be time-consuming; let callers
4417 * dispatch this packet sending (and the sending of any
4418 * follow-up packets) to the tx taskqueue.
4420 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4425 #if defined(INET6) || defined(INET)
4426 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
4427 m_head = hn_tso_fixup(m_head);
4428 if (__predict_false(m_head == NULL)) {
4429 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4435 txd = hn_txdesc_get(txr);
4437 txr->hn_no_txdescs++;
4438 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4439 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4443 error = hn_encap(ifp, txr, txd, &m_head);
4445 /* Both txd and m_head are freed */
4446 KASSERT(txr->hn_agg_txd == NULL,
4447 ("encap failed w/ pending aggregating txdesc"));
4451 if (txr->hn_agg_pktleft == 0) {
4452 if (txr->hn_agg_txd != NULL) {
4453 KASSERT(m_head == NULL,
4454 ("pending mbuf for aggregating txdesc"));
4455 error = hn_flush_txagg(ifp, txr);
4456 if (__predict_false(error)) {
4457 atomic_set_int(&ifp->if_drv_flags,
4462 KASSERT(m_head != NULL, ("mbuf was freed"));
4463 error = hn_txpkt(ifp, txr, txd);
4464 if (__predict_false(error)) {
4465 /* txd is freed, but m_head is not */
4466 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
4467 atomic_set_int(&ifp->if_drv_flags,
4475 KASSERT(txr->hn_agg_txd != NULL,
4476 ("no aggregating txdesc"));
4477 KASSERT(m_head == NULL,
4478 ("pending mbuf for aggregating txdesc"));
4483 /* Flush pending aggregated transmission. */
4484 if (txr->hn_agg_txd != NULL)
4485 hn_flush_txagg(ifp, txr);
4490 hn_start(struct ifnet *ifp)
4492 struct hn_softc *sc = ifp->if_softc;
4493 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
4495 if (txr->hn_sched_tx)
4498 if (mtx_trylock(&txr->hn_tx_lock)) {
4501 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4502 mtx_unlock(&txr->hn_tx_lock);
4507 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
4511 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
4513 struct hn_tx_ring *txr = xtxr;
4515 mtx_lock(&txr->hn_tx_lock);
4516 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
4517 hn_start_locked(txr, 0);
4518 mtx_unlock(&txr->hn_tx_lock);
4522 hn_start_txeof(struct hn_tx_ring *txr)
4524 struct hn_softc *sc = txr->hn_sc;
4525 struct ifnet *ifp = sc->hn_ifp;
4527 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
4529 if (txr->hn_sched_tx)
4532 if (mtx_trylock(&txr->hn_tx_lock)) {
4535 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4536 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
4537 mtx_unlock(&txr->hn_tx_lock);
4539 taskqueue_enqueue(txr->hn_tx_taskq,
4545 * Release the OACTIVE bit earlier, in the hope that
4546 * others can catch up. The task will clear the
4547 * flag again with the hn_tx_lock held, to avoid possible races.
4550 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
4551 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4555 #endif /* HN_IFSTART_SUPPORT */
4558 hn_xmit(struct hn_tx_ring *txr, int len)
4560 struct hn_softc *sc = txr->hn_sc;
4561 struct ifnet *ifp = sc->hn_ifp;
4562 struct mbuf *m_head;
4565 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
4566 #ifdef HN_IFSTART_SUPPORT
4567 KASSERT(hn_use_if_start == 0,
4568 ("hn_xmit is called, when if_start is enabled"));
4570 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
4572 if (__predict_false(txr->hn_suspended))
4575 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
4578 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
4579 struct hn_txdesc *txd;
4582 if (len > 0 && m_head->m_pkthdr.len > len) {
4584 * This sending could be time-consuming; let callers
4585 * dispatch this packet sending (and the sending of any
4586 * follow-up packets) to the tx taskqueue.
4588 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4593 txd = hn_txdesc_get(txr);
4595 txr->hn_no_txdescs++;
4596 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4597 txr->hn_oactive = 1;
4601 error = hn_encap(ifp, txr, txd, &m_head);
4603 /* Both txd and m_head are freed; discard */
4604 KASSERT(txr->hn_agg_txd == NULL,
4605 ("encap failed w/ pending aggregating txdesc"));
4606 drbr_advance(ifp, txr->hn_mbuf_br);
4610 if (txr->hn_agg_pktleft == 0) {
4611 if (txr->hn_agg_txd != NULL) {
4612 KASSERT(m_head == NULL,
4613 ("pending mbuf for aggregating txdesc"));
4614 error = hn_flush_txagg(ifp, txr);
4615 if (__predict_false(error)) {
4616 txr->hn_oactive = 1;
4620 KASSERT(m_head != NULL, ("mbuf was freed"));
4621 error = hn_txpkt(ifp, txr, txd);
4622 if (__predict_false(error)) {
4623 /* txd is freed, but m_head is not */
4624 drbr_putback(ifp, txr->hn_mbuf_br,
4626 txr->hn_oactive = 1;
4633 KASSERT(txr->hn_agg_txd != NULL,
4634 ("no aggregating txdesc"));
4635 KASSERT(m_head == NULL,
4636 ("pending mbuf for aggregating txdesc"));
4641 drbr_advance(ifp, txr->hn_mbuf_br);
4644 /* Flush pending aggregated transmission. */
4645 if (txr->hn_agg_txd != NULL)
4646 hn_flush_txagg(ifp, txr);
4651 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4653 struct hn_softc *sc = ifp->if_softc;
4654 struct hn_tx_ring *txr;
4657 #if defined(INET6) || defined(INET)
4659 * Perform TSO packet header fixup now, since the TSO
4660 * packet header should be cache-hot.
4662 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4663 m = hn_tso_fixup(m);
4664 if (__predict_false(m == NULL)) {
4665 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4672 * Select the TX ring based on flowid
4674 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
4678 if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
4680 idx = bid % sc->hn_tx_ring_inuse;
4684 #if defined(INET6) || defined(INET)
4687 if (m->m_pkthdr.len < 128 &&
4688 (m->m_pkthdr.csum_flags &
4689 (CSUM_IP_TCP | CSUM_IP6_TCP)) &&
4690 (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) {
4691 m = hn_check_tcpsyn(m, &tcpsyn);
4692 if (__predict_false(m == NULL)) {
4694 IFCOUNTER_OERRORS, 1);
4699 const int tcpsyn = 0;
4704 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4707 txr = &sc->hn_tx_ring[idx];
4709 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4711 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4715 if (txr->hn_oactive)
4718 if (txr->hn_sched_tx)
4721 if (mtx_trylock(&txr->hn_tx_lock)) {
4724 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4725 mtx_unlock(&txr->hn_tx_lock);
4730 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
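/* Free all mbufs queued on this TX ring's buf_ring. */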
4735 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4739 mtx_lock(&txr->hn_tx_lock);
4740 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4742 mtx_unlock(&txr->hn_tx_lock);
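/* if_qflush method: flush the buf_ring of each TX ring in use. */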
4746 hn_xmit_qflush(struct ifnet *ifp)
4748 struct hn_softc *sc = ifp->if_softc;
4751 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4752 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
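/*
 * TX completion: clear this ring's oactive flag and restart
 * transmission, deferring to the txeof task when the TX lock is
 * contended.
 */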
4757 hn_xmit_txeof(struct hn_tx_ring *txr)
4760 if (txr->hn_sched_tx)
4763 if (mtx_trylock(&txr->hn_tx_lock)) {
4766 txr->hn_oactive = 0;
4767 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4768 mtx_unlock(&txr->hn_tx_lock);
4770 taskqueue_enqueue(txr->hn_tx_taskq,
4776 * Release oactive earlier, in the hope that
4777 * others can catch up. The task will clear
4778 * oactive again with the hn_tx_lock held to avoid possible
4781 txr->hn_oactive = 0;
4782 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
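/* TX taskqueue handler: transmit everything queued on the TX ring. */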
4787 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4789 struct hn_tx_ring *txr = xtxr;
4791 mtx_lock(&txr->hn_tx_lock);
4793 mtx_unlock(&txr->hn_tx_lock);
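/* TX-done taskqueue handler: clear oactive and resume transmission. */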
4797 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4799 struct hn_tx_ring *txr = xtxr;
4801 mtx_lock(&txr->hn_tx_lock);
4802 txr->hn_oactive = 0;
4804 mtx_unlock(&txr->hn_tx_lock);
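/*
 * Link a VMBus channel to its RX/TX ring, bind it to the proper CPU,
 * and open it with the ring's bufring.
 */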
4808 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4810 struct vmbus_chan_br cbr;
4811 struct hn_rx_ring *rxr;
4812 struct hn_tx_ring *txr = NULL;
4815 idx = vmbus_chan_subidx(chan);
4818 * Link this channel to RX/TX ring.
4820 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4821 ("invalid channel index %d, should > 0 && < %d",
4822 idx, sc->hn_rx_ring_inuse));
4823 rxr = &sc->hn_rx_ring[idx];
4824 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4825 ("RX ring %d already attached", idx));
4826 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4827 rxr->hn_chan = chan;
4830 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4831 idx, vmbus_chan_id(chan));
4834 if (idx < sc->hn_tx_ring_inuse) {
4835 txr = &sc->hn_tx_ring[idx];
4836 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4837 ("TX ring %d already attached", idx));
4838 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4840 txr->hn_chan = chan;
4842 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4843 idx, vmbus_chan_id(chan));
4847 /* Bind this channel to a proper CPU. */
4848 vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx));
4853 cbr.cbr = rxr->hn_br;
4854 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4855 cbr.cbr_txsz = HN_TXBR_SIZE;
4856 cbr.cbr_rxsz = HN_RXBR_SIZE;
4857 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4859 if (error == EISCONN) {
4860 if_printf(sc->hn_ifp, "bufring is connected after "
4861 "chan%u open failure\n", vmbus_chan_id(chan));
4862 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4864 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4865 vmbus_chan_id(chan), error);
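/* Unlink a VMBus channel from its RX/TX ring and close the channel. */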
4872 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4874 struct hn_rx_ring *rxr;
4877 idx = vmbus_chan_subidx(chan);
4880 * Unlink this channel from the RX/TX ring.
4882 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4883 ("invalid channel index %d, should > 0 && < %d",
4884 idx, sc->hn_rx_ring_inuse));
4885 rxr = &sc->hn_rx_ring[idx];
4886 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4887 ("RX ring %d is not attached", idx));
4888 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4890 if (idx < sc->hn_tx_ring_inuse) {
4891 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4893 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4894 ("TX ring %d is not attached attached", idx));
4895 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4899 * Close this channel.
4902 * Channel closing does _not_ destroy the target channel.
4904 error = vmbus_chan_close_direct(chan);
4905 if (error == EISCONN) {
4906 if_printf(sc->hn_ifp, "chan%u bufring is connected "
4907 "after being closed\n", vmbus_chan_id(chan));
4908 rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF;
4910 if_printf(sc->hn_ifp, "chan%u close failed: %d\n",
4911 vmbus_chan_id(chan), error);
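/*
 * Attach all allocated sub-channels; the primary channel must have
 * been attached already.
 */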
4916 hn_attach_subchans(struct hn_softc *sc)
4918 struct vmbus_channel **subchans;
4919 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4922 KASSERT(subchan_cnt > 0, ("no sub-channels"));
4924 /* Attach the sub-channels. */
4925 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4926 for (i = 0; i < subchan_cnt; ++i) {
4929 error1 = hn_chan_attach(sc, subchans[i]);
4932 /* Move on; all channels will be detached later. */
4935 vmbus_subchan_rel(subchans, subchan_cnt);
4938 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4941 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
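/* Detach all sub-channels first, then the primary channel. */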
4949 hn_detach_allchans(struct hn_softc *sc)
4951 struct vmbus_channel **subchans;
4952 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4955 if (subchan_cnt == 0)
4958 /* Detach the sub-channels. */
4959 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4960 for (i = 0; i < subchan_cnt; ++i)
4961 hn_chan_detach(sc, subchans[i]);
4962 vmbus_subchan_rel(subchans, subchan_cnt);
4966 * Detach the primary channel _after_ all sub-channels
4969 hn_chan_detach(sc, sc->hn_prichan);
4971 /* Wait for sub-channels to be destroyed, if any. */
4972 vmbus_subchan_drain(sc->hn_prichan);
4975 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4976 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4977 HN_RX_FLAG_ATTACHED) == 0,
4978 ("%dth RX ring is still attached", i));
4980 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4981 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4982 HN_TX_FLAG_ATTACHED) == 0,
4983 ("%dth TX ring is still attached", i));
4989 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4991 struct vmbus_channel **subchans;
4992 int nchan, rxr_cnt, error;
4994 nchan = *nsubch + 1;
4997 * Multiple RX/TX rings are not requested.
5004 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
5007 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
5009 /* No RSS; this is benign. */
5014 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
5018 if (nchan > rxr_cnt)
5021 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
5027 * Allocate sub-channels from NVS.
5029 *nsubch = nchan - 1;
5030 error = hn_nvs_alloc_subchans(sc, nsubch);
5031 if (error || *nsubch == 0) {
5032 /* Failed to allocate sub-channels. */
5038 * Wait for all sub-channels to become ready before moving on.
5040 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
5041 vmbus_subchan_rel(subchans, *nsubch);
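/*
 * Check whether the synthetic parts can be (re)attached: refuse if
 * fatal errors were recorded, or if any RX bufring is still
 * referenced by the hypervisor.
 */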
5046 hn_synth_attachable(const struct hn_softc *sc)
5050 if (sc->hn_flags & HN_FLAG_ERRORS)
5053 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
5054 const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
5056 if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF)
5063 * Make sure that the RX filter is zero after the successful
5064 * RNDIS initialization.
5067 * Under certain conditions on certain versions of Hyper-V,
5068 * the RNDIS rxfilter is _not_ zero on the hypervisor side
5069 * after the successful RNDIS initialization, which breaks
5070 * the assumptions of the following code (in fact, it breaks
5071 * the RNDIS API contract). Clear the RNDIS rxfilter
5072 * explicitly, drain packets sneaking through, and drain the
5073 * interrupt taskqueues scheduled due to the stealth packets.
5076 hn_rndis_init_fixat(struct hn_softc *sc, int nchan)
5080 hn_drain_rxtx(sc, nchan);
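/*
 * Attach the synthetic parts: the primary channel, NVS and RNDIS, in
 * that order; then allocate and attach sub-channels and configure
 * RSS.  On failure, everything attached so far is torn down.
 */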
5084 hn_synth_attach(struct hn_softc *sc, int mtu)
5086 #define ATTACHED_NVS 0x0002
5087 #define ATTACHED_RNDIS 0x0004
5089 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
5090 int error, nsubch, nchan = 1, i, rndis_inited;
5091 uint32_t old_caps, attached = 0;
5093 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
5094 ("synthetic parts were attached"));
5096 if (!hn_synth_attachable(sc))
5099 /* Save capabilities for later verification. */
5100 old_caps = sc->hn_caps;
5103 /* Clear RSS state. */
5104 sc->hn_rss_ind_size = 0;
5105 sc->hn_rss_hash = 0;
5108 * Attach the primary channel _before_ attaching NVS and RNDIS.
5110 error = hn_chan_attach(sc, sc->hn_prichan);
5117 error = hn_nvs_attach(sc, mtu);
5120 attached |= ATTACHED_NVS;
5123 * Attach RNDIS _after_ NVS is attached.
5125 error = hn_rndis_attach(sc, mtu, &rndis_inited);
5127 attached |= ATTACHED_RNDIS;
5132 * Make sure capabilities are not changed.
5134 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
5135 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
5136 old_caps, sc->hn_caps);
5142 * Allocate sub-channels for multi-TX/RX rings.
5145 * The # of RX rings that can be used is equivalent to the # of
5146 * channels to be requested.
5148 nsubch = sc->hn_rx_ring_cnt - 1;
5149 error = hn_synth_alloc_subchans(sc, &nsubch);
5152 /* NOTE: _Full_ synthetic parts detach is required now. */
5153 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
5156 * Set the # of TX/RX rings that could be used according to
5157 * the # of channels that NVS offered.
5160 hn_set_ring_inuse(sc, nchan);
5162 /* Only the primary channel can be used; done */
5167 * Attach the sub-channels.
5169 * NOTE: hn_set_ring_inuse() _must_ have been called.
5171 error = hn_attach_subchans(sc);
5176 * Configure RSS key and indirect table _after_ all sub-channels
5179 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
5181 * RSS key is not set yet; set it to the default RSS key.
5184 if_printf(sc->hn_ifp, "setup default RSS key\n");
5186 rss_getkey(rss->rss_key);
5188 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
5190 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
5193 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
5195 * RSS indirect table is not set yet; set it up in round-
5199 if_printf(sc->hn_ifp, "setup default RSS indirect "
5202 for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
5206 subidx = rss_get_indirection_to_bucket(i);
5210 rss->rss_ind[i] = subidx % nchan;
5212 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
5215 * The # of usable channels may have changed, so we have to
5216 * make sure that all entries in the RSS indirect table
5219 * NOTE: hn_set_ring_inuse() _must_ have been called.
5221 hn_rss_ind_fixup(sc);
5224 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
5229 * Fix up the transmission aggregation setup.
5232 hn_rndis_init_fixat(sc, nchan);
5236 if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
5237 hn_rndis_init_fixat(sc, nchan);
5238 hn_synth_detach(sc);
5240 if (attached & ATTACHED_RNDIS) {
5241 hn_rndis_init_fixat(sc, nchan);
5242 hn_rndis_detach(sc);
5244 if (attached & ATTACHED_NVS)
5246 hn_chan_detach(sc, sc->hn_prichan);
5247 /* Restore old capabilities. */
5248 sc->hn_caps = old_caps;
5252 #undef ATTACHED_RNDIS
5258 * The interface must have been suspended through hn_suspend(), before
5259 * this function gets called.
5262 hn_synth_detach(struct hn_softc *sc)
5265 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
5266 ("synthetic parts were not attached"));
5268 /* Detach the RNDIS first. */
5269 hn_rndis_detach(sc);
5274 /* Detach all of the channels. */
5275 hn_detach_allchans(sc);
5277 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
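/*
 * Set the # of RX/TX rings to use; the TX ring count is further
 * capped by the # of TX rings allocated.
 */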
5281 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
5283 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
5284 ("invalid ring count %d", ring_cnt));
5286 if (sc->hn_tx_ring_cnt > ring_cnt)
5287 sc->hn_tx_ring_inuse = ring_cnt;
5289 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
5290 sc->hn_rx_ring_inuse = ring_cnt;
5293 if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) {
5294 if_printf(sc->hn_ifp, "# of RX rings (%d) does not match "
5295 "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse,
5296 rss_getnumbuckets());
5301 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
5302 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
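/*
 * Wait until the channel's RX bufring is empty (and, unless the
 * primary channel has been revoked, its TX bufring too), then drain
 * the channel's interrupt task.
 */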
5307 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
5312 * The TX bufring will not be drained by the hypervisor
5313 * if the primary channel is revoked.
5315 while (!vmbus_chan_rx_empty(chan) ||
5316 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
5317 !vmbus_chan_tx_empty(chan)))
5319 vmbus_chan_intr_drain(chan);
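/*
 * Turn off RX on the hypervisor side by programming an empty RX
 * filter, then give RNDIS time to flush in-flight data packets.
 */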
5323 hn_disable_rx(struct hn_softc *sc)
5327 * Disable RX by forcefully clearing the RX filter.
5329 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
5330 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */
5333 * Give RNDIS enough time to flush all pending data packets.
5335 pause("waitrx", (200 * hz) / 1000);
5340 * RX/TX _must_ have been suspended/disabled before this function
5344 hn_drain_rxtx(struct hn_softc *sc, int nchan)
5346 struct vmbus_channel **subch = NULL;
5350 * Drain RX/TX bufrings and interrupts.
5354 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
5356 if (subch != NULL) {
5359 for (i = 0; i < nsubch; ++i)
5360 hn_chan_drain(sc, subch[i]);
5362 hn_chan_drain(sc, sc->hn_prichan);
5365 vmbus_subchan_rel(subch, nsubch);
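/*
 * Suspend the data path: mark all TX rings suspended, wait for
 * in-flight sends to complete, drain the channels, then drain the
 * TX taskqueues.
 */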
5369 hn_suspend_data(struct hn_softc *sc)
5371 struct hn_tx_ring *txr;
5379 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5380 txr = &sc->hn_tx_ring[i];
5382 mtx_lock(&txr->hn_tx_lock);
5383 txr->hn_suspended = 1;
5384 mtx_unlock(&txr->hn_tx_lock);
5385 /* No one can send more packets now. */
5388 * Wait for all pending sends to finish.
5391 * We will _not_ receive all pending send-done completions if the
5392 * primary channel is revoked.
5394 while (hn_tx_ring_pending(txr) &&
5395 !vmbus_chan_is_revoked(sc->hn_prichan))
5396 pause("hnwtx", 1 /* 1 tick */);
5407 hn_drain_rxtx(sc, sc->hn_rx_ring_inuse);
5410 * Drain any pending TX tasks.
5413 * hn_drain_rxtx() above can dispatch TX tasks, so the TX
5414 * tasks have to be drained _after_ it.
5416 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5417 txr = &sc->hn_tx_ring[i];
5419 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
5420 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
5425 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
5428 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
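/*
 * Suspend management: detach hn_mgmt_taskq on the primary channel's
 * task thread, then drain all pending management tasks.
 */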
5432 hn_suspend_mgmt(struct hn_softc *sc)
5439 * Make sure that hn_mgmt_taskq0 can no longer be accessed
5440 * through hn_mgmt_taskq.
5442 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
5443 vmbus_chan_run_task(sc->hn_prichan, &task);
5446 * Make sure that all pending management tasks are completed.
5448 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
5449 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
5450 taskqueue_drain_all(sc->hn_mgmt_taskq0);
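/* Suspend the interface: disable polling, then suspend the data and
 * management paths. */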
5454 hn_suspend(struct hn_softc *sc)
5457 /* Disable polling. */
5460 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5461 (sc->hn_flags & HN_FLAG_RXVF))
5462 hn_suspend_data(sc);
5463 hn_suspend_mgmt(sc);
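/* Clear the suspend flag on the first 'tx_ring_cnt' TX rings. */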
5467 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
5471 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
5472 ("invalid TX ring count %d", tx_ring_cnt));
5474 for (i = 0; i < tx_ring_cnt; ++i) {
5475 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5477 mtx_lock(&txr->hn_tx_lock);
5478 txr->hn_suspended = 0;
5479 mtx_unlock(&txr->hn_tx_lock);
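/*
 * Resume the data path: reprogram the RX filter, clear the suspend
 * flag on all TX rings, and kick the txeof tasks to restart
 * transmission.
 */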
5484 hn_resume_data(struct hn_softc *sc)
5493 hn_rxfilter_config(sc);
5496 * Make sure to clear suspend status on "all" TX rings,
5497 * since hn_tx_ring_inuse can be changed after
5498 * hn_suspend_data().
5500 hn_resume_tx(sc, sc->hn_tx_ring_cnt);
5502 #ifdef HN_IFSTART_SUPPORT
5503 if (!hn_use_if_start)
5507 * Flush unused drbrs, since hn_tx_ring_inuse may be
5510 for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
5511 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
5517 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
5518 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
5521 * Use txeof task, so that any pending oactive can be
5524 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
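/* Resume management: reattach hn_mgmt_taskq and re-check link status. */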
5529 hn_resume_mgmt(struct hn_softc *sc)
5532 sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
5535 * Kick off network change detection if it was pending.
5536 * If no network change was pending, start link status
5537 * checks, which are more lightweight than network change
5540 if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
5541 hn_change_network(sc);
5543 hn_update_link_status(sc);
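/* Resume both the data and management paths, and re-enable polling. */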
5547 hn_resume(struct hn_softc *sc)
5550 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
5551 (sc->hn_flags & HN_FLAG_RXVF))
5555 * When the VF is activated, the synthetic interface is changed
5556 * to DOWN in hn_rxvf_change(). Here, if the VF is still active,
5557 * we don't call hn_resume_mgmt() until the VF is deactivated in
5560 if (!(sc->hn_flags & HN_FLAG_RXVF))
5564 * Re-enable polling if this interface is running and
5565 * polling is requested.
5567 if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0)
5568 hn_polling(sc, sc->hn_pollhz);
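/* Handle RNDIS status indications, e.g. link state and network changes. */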
5572 hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
5574 const struct rndis_status_msg *msg;
5577 if (dlen < sizeof(*msg)) {
5578 if_printf(sc->hn_ifp, "invalid RNDIS status\n");
5583 switch (msg->rm_status) {
5584 case RNDIS_STATUS_MEDIA_CONNECT:
5585 case RNDIS_STATUS_MEDIA_DISCONNECT:
5586 hn_update_link_status(sc);
5589 case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
5590 case RNDIS_STATUS_LINK_SPEED_CHANGE:
5591 /* Not really useful; ignore. */
5594 case RNDIS_STATUS_NETWORK_CHANGE:
5595 ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
5596 if (dlen < ofs + msg->rm_stbuflen ||
5597 msg->rm_stbuflen < sizeof(uint32_t)) {
5598 if_printf(sc->hn_ifp, "network changed\n");
5602 memcpy(&change, ((const uint8_t *)msg) + ofs,
5604 if_printf(sc->hn_ifp, "network changed, change %u\n",
5607 hn_change_network(sc);
5611 if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
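/*
 * Walk the RNDIS per-packet-info list and collect the VLAN, RX
 * checksum and RSS hash information into 'info'.  Returns 0 on
 * success, or a non-zero value on malformed input.
 */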
5618 hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
5620 const struct rndis_pktinfo *pi = info_data;
5623 while (info_dlen != 0) {
5627 if (__predict_false(info_dlen < sizeof(*pi)))
5629 if (__predict_false(info_dlen < pi->rm_size))
5631 info_dlen -= pi->rm_size;
5633 if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
5635 if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
5637 dlen = pi->rm_size - pi->rm_pktinfooffset;
5640 switch (pi->rm_type) {
5641 case NDIS_PKTINFO_TYPE_VLAN:
5642 if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
5644 info->vlan_info = *((const uint32_t *)data);
5645 mask |= HN_RXINFO_VLAN;
5648 case NDIS_PKTINFO_TYPE_CSUM:
5649 if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
5651 info->csum_info = *((const uint32_t *)data);
5652 mask |= HN_RXINFO_CSUM;
5655 case HN_NDIS_PKTINFO_TYPE_HASHVAL:
5656 if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
5658 info->hash_value = *((const uint32_t *)data);
5659 mask |= HN_RXINFO_HASHVAL;
5662 case HN_NDIS_PKTINFO_TYPE_HASHINF:
5663 if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
5665 info->hash_info = *((const uint32_t *)data);
5666 mask |= HN_RXINFO_HASHINF;
5673 if (mask == HN_RXINFO_ALL) {
5674 /* All found; done */
5678 pi = (const struct rndis_pktinfo *)
5679 ((const uint8_t *)pi + pi->rm_size);
5684 * - If there is no hash value, invalidate the hash info.
5686 if ((mask & HN_RXINFO_HASHVAL) == 0)
5687 info->hash_info = HN_NDIS_HASH_INFO_INVALID;
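/*
 * Return true if the region [off, off + len) overlaps
 * [check_off, check_off + check_len); e.g. offsets 0/8 and 4/8
 * overlap, while 0/4 and 4/4 do not.
 */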
5691 static __inline bool
5692 hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
5695 if (off < check_off) {
5696 if (__predict_true(off + len <= check_off))
5698 } else if (off > check_off) {
5699 if (__predict_true(check_off + check_len <= off))
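/*
 * Validate an RNDIS data message: check the message length, the
 * data/OOB/pktinfo offsets, and region overlaps, then hand the
 * payload and the extracted per-packet-info to hn_rxpkt().
 */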
5706 hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
5708 const struct rndis_packet_msg *pkt;
5709 struct hn_rxinfo info;
5710 int data_off, pktinfo_off, data_len, pktinfo_len;
5715 if (__predict_false(dlen < sizeof(*pkt))) {
5716 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
5721 if (__predict_false(dlen < pkt->rm_len)) {
5722 if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
5723 "dlen %d, msglen %u\n", dlen, pkt->rm_len);
5726 if (__predict_false(pkt->rm_len <
5727 pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
5728 if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
5729 "msglen %u, data %u, oob %u, pktinfo %u\n",
5730 pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
5731 pkt->rm_pktinfolen);
5734 if (__predict_false(pkt->rm_datalen == 0)) {
5735 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
5742 #define IS_OFFSET_INVALID(ofs) \
5743 ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \
5744 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))
5746 /* XXX Hyper-V does not meet data offset alignment requirement */
5747 if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
5748 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5749 "data offset %u\n", pkt->rm_dataoffset);
5752 if (__predict_false(pkt->rm_oobdataoffset > 0 &&
5753 IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
5754 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5755 "oob offset %u\n", pkt->rm_oobdataoffset);
5758 if (__predict_true(pkt->rm_pktinfooffset > 0) &&
5759 __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
5760 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5761 "pktinfo offset %u\n", pkt->rm_pktinfooffset);
5765 #undef IS_OFFSET_INVALID
5767 data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
5768 data_len = pkt->rm_datalen;
5769 pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
5770 pktinfo_len = pkt->rm_pktinfolen;
5773 * Check OOB coverage.
5775 if (__predict_false(pkt->rm_oobdatalen != 0)) {
5776 int oob_off, oob_len;
5778 if_printf(rxr->hn_ifp, "got oobdata\n");
5779 oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
5780 oob_len = pkt->rm_oobdatalen;
5782 if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
5783 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5784 "oob overflow, msglen %u, oob abs %d len %d\n",
5785 pkt->rm_len, oob_off, oob_len);
5790 * Check against data.
5792 if (hn_rndis_check_overlap(oob_off, oob_len,
5793 data_off, data_len)) {
5794 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5795 "oob overlaps data, oob abs %d len %d, "
5796 "data abs %d len %d\n",
5797 oob_off, oob_len, data_off, data_len);
5802 * Check against pktinfo.
5804 if (pktinfo_len != 0 &&
5805 hn_rndis_check_overlap(oob_off, oob_len,
5806 pktinfo_off, pktinfo_len)) {
5807 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5808 "oob overlaps pktinfo, oob abs %d len %d, "
5809 "pktinfo abs %d len %d\n",
5810 oob_off, oob_len, pktinfo_off, pktinfo_len);
5816 * Check per-packet-info coverage and find useful per-packet-info.
5818 info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
5819 info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
5820 info.hash_info = HN_NDIS_HASH_INFO_INVALID;
5821 if (__predict_true(pktinfo_len != 0)) {
5825 if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
5826 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5827 "pktinfo overflow, msglen %u, "
5828 "pktinfo abs %d len %d\n",
5829 pkt->rm_len, pktinfo_off, pktinfo_len);
5834 * Check packet info coverage.
5836 overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
5837 data_off, data_len);
5838 if (__predict_false(overlap)) {
5839 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5840 "pktinfo overlap data, pktinfo abs %d len %d, "
5841 "data abs %d len %d\n",
5842 pktinfo_off, pktinfo_len, data_off, data_len);
5847 * Find useful per-packet-info.
5849 error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
5850 pktinfo_len, &info);
5851 if (__predict_false(error)) {
5852 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
5858 if (__predict_false(data_off + data_len > pkt->rm_len)) {
5859 if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
5860 "data overflow, msglen %u, data abs %d len %d\n",
5861 pkt->rm_len, data_off, data_len);
5864 hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
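/* Dispatch an inbound RNDIS message: data, status, or control. */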
5867 static __inline void
5868 hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
5870 const struct rndis_msghdr *hdr;
5872 if (__predict_false(dlen < sizeof(*hdr))) {
5873 if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
5878 if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
5879 /* Hot data path. */
5880 hn_rndis_rx_data(rxr, data, dlen);
5885 if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
5886 hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
5888 hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
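/* Handle an inband NVS notification from the hypervisor. */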
5892 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
5894 const struct hn_nvs_hdr *hdr;
5896 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
5897 if_printf(sc->hn_ifp, "invalid nvs notify\n");
5900 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
5902 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
5903 /* Useless; ignore */
5906 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
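/*
 * Handle a send completion: run the callback recorded in the NVS
 * send context.
 */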
5910 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
5911 const struct vmbus_chanpkt_hdr *pkt)
5913 struct hn_nvs_sendctx *sndc;
5915 sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
5916 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
5917 VMBUS_CHANPKT_DATALEN(pkt));
5920 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
5926 hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5927 const struct vmbus_chanpkt_hdr *pkthdr)
5929 const struct vmbus_chanpkt_rxbuf *pkt;
5930 const struct hn_nvs_hdr *nvs_hdr;
5933 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
5934 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
5937 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
5939 /* Make sure that this is an RNDIS message. */
5940 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
5941 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
5946 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
5947 if (__predict_false(hlen < sizeof(*pkt))) {
5948 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
5951 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
5953 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
5954 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
5959 count = pkt->cp_rxbuf_cnt;
5960 if (__predict_false(hlen <
5961 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
5962 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
5966 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
5967 for (i = 0; i < count; ++i) {
5970 ofs = pkt->cp_rxbuf[i].rb_ofs;
5971 len = pkt->cp_rxbuf[i].rb_len;
5972 if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
5973 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
5974 "ofs %d, len %d\n", i, ofs, len);
5977 hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
5981 * Ack the consumed RXBUF associated w/ this channel packet,
5982 * so that this RXBUF can be recycled by the hypervisor.
5984 hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
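/*
 * Ack a consumed RXBUF back to the hypervisor, so that the RXBUF can
 * be recycled; retried if the channel's TX bufring is transiently
 * full.
 */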
5988 hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
5991 struct hn_nvs_rndis_ack ack;
5994 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
5995 ack.nvs_status = HN_NVS_STATUS_OK;
5999 error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
6000 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
6001 if (__predict_false(error == EAGAIN)) {
6004 * This should _not_ happen in the real world, since the
6005 * consumption of the TX bufring from the TX path is
6008 if (rxr->hn_ack_failed == 0)
6009 if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
6010 rxr->hn_ack_failed++;
6017 if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
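/*
 * Channel interrupt callback: receive channel packets, growing the
 * per-ring packet buffer as needed, and dispatch them by type
 * (completion, RXBUF, or inband notification).
 */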
6022 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
6024 struct hn_rx_ring *rxr = xrxr;
6025 struct hn_softc *sc = rxr->hn_ifp->if_softc;
6028 struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
6031 pktlen = rxr->hn_pktbuf_len;
6032 error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
6033 if (__predict_false(error == ENOBUFS)) {
6038 * Expand channel packet buffer.
6041 * Use M_WAITOK here, since allocation failure
6044 nlen = rxr->hn_pktbuf_len * 2;
6045 while (nlen < pktlen)
6047 nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);
6049 if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
6050 rxr->hn_pktbuf_len, nlen);
6052 free(rxr->hn_pktbuf, M_DEVBUF);
6053 rxr->hn_pktbuf = nbuf;
6054 rxr->hn_pktbuf_len = nlen;
6057 } else if (__predict_false(error == EAGAIN)) {
6058 /* No more channel packets; done! */
6061 KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));
6063 switch (pkt->cph_type) {
6064 case VMBUS_CHANPKT_TYPE_COMP:
6065 hn_nvs_handle_comp(sc, chan, pkt);
6068 case VMBUS_CHANPKT_TYPE_RXBUF:
6069 hn_nvs_handle_rxbuf(rxr, chan, pkt);
6072 case VMBUS_CHANPKT_TYPE_INBAND:
6073 hn_nvs_handle_notify(sc, pkt);
6077 if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
6082 hn_chan_rollup(rxr, rxr->hn_txr);
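/*
 * Module init: set up the VF map, sanitize the TX taskqueue tunables
 * and, on Hyper-V with the global taskqueue mode, create the shared
 * TX taskqueues.
 */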
6086 hn_sysinit(void *arg __unused)
6091 * Initialize VF map.
6093 rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE);
6094 hn_vfmap_size = HN_VFMAP_SIZE_DEF;
6095 hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF,
6099 * Fix the # of TX taskqueues.
6101 if (hn_tx_taskq_cnt <= 0)
6102 hn_tx_taskq_cnt = 1;
6103 else if (hn_tx_taskq_cnt > mp_ncpus)
6104 hn_tx_taskq_cnt = mp_ncpus;
6107 * Fix the TX taskqueue mode.
6109 switch (hn_tx_taskq_mode) {
6110 case HN_TX_TASKQ_M_INDEP:
6111 case HN_TX_TASKQ_M_GLOBAL:
6112 case HN_TX_TASKQ_M_EVTTQ:
6115 hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP;
6119 if (vm_guest != VM_GUEST_HV)
6122 if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL)
6125 hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *),
6126 M_DEVBUF, M_WAITOK);
6127 for (i = 0; i < hn_tx_taskq_cnt; ++i) {
6128 hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK,
6129 taskqueue_thread_enqueue, &hn_tx_taskque[i]);
6130 taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET,
6134 SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL);
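/* Module teardown: free the shared TX taskqueues and the VF map. */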
6137 hn_sysuninit(void *arg __unused)
6140 if (hn_tx_taskque != NULL) {
6143 for (i = 0; i < hn_tx_taskq_cnt; ++i)
6144 taskqueue_free(hn_tx_taskque[i]);
6145 free(hn_tx_taskque, M_DEVBUF);
6148 if (hn_vfmap != NULL)
6149 free(hn_vfmap, M_DEVBUF);
6150 rm_destroy(&hn_vfmap_lock);
6152 SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL);