/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"
#include "opt_inet.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/buf_ring.h>

#include <machine/atomic.h>
#include <machine/in_cksum.h>

#include <net/bpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/if_vlan_var.h>
#include <net/rndis.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_lro.h>
#include <netinet/udp.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus.h>
#include <dev/hyperv/include/vmbus_xact.h>

#include <dev/hyperv/netvsc/ndis.h>
#include <dev/hyperv/netvsc/if_hnreg.h>
#include <dev/hyperv/netvsc/if_hnvar.h>
#include <dev/hyperv/netvsc/hn_nvs.h>
#include <dev/hyperv/netvsc/hn_rndis.h>

#include "vmbus_if.h"
#define HN_IFSTART_SUPPORT

#define HN_RING_CNT_DEF_MAX		8

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT			512

#define HN_RNDIS_PKT_LEN					\
	(sizeof(struct rndis_packet_msg) +			\
	 HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +	\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +		\
	 HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
#define HN_RNDIS_PKT_BOUNDARY		PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN		CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY		PAGE_SIZE
#define HN_TX_DATA_MAXSIZE		IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE		PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX		(HN_GPACNT_MAX - 1)

#define HN_DIRECT_TX_SIZE_DEF		128

#define HN_EARLY_TXEOF_THRESH		8

#define HN_PKTBUF_LEN_DEF		(16 * 1024)

#define HN_LROENT_CNT_DEF		128

#define HN_LRO_LENLIM_MULTIRX_DEF	(12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF		(25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)		(2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF		1

#define HN_LOCK_INIT(sc)					\
	sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_DESTROY(sc)		sx_destroy(&(sc)->hn_lock)
#define HN_LOCK_ASSERT(sc)		sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK(sc)					\
do {							\
	while (sx_try_xlock(&(sc)->hn_lock) == 0)	\
		DELAY(1000);				\
} while (0)
#define HN_UNLOCK(sc)			sx_xunlock(&(sc)->hn_lock)

#define HN_CSUM_IP_MASK			(CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP)
#define HN_CSUM_IP6_MASK		(CSUM_IP6_TCP | CSUM_IP6_UDP)
#define HN_CSUM_IP_HWASSIST(sc)		\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK)
#define HN_CSUM_IP6_HWASSIST(sc)	\
	((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK)

#define HN_PKTSIZE_MIN(align)		\
	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \
	    HN_RNDIS_PKT_LEN, (align))
#define HN_PKTSIZE(m, align)		\
	roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align))
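/*
 * Illustrative example (not from the original source): HN_PKTSIZE()
 * reserves worst-case RNDIS header space in front of the frame and rounds
 * the sum up to the aggregation alignment, e.g. for a 1500-byte packet
 * with a 32-byte alignment:
 *
 *	HN_PKTSIZE(m, 32) == roundup2(1500 + HN_RNDIS_PKT_LEN, 32)
 *
 * HN_PKTSIZE_MIN() is the same bound for a minimum VLAN-tagged frame; the
 * TX aggregation code below uses it as the "can one more packet still
 * fit" threshold.
 */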
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
	SLIST_ENTRY(hn_txdesc)		link;
#endif
	STAILQ_ENTRY(hn_txdesc)		agg_link;

	/* Aggregated txdescs, in sending order. */
	STAILQ_HEAD(, hn_txdesc)	agg_list;

	/* The oldest packet, if transmission aggregation happens. */
	struct mbuf			*m;
	struct hn_tx_ring		*txr;
	int				refs;
	uint32_t			flags;	/* HN_TXD_FLAG_ */
	struct hn_nvs_sendctx		send_ctx;
	uint32_t			chim_index;
	int				chim_size;

	bus_dmamap_t			data_dmap;

	bus_addr_t			rndis_pkt_paddr;
	struct rndis_packet_msg		*rndis_pkt;
	bus_dmamap_t			rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST		0x0001
#define HN_TXD_FLAG_DMAMAP		0x0002
#define HN_TXD_FLAG_ONAGG		0x0004

struct hn_rxinfo {
	uint32_t			vlan_info;
	uint32_t			csum_info;
	uint32_t			hash_info;
	uint32_t			hash_value;
};

#define HN_RXINFO_VLAN			0x0001
#define HN_RXINFO_CSUM			0x0002
#define HN_RXINFO_HASHINF		0x0004
#define HN_RXINFO_HASHVAL		0x0008
#define HN_RXINFO_ALL			\
	(HN_RXINFO_VLAN |		\
	 HN_RXINFO_CSUM |		\
	 HN_RXINFO_HASHINF |		\
	 HN_RXINFO_HASHVAL)

#define HN_NDIS_VLAN_INFO_INVALID	0xffffffff
#define HN_NDIS_RXCSUM_INFO_INVALID	0
#define HN_NDIS_HASH_INFO_INVALID	0
static int			hn_probe(device_t);
static int			hn_attach(device_t);
static int			hn_detach(device_t);
static int			hn_shutdown(device_t);
static void			hn_chan_callback(struct vmbus_channel *,
				    void *);

static void			hn_init(void *);
static int			hn_ioctl(struct ifnet *, u_long, caddr_t);
#ifdef HN_IFSTART_SUPPORT
static void			hn_start(struct ifnet *);
#endif
static int			hn_transmit(struct ifnet *, struct mbuf *);
static void			hn_xmit_qflush(struct ifnet *);
static int			hn_ifmedia_upd(struct ifnet *);
static void			hn_ifmedia_sts(struct ifnet *,
				    struct ifmediareq *);

static int			hn_rndis_rxinfo(const void *, int,
				    struct hn_rxinfo *);
static void			hn_rndis_rx_data(struct hn_rx_ring *,
				    const void *, int);
static void			hn_rndis_rx_status(struct hn_softc *,
				    const void *, int);

static void			hn_nvs_handle_notify(struct hn_softc *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_comp(struct hn_softc *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_handle_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *,
				    const struct vmbus_chanpkt_hdr *);
static void			hn_nvs_ack_rxbuf(struct hn_rx_ring *,
				    struct vmbus_channel *, uint64_t);

#if __FreeBSD_version >= 1100099
static int			hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int			hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int			hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int			hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_caps_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS);
static int			hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS);

static void			hn_stop(struct hn_softc *);
static void			hn_init_locked(struct hn_softc *);
static int			hn_chan_attach(struct hn_softc *,
				    struct vmbus_channel *);
static void			hn_chan_detach(struct hn_softc *,
				    struct vmbus_channel *);
static int			hn_attach_subchans(struct hn_softc *);
static void			hn_detach_allchans(struct hn_softc *);
static void			hn_chan_rollup(struct hn_rx_ring *,
				    struct hn_tx_ring *);
static void			hn_set_ring_inuse(struct hn_softc *, int);
static int			hn_synth_attach(struct hn_softc *, int);
static void			hn_synth_detach(struct hn_softc *);
static int			hn_synth_alloc_subchans(struct hn_softc *,
				    int *);
static void			hn_suspend(struct hn_softc *);
static void			hn_suspend_data(struct hn_softc *);
static void			hn_suspend_mgmt(struct hn_softc *);
static void			hn_resume(struct hn_softc *);
static void			hn_resume_data(struct hn_softc *);
static void			hn_resume_mgmt(struct hn_softc *);
static void			hn_suspend_mgmt_taskfunc(void *, int);
static void			hn_chan_drain(struct hn_softc *,
				    struct vmbus_channel *);

static void			hn_update_link_status(struct hn_softc *);
static void			hn_change_network(struct hn_softc *);
static void			hn_link_taskfunc(void *, int);
static void			hn_netchg_init_taskfunc(void *, int);
static void			hn_netchg_status_taskfunc(void *, int);
static void			hn_link_status(struct hn_softc *);

static int			hn_create_rx_data(struct hn_softc *, int);
static void			hn_destroy_rx_data(struct hn_softc *);
static int			hn_check_iplen(const struct mbuf *, int);
static int			hn_set_rxfilter(struct hn_softc *);
static int			hn_rss_reconfig(struct hn_softc *);
static void			hn_rss_ind_fixup(struct hn_softc *, int);
static int			hn_rxpkt(struct hn_rx_ring *, const void *,
				    int, const struct hn_rxinfo *);

static int			hn_tx_ring_create(struct hn_softc *, int);
static void			hn_tx_ring_destroy(struct hn_tx_ring *);
static int			hn_create_tx_data(struct hn_softc *, int);
static void			hn_fixup_tx_data(struct hn_softc *);
static void			hn_destroy_tx_data(struct hn_softc *);
static void			hn_txdesc_dmamap_destroy(struct hn_txdesc *);
static void			hn_txdesc_gc(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_encap(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *, struct mbuf **);
static int			hn_txpkt(struct ifnet *, struct hn_tx_ring *,
				    struct hn_txdesc *);
static void			hn_set_chim_size(struct hn_softc *, int);
static void			hn_set_tso_maxsize(struct hn_softc *, int, int);
static bool			hn_tx_ring_pending(struct hn_tx_ring *);
static void			hn_tx_ring_qflush(struct hn_tx_ring *);
static void			hn_resume_tx(struct hn_softc *, int);
static void			hn_set_txagg(struct hn_softc *);
static void			*hn_try_txagg(struct ifnet *,
				    struct hn_tx_ring *, struct hn_txdesc *,
				    int);
static int			hn_get_txswq_depth(const struct hn_tx_ring *);
static void			hn_txpkt_done(struct hn_nvs_sendctx *,
				    struct hn_softc *, struct vmbus_channel *,
				    const void *, int);
static int			hn_txpkt_sglist(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_txpkt_chim(struct hn_tx_ring *,
				    struct hn_txdesc *);
static int			hn_xmit(struct hn_tx_ring *, int);
static void			hn_xmit_taskfunc(void *, int);
static void			hn_xmit_txeof(struct hn_tx_ring *);
static void			hn_xmit_txeof_taskfunc(void *, int);
#ifdef HN_IFSTART_SUPPORT
static int			hn_start_locked(struct hn_tx_ring *, int);
static void			hn_start_taskfunc(void *, int);
static void			hn_start_txeof(struct hn_tx_ring *);
static void			hn_start_txeof_taskfunc(void *, int);
#endif
SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust TCP segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust TCP segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust UDP datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust UDP datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust IP packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust IP packet verification on host side, "
    "when csum info is missing (global setting)");

/* Limit TSO burst size */
static int hn_tso_maxlen = IP_MAXPACKET;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

/* # of LRO entries per RX ring */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

/* Use shared TX taskqueue */
static int hn_share_tx_taskq = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN,
    &hn_share_tx_taskq, 0, "Enable shared TX taskqueue");

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

/* Bind TX taskqueue to the target CPU */
static int hn_bind_tx_taskq = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
    &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");

#ifdef HN_IFSTART_SUPPORT
/* Use ifnet.if_start instead of ifnet.if_transmit */
static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");
#endif

/* # of channels to use */
static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

/* # of transmit rings to use */
static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

/* Software TX ring depth */
static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

/* Enable sorted LRO, and the depth of the per-channel mbuf queue */
#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

/* Packet transmission aggregation size limit */
static int hn_tx_agg_size = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN,
    &hn_tx_agg_size, 0, "Packet transmission aggregation size limit");

/* Packet transmission aggregation count limit */
static int hn_tx_agg_pkts = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN,
    &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit");

static u_int			hn_cpu_index;	/* next CPU for channel */
static struct taskqueue		*hn_tx_taskq;	/* shared TX taskqueue */

static const uint8_t
hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
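/*
 * NOTE (editorial): this is the canonical 40-byte Toeplitz hash key from
 * Microsoft's RSS documentation; the same default key is used by most
 * RSS-capable NIC drivers.
 */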
static device_method_t hn_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		hn_probe),
	DEVMETHOD(device_attach,	hn_attach),
	DEVMETHOD(device_detach,	hn_detach),
	DEVMETHOD(device_shutdown,	hn_shutdown),

	DEVMETHOD_END
};

static driver_t hn_driver = {
	"hn",
	hn_methods,
	sizeof(struct hn_softc)
};

static devclass_t hn_devclass;

DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0);
MODULE_VERSION(hn, 1);
MODULE_DEPEND(hn, vmbus, 1, 1, 1);

#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
	int i;

	for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
		sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size == 0, ("invalid rndis sglist txd"));
	return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
	    &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt));
}

static int
hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	struct hn_nvs_rndis rndis;

	KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID &&
	    txd->chim_size > 0, ("invalid rndis chim txd"));

	rndis.nvs_type = HN_NVS_TYPE_RNDIS;
	rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA;
	rndis.nvs_chim_idx = txd->chim_index;
	rndis.nvs_chim_sz = txd->chim_size;

	return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC,
	    &rndis, sizeof(rndis), &txd->send_ctx));
}

static __inline uint32_t
hn_chim_alloc(struct hn_softc *sc)
{
	int i, bmap_cnt = sc->hn_chim_bmap_cnt;
	u_long *bmap = sc->hn_chim_bmap;
	uint32_t ret = HN_NVS_CHIM_IDX_INVALID;

	for (i = 0; i < bmap_cnt; ++i) {
		int idx;

		idx = ffsl(~bmap[i]);
		if (idx == 0)
			continue;

		--idx; /* ffsl is 1-based */
		KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt,
		    ("invalid i %d and idx %d", i, idx));

		if (atomic_testandset_long(&bmap[i], idx))
			continue;

		ret = i * LONG_BIT + idx;
		break;
	}
	return (ret);
}

static __inline void
hn_chim_free(struct hn_softc *sc, uint32_t chim_idx)
{
	u_long mask;
	uint32_t idx;

	idx = chim_idx / LONG_BIT;
	KASSERT(idx < sc->hn_chim_bmap_cnt,
	    ("invalid chimney index 0x%x", chim_idx));

	mask = 1UL << (chim_idx % LONG_BIT);
	KASSERT(sc->hn_chim_bmap[idx] & mask,
	    ("index bitmap 0x%lx, chimney index %u, "
	     "bitmap idx %d, bitmask 0x%lx",
	     sc->hn_chim_bmap[idx], chim_idx, idx, mask));

	atomic_clear_long(&sc->hn_chim_bmap[idx], mask);
}
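/*
 * Illustrative example of the index <-> bitmap mapping used above: on
 * LP64 (LONG_BIT == 64), chimney index 70 lives in bitmap word
 * 70 / 64 == 1 at bit 70 % 64 == 6, i.e. mask 1UL << 6.  hn_chim_alloc()
 * sets that bit with atomic_testandset_long() and hn_chim_free() clears
 * it with atomic_clear_long(), so no lock is needed around the bitmap.
 */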
#if defined(INET6) || defined(INET)
/*
 * NOTE: If this function fails, m_head will be freed.
 */
static __inline struct mbuf *
hn_tso_fixup(struct mbuf *m_head)
{
	struct ether_vlan_header *evl;
	struct tcphdr *th;
	int ehlen;

	KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable"));

#define PULLUP_HDR(m, len)				\
do {							\
	if (__predict_false((m)->m_len < (len))) {	\
		(m) = m_pullup((m), (len));		\
		if ((m) == NULL)			\
			return (NULL);			\
	}						\
} while (0)

	PULLUP_HDR(m_head, sizeof(*evl));
	evl = mtod(m_head, struct ether_vlan_header *);
	if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN))
		ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	else
		ehlen = ETHER_HDR_LEN;

#ifdef INET
	if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
		struct ip *ip;
		int iphlen;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip));
		ip = mtodo(m_head, ehlen);
		iphlen = ip->ip_hl << 2;

		PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th));
		th = mtodo(m_head, ehlen + iphlen);

		ip->ip_len = 0;
		ip->ip_sum = 0;
		th->th_sum = in_pseudo(ip->ip_src.s_addr,
		    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET6
	{
		struct ip6_hdr *ip6;

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6));
		ip6 = mtodo(m_head, ehlen);
		if (ip6->ip6_nxt != IPPROTO_TCP) {
			m_freem(m_head);
			return (NULL);
		}

		PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th));
		th = mtodo(m_head, ehlen + sizeof(*ip6));

		ip6->ip6_plen = 0;
		th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
	}
#endif
	return (m_head);

#undef PULLUP_HDR
}
#endif	/* INET6 || INET */
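/*
 * A note on the TSO fixup above (editorial sketch): the TCP checksum
 * field is seeded with the pseudo-header checksum only -- the length is
 * excluded, since in_pseudo()/in6_cksum_pseudo() are called with a zero
 * length -- and the IP length fields are zeroed.  The host is then
 * expected to fill in the per-segment lengths and final checksums when
 * it performs the actual segmentation.
 */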
static int
hn_set_rxfilter(struct hn_softc *sc)
{
	struct ifnet *ifp = sc->hn_ifp;
	uint32_t filter;
	int error = 0;

	HN_LOCK_ASSERT(sc);

	if (ifp->if_flags & IFF_PROMISC) {
		filter = NDIS_PACKET_TYPE_PROMISCUOUS;
	} else {
		filter = NDIS_PACKET_TYPE_DIRECTED;
		if (ifp->if_flags & IFF_BROADCAST)
			filter |= NDIS_PACKET_TYPE_BROADCAST;
		/* TODO: support multicast list */
		if ((ifp->if_flags & IFF_ALLMULTI) ||
		    !TAILQ_EMPTY(&ifp->if_multiaddrs))
			filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	}

	if (sc->hn_rx_filter != filter) {
		error = hn_rndis_set_rxfilter(sc, filter);
		if (!error)
			sc->hn_rx_filter = filter;
	}
	return (error);
}
static void
hn_set_txagg(struct hn_softc *sc)
{
	uint32_t size, pkts;
	int i;

	/*
	 * Setup aggregation size.
	 */
	if (sc->hn_agg_size < 0)
		size = UINT32_MAX;
	else
		size = sc->hn_agg_size;

	if (sc->hn_rndis_agg_size < size)
		size = sc->hn_rndis_agg_size;

	if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) {
		/* Disable aggregation */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'int'. */
	if (size > INT_MAX)
		size = INT_MAX;

	/* NOTE: We only aggregate packets using chimney sending buffers. */
	if (size > (uint32_t)sc->hn_chim_szmax)
		size = sc->hn_chim_szmax;

	/*
	 * Setup aggregation packet count.
	 */
	if (sc->hn_agg_pkts < 0)
		pkts = UINT32_MAX;
	else
		pkts = sc->hn_agg_pkts;

	if (sc->hn_rndis_agg_pkts < pkts)
		pkts = sc->hn_rndis_agg_pkts;

	if (pkts <= 1) {
		/* Disable aggregation */
		size = 0;
		pkts = 0;
		goto done;
	}

	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (pkts > SHRT_MAX)
		pkts = SHRT_MAX;

done:
	/* NOTE: Type of the per TX ring setting is 'short'. */
	if (sc->hn_rndis_agg_align > SHRT_MAX) {
		/* Disable aggregation */
		size = 0;
		pkts = 0;
	}

	if (bootverbose) {
		if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n",
		    size, pkts, sc->hn_rndis_agg_align);
	}

	for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		mtx_lock(&txr->hn_tx_lock);
		txr->hn_agg_szmax = size;
		txr->hn_agg_pktmax = pkts;
		txr->hn_agg_align = sc->hn_rndis_agg_align;
		mtx_unlock(&txr->hn_tx_lock);
	}
}
static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{

	KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
	if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
		return txr->hn_txdesc_cnt;
	return hn_tx_swq_depth;
}
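/*
 * Example (editorial): with the default HN_TX_DESC_CNT (512) txdescs and
 * the hw.hn.tx_swq_depth tunable left at its default of 0, the software
 * TX queue depth is 512; any tunable value below the txdesc count is
 * effectively rounded up to the txdesc count.
 */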
static int
hn_rss_reconfig(struct hn_softc *sc)
{
	int error;

	HN_LOCK_ASSERT(sc);

	if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
		return (ENXIO);

	/*
	 * Disable RSS first.
	 *
	 * NOTE:
	 * Direct reconfiguration by setting the UNCHG flags does
	 * _not_ work properly.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "disable RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS disable failed\n");
		return (error);
	}

	/*
	 * Reenable the RSS w/ the updated RSS key or indirect
	 * table.
	 */
	if (bootverbose)
		if_printf(sc->hn_ifp, "reconfig RSS\n");
	error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
	if (error) {
		if_printf(sc->hn_ifp, "RSS reconfig failed\n");
		return (error);
	}
	return (0);
}

static void
hn_rss_ind_fixup(struct hn_softc *sc, int nchan)
{
	struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
	int i;

	KASSERT(nchan > 1, ("invalid # of channels %d", nchan));

	/*
	 * Check indirect table to make sure that all channels in it
	 * can be used.
	 */
	for (i = 0; i < NDIS_HASH_INDCNT; ++i) {
		if (rss->rss_ind[i] >= nchan) {
			if_printf(sc->hn_ifp,
			    "RSS indirect table %d fixup: %u -> %d\n",
			    i, rss->rss_ind[i], nchan - 1);
			rss->rss_ind[i] = nchan - 1;
		}
	}
}
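/*
 * Example (editorial): if the usable channel count drops from 4 to 2,
 * every indirect table entry pointing at channel 2 or 3 is remapped to
 * channel 1 (nchan - 1) by the fixup above, keeping all entries within
 * the usable channel range.
 */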
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

	return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
	struct hn_softc *sc = ifp->if_softc;

	ifmr->ifm_status = IFM_AVALID;
	ifmr->ifm_active = IFM_ETHER;

	if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) {
		ifmr->ifm_active |= IFM_NONE;
		return;
	}
	ifmr->ifm_status |= IFM_ACTIVE;
	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
	.hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
	    0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};
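/*
 * NOTE (editorial): the first three GUID fields are stored little-endian,
 * which is why the bytes above appear reversed relative to the textual
 * form: F8615163 -> 63 51 61 F8, DF3E -> 3E DF, 46c5 -> c5 46.  The
 * remaining 913F-F2D2F965ED0E bytes follow in textual order.
 */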
static int
hn_probe(device_t dev)
{

	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
	    &g_net_vsc_device_type) == 0) {
		device_set_desc(dev, "Hyper-V Network Interface");
		return BUS_PROBE_DEFAULT;
	}
	return ENXIO;
}

static void
hn_cpuset_setthread_task(void *xmask, int pending __unused)
{
	cpuset_t *mask = xmask;
	int error;

	error = cpuset_setthread(curthread->td_tid, mask);
	if (error) {
		panic("curthread=%ju: can't pin; error=%d",
		    (uintmax_t)curthread->td_tid, error);
	}
}
static int
hn_attach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct sysctl_oid_list *child;
	struct sysctl_ctx_list *ctx;
	uint8_t eaddr[ETHER_ADDR_LEN];
	struct ifnet *ifp = NULL;
	int error, ring_cnt, tx_ring_cnt;

	sc->hn_dev = dev;
	sc->hn_prichan = vmbus_get_channel(dev);
	HN_LOCK_INIT(sc);

	/*
	 * Initialize these tunables once.
	 */
	sc->hn_agg_size = hn_tx_agg_size;
	sc->hn_agg_pkts = hn_tx_agg_pkts;

	/*
	 * Setup taskqueue for transmission.
	 */
	if (hn_tx_taskq == NULL) {
		sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
		    taskqueue_thread_enqueue, &sc->hn_tx_taskq);
		taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx",
		    device_get_nameunit(dev));
		if (hn_bind_tx_taskq >= 0) {
			int cpu = hn_bind_tx_taskq;
			struct task cpuset_task;
			cpuset_t cpu_set;

			if (cpu > mp_ncpus - 1)
				cpu = mp_ncpus - 1;
			CPU_SETOF(cpu, &cpu_set);
			TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task,
			    &cpu_set);
			taskqueue_enqueue(sc->hn_tx_taskq, &cpuset_task);
			taskqueue_drain(sc->hn_tx_taskq, &cpuset_task);
		}
	} else {
		sc->hn_tx_taskq = hn_tx_taskq;
	}

	/*
	 * Setup taskqueue for management tasks, e.g. link status.
	 */
	sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0);
	taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt",
	    device_get_nameunit(dev));
	TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc);
	TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc);
	TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
	    hn_netchg_status_taskfunc, sc);

	/*
	 * Allocate ifnet and setup its name earlier, so that if_printf
	 * can be used by functions which will be called after
	 * ether_ifattach().
	 */
	ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
	ifp->if_softc = sc;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/*
	 * Initialize ifmedia earlier so that it can be unconditionally
	 * destroyed, if an error happens later on.
	 */
	ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

	/*
	 * Figure out the # of RX rings (ring_cnt) and the # of TX rings
	 * to use (tx_ring_cnt).
	 *
	 * NOTE:
	 * The # of RX rings to use is the same as the # of channels to use.
	 */
	ring_cnt = hn_chan_cnt;
	if (ring_cnt <= 0) {
		/* Default */
		ring_cnt = mp_ncpus;
		if (ring_cnt > HN_RING_CNT_DEF_MAX)
			ring_cnt = HN_RING_CNT_DEF_MAX;
	} else if (ring_cnt > mp_ncpus) {
		ring_cnt = mp_ncpus;
	}

	tx_ring_cnt = hn_tx_ring_cnt;
	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
		tx_ring_cnt = ring_cnt;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		/* ifnet.if_start only needs one TX ring. */
		tx_ring_cnt = 1;
	}
#endif

	/*
	 * Set the leader CPU for channels.
	 */
	sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;

	/*
	 * Create enough TX/RX rings, even if only a limited number of
	 * channels can be allocated.
	 */
	error = hn_create_tx_data(sc, tx_ring_cnt);
	if (error)
		goto failed;
	error = hn_create_rx_data(sc, ring_cnt);
	if (error)
		goto failed;

	/*
	 * Create transaction context for NVS and RNDIS transactions.
	 */
	sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
	    HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
	if (sc->hn_xact == NULL) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Install orphan handler for the revocation of this device's
	 * primary channel.
	 *
	 * NOTE:
	 * The processing order is critical here:
	 * Install the orphan handler, _before_ testing whether this
	 * device's primary channel has been revoked or not.
	 */
	vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact);
	if (vmbus_chan_is_revoked(sc->hn_prichan)) {
		error = ENXIO;
		goto failed;
	}

	/*
	 * Attach the synthetic parts, i.e. NVS and RNDIS.
	 */
	error = hn_synth_attach(sc, ETHERMTU);
	if (error)
		goto failed;

	error = hn_rndis_get_eaddr(sc, eaddr);
	if (error)
		goto failed;

#if __FreeBSD_version >= 1100099
	if (sc->hn_rx_ring_inuse > 1) {
		/*
		 * Reduce TCP segment aggregation limit for multiple
		 * RX rings to increase ACK timeliness.
		 */
		hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
	}
#endif

	/*
	 * Fix up TX settings after the synthetic parts are attached.
	 */
	hn_fixup_tx_data(sc);
	ctx = device_get_sysctl_ctx(dev);
	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
	    &sc->hn_nvs_ver, 0, "NVS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_ndis_version_sysctl, "A", "NDIS version");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_caps_sysctl, "A", "capabilities");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_hwassist_sysctl, "A", "hwassist");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rxfilter_sysctl, "A", "rxfilter");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash",
	    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_hash_sysctl, "A", "RSS hash");
	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size",
	    CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_key_sysctl, "IU", "RSS key");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind",
	    CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_rss_ind_sysctl, "IU", "RSS indirect table");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size",
	    CTLFLAG_RD, &sc->hn_rndis_agg_size, 0,
	    "RNDIS offered packet transmission aggregation size limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts",
	    CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0,
	    "RNDIS offered packet transmission aggregation count limit");
	SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align",
	    CTLFLAG_RD, &sc->hn_rndis_agg_align, 0,
	    "RNDIS packet transmission aggregation alignment");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_size_sysctl, "I",
	    "Packet transmission aggregation size, 0 -- disable, -1 -- auto");
	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts",
	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
	    hn_txagg_pkts_sysctl, "I",
	    "Packet transmission aggregation packets, "
	    "0 -- disable, -1 -- auto");

	/*
	 * Setup the ifmedia, which has been initialized earlier.
	 */
	ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
	/* XXX ifmedia_set really should do this for us */
	sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

	/*
	 * Setup the ifnet for this interface.
	 */

#ifdef __LP64__
	ifp->if_baudrate = IF_Gbps(10);
#else
	/* if_baudrate is 32bits on 32bit system. */
	ifp->if_baudrate = IF_Gbps(1);
#endif
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_ioctl = hn_ioctl;
	ifp->if_init = hn_init;
#ifdef HN_IFSTART_SUPPORT
	if (hn_use_if_start) {
		int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

		ifp->if_start = hn_start;
		IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
		ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
		IFQ_SET_READY(&ifp->if_snd);
	} else
#endif
	{
		ifp->if_transmit = hn_transmit;
		ifp->if_qflush = hn_xmit_qflush;
	}

	ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
#ifdef foo
	/* We can't diff IPv6 packets from IPv4 packets on RX path. */
	ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
#endif
	if (sc->hn_caps & HN_CAP_VLAN) {
		/* XXX not sure about VLAN_MTU. */
		ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU;
	}

	ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist;
	if (ifp->if_hwassist & HN_CSUM_IP_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM;
	if (ifp->if_hwassist & HN_CSUM_IP6_MASK)
		ifp->if_capabilities |= IFCAP_TXCSUM_IPV6;
	if (sc->hn_caps & HN_CAP_TSO4) {
		ifp->if_capabilities |= IFCAP_TSO4;
		ifp->if_hwassist |= CSUM_IP_TSO;
	}
	if (sc->hn_caps & HN_CAP_TSO6) {
		ifp->if_capabilities |= IFCAP_TSO6;
		ifp->if_hwassist |= CSUM_IP6_TSO;
	}

	/* Enable all available capabilities by default. */
	ifp->if_capenable = ifp->if_capabilities;

	/*
	 * Disable IPv6 TSO and TXCSUM by default, they still can
	 * be enabled through SIOCSIFCAP.
	 */
	ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6);
	ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);

	if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
		hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
		ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
		ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
	}

	ether_ifattach(ifp, eaddr);

	if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) {
		if_printf(ifp, "TSO segcnt %u segsz %u\n",
		    ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
	}

	/* Inform the upper layer about the long frame support. */
	ifp->if_hdrlen = sizeof(struct ether_vlan_header);

	/*
	 * Kick off link status check.
	 */
	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
	hn_update_link_status(sc);

	return (0);
failed:
	if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)
		hn_synth_detach(sc);
	hn_detach(dev);
	return (error);
}
static int
hn_detach(device_t dev)
{
	struct hn_softc *sc = device_get_softc(dev);
	struct ifnet *ifp = sc->hn_ifp;

	if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
		/*
		 * In case the vmbus missed the orphan handler
		 * installation.
		 */
		vmbus_xact_ctx_orphan(sc->hn_xact);
	}

	if (device_is_attached(dev)) {
		HN_LOCK(sc);
		if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
				hn_stop(sc);
			/*
			 * NOTE:
			 * hn_stop() only suspends data, so management
			 * tasks have to be suspended manually here.
			 */
			hn_suspend_mgmt(sc);
			hn_synth_detach(sc);
		}
		HN_UNLOCK(sc);
		ether_ifdetach(ifp);
	}

	ifmedia_removeall(&sc->hn_media);
	hn_destroy_rx_data(sc);
	hn_destroy_tx_data(sc);

	if (sc->hn_tx_taskq != hn_tx_taskq)
		taskqueue_free(sc->hn_tx_taskq);
	taskqueue_free(sc->hn_mgmt_taskq0);

	if (sc->hn_xact != NULL) {
		/*
		 * Uninstall the orphan handler _before_ the xact is
		 * destroyed.
		 */
		vmbus_chan_unset_orphan(sc->hn_prichan);
		vmbus_xact_ctx_destroy(sc->hn_xact);
	}

	if_free(ifp);

	HN_LOCK_DESTROY(sc);
	return (0);
}

static int
hn_shutdown(device_t dev)
{

	return (0);
}
static void
hn_link_status(struct hn_softc *sc)
{
	uint32_t link_status;
	int error;

	error = hn_rndis_get_linkstatus(sc, &link_status);
	if (error) {
		/* XXX what to do? */
		return;
	}

	if (link_status == NDIS_MEDIA_STATE_CONNECTED)
		sc->hn_link_flags |= HN_LINK_FLAG_LINKUP;
	else
		sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp,
	    (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ?
	    LINK_STATE_UP : LINK_STATE_DOWN);
}

static void
hn_link_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		return;
	hn_link_status(sc);
}

static void
hn_netchg_init_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Prevent any link status checks from running. */
	sc->hn_link_flags |= HN_LINK_FLAG_NETCHG;

	/*
	 * Fake up a [link down --> link up] state change; a 5 second
	 * delay is used, which closely simulates the miibus reaction
	 * upon a link down event.
	 */
	sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP;
	if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN);
	taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0,
	    &sc->hn_netchg_status, 5 * hz);
}

static void
hn_netchg_status_taskfunc(void *xsc, int pending __unused)
{
	struct hn_softc *sc = xsc;

	/* Re-allow link status checks. */
	sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG;
	hn_link_status(sc);
}

static void
hn_update_link_status(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task);
}

static void
hn_change_network(struct hn_softc *sc)
{

	if (sc->hn_mgmt_taskq != NULL)
		taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init);
}
static int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
	struct mbuf *m = *m_head;
	int error;

	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim"));

	error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
	    m, segs, nsegs, BUS_DMA_NOWAIT);
	if (error == EFBIG) {
		struct mbuf *m_new;

		m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
		if (m_new == NULL)
			return ENOBUFS;
		else
			*m_head = m = m_new;
		txr->hn_tx_collapsed++;

		error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
		    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
	}
	if (!error) {
		bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
		    BUS_DMASYNC_PREWRITE);
		txd->flags |= HN_TXD_FLAG_DMAMAP;
	}
	return error;
}

static int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{

	KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
	    ("put an onlist txd %#x", txd->flags));
	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("put an onagg txd %#x", txd->flags));

	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	if (atomic_fetchadd_int(&txd->refs, -1) != 1)
		return 0;

	if (!STAILQ_EMPTY(&txd->agg_list)) {
		struct hn_txdesc *tmp_txd;

		while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) {
			int freed;

			KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list),
			    ("recursive aggregation on aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG),
			    ("not aggregated txdesc"));
			KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
			    ("aggregated txdesc uses dmamap"));
			KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
			    ("aggregated txdesc consumes "
			     "chimney sending buffer"));
			KASSERT(tmp_txd->chim_size == 0,
			    ("aggregated txdesc has non-zero "
			     "chimney sending size"));

			STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link);
			tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG;
			freed = hn_txdesc_put(txr, tmp_txd);
			KASSERT(freed, ("failed to free aggregated txdesc"));
		}
	}

	if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) {
		KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
		    ("chim txd uses dmamap"));
		hn_chim_free(txr->hn_sc, txd->chim_index);
		txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
		txd->chim_size = 0;
	} else if (txd->flags & HN_TXD_FLAG_DMAMAP) {
		bus_dmamap_sync(txr->hn_tx_data_dtag,
		    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
		bus_dmamap_unload(txr->hn_tx_data_dtag,
		    txd->data_dmap);
		txd->flags &= ~HN_TXD_FLAG_DMAMAP;
	}

	if (txd->m != NULL) {
		m_freem(txd->m);
		txd->m = NULL;
	}

	txd->flags |= HN_TXD_FLAG_ONLIST;
#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	KASSERT(txr->hn_txdesc_avail >= 0 &&
	    txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
	    ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
	txr->hn_txdesc_avail++;
	SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	atomic_add_int(&txr->hn_txdesc_avail, 1);
	buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif

	return 1;
}
static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	txd = SLIST_FIRST(&txr->hn_txlist);
	if (txd != NULL) {
		KASSERT(txr->hn_txdesc_avail > 0,
		    ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
		txr->hn_txdesc_avail--;
		SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
	}
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

	if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
		atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
		KASSERT(txd->m == NULL && txd->refs == 0 &&
		    STAILQ_EMPTY(&txd->agg_list) &&
		    txd->chim_index == HN_NVS_CHIM_IDX_INVALID &&
		    txd->chim_size == 0 &&
		    (txd->flags & HN_TXD_FLAG_ONLIST) &&
		    (txd->flags & HN_TXD_FLAG_ONAGG) == 0 &&
		    (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd"));
		txd->flags &= ~HN_TXD_FLAG_ONLIST;
		txd->refs = 1;
	}
	return txd;
}

static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

	/* 0->1 transition will never work */
	KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
	atomic_add_int(&txd->refs, 1);
}

static __inline void
hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd)
{

	KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("recursive aggregation on aggregating txdesc"));

	KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0,
	    ("already aggregated"));
	KASSERT(STAILQ_EMPTY(&txd->agg_list),
	    ("recursive aggregation on to-be-aggregated txdesc"));

	txd->flags |= HN_TXD_FLAG_ONAGG;
	STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link);
}
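/*
 * Aggregation lifecycle sketch (editorial): hn_try_txagg() links
 * follow-up txdescs onto the first (aggregating) txdesc via
 * hn_txdesc_agg(); only the parent owns a chimney buffer and is handed
 * to the host.  When the parent's send completion runs, hn_txdesc_put()
 * walks agg_list and frees every aggregated child before the parent
 * itself is recycled.
 */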
static bool
hn_tx_ring_pending(struct hn_tx_ring *txr)
{
	bool pending = false;

#ifndef HN_USE_TXDESC_BUFRING
	mtx_lock_spin(&txr->hn_txlist_spin);
	if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt)
		pending = true;
	mtx_unlock_spin(&txr->hn_txlist_spin);
#else
	if (!buf_ring_full(txr->hn_txdesc_br))
		pending = true;
#endif
	return (pending);
}

static __inline void
hn_txeof(struct hn_tx_ring *txr)
{
	txr->hn_has_txeof = 0;
	txr->hn_txeof(txr);
}

static void
hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
	struct hn_txdesc *txd = sndc->hn_cbarg;
	struct hn_tx_ring *txr;

	txr = txd->txr;
	KASSERT(txr->hn_chan == chan,
	    ("channel mismatch, on chan%u, should be chan%u",
	     vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan)));

	txr->hn_has_txeof = 1;
	hn_txdesc_put(txr, txd);

	++txr->hn_txdone_cnt;
	if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
		txr->hn_txdone_cnt = 0;
		if (txr->hn_oactive)
			hn_txeof(txr);
	}
}

static void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
	struct lro_ctrl *lro = &rxr->hn_lro;
	struct lro_entry *queued;

	while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
		SLIST_REMOVE_HEAD(&lro->lro_active, next);
		tcp_lro_flush(lro, queued);
	}
#endif

	/*
	 * NOTE:
	 * 'txr' could be NULL, if multiple channels and the
	 * ifnet.if_start method are enabled.
	 */
	if (txr == NULL || !txr->hn_has_txeof)
		return;

	txr->hn_txdone_cnt = 0;
	hn_txeof(txr);
}

static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{

	KASSERT(ofs >= sizeof(struct rndis_packet_msg),
	    ("invalid RNDIS packet msg offset %u", ofs));
	return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}

static __inline void *
hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
    size_t pi_dlen, uint32_t pi_type)
{
	const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen);
	struct rndis_pktinfo *pi;

	KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0,
	    ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen));

	/*
	 * Per-packet-info does not move; it only grows.
	 *
	 * NOTE:
	 * rm_pktinfooffset in this phase counts from the beginning
	 * of rndis_packet_msg.
	 */
	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize,
	    ("%u pktinfo overflows RNDIS packet msg", pi_type));
	pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset +
	    pkt->rm_pktinfolen);
	pkt->rm_pktinfolen += pi_size;

	pi->rm_size = pi_size;
	pi->rm_type = pi_type;
	pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET;

	/* Data immediately follow per-packet-info. */
	pkt->rm_dataoffset += pi_size;

	/* Update RNDIS packet msg length */
	pkt->rm_len += pi_size;

	return (pi->rm_data);
}
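/*
 * Illustrative layout after two hn_rndis_pktinfo_append() calls
 * (e.g. hash value, then VLAN info), before the offsets are converted
 * by hn_rndis_pktmsg_offset():
 *
 *	+------------------------+ rm_pktinfooffset == sizeof(*pkt)
 *	| rndis_packet_msg       |
 *	+------------------------+
 *	| pktinfo: HASHVAL       |
 *	+------------------------+
 *	| pktinfo: VLAN          |
 *	+------------------------+ rm_dataoffset (grown by each pi_size)
 *	| packet data ...        |
 *	+------------------------+
 *
 * Each append grows rm_pktinfolen, rm_dataoffset and rm_len by pi_size
 * and returns pi->rm_data for the caller to fill in.
 */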
static __inline int
hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr)
{
	struct hn_txdesc *txd;
	struct mbuf *m;
	int error, pkts;

	txd = txr->hn_agg_txd;
	KASSERT(txd != NULL, ("no aggregate txdesc"));

	/*
	 * Since hn_txpkt() will reset this temporary stat, save
	 * it now, so that oerrors can be updated properly, if
	 * hn_txpkt() ever fails.
	 */
	pkts = txr->hn_stat_pkts;

	/*
	 * Since txd's mbuf will _not_ be freed upon hn_txpkt()
	 * failure, save it for later freeing, if hn_txpkt() ever
	 * fails.
	 */
	m = txd->m;
	error = hn_txpkt(ifp, txr, txd);
	if (__predict_false(error)) {
		/* txd is freed, but m is not. */
		m_freem(m);

		txr->hn_flush_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts);
	}

	/* Reset all aggregation states. */
	txr->hn_agg_txd = NULL;
	txr->hn_agg_szleft = 0;
	txr->hn_agg_pktleft = 0;
	txr->hn_agg_prevpkt = NULL;

	return (error);
}

static void *
hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    int pktsize)
{
	void *chim;

	if (txr->hn_agg_txd != NULL) {
		if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) {
			struct hn_txdesc *agg_txd = txr->hn_agg_txd;
			struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt;
			int olen;

			/*
			 * Update the previous RNDIS packet's total length;
			 * it can be increased due to the mandatory alignment
			 * padding for this RNDIS packet.  And update the
			 * aggregating txdesc's chimney sending buffer size
			 * accordingly.
			 *
			 * XXX
			 * Zero-out the padding, as required by the RNDIS spec.
			 */
			olen = pkt->rm_len;
			pkt->rm_len = roundup2(olen, txr->hn_agg_align);
			agg_txd->chim_size += pkt->rm_len - olen;

			/* Link this txdesc to the parent. */
			hn_txdesc_agg(agg_txd, txd);

			chim = (uint8_t *)pkt + pkt->rm_len;
			/* Save the current packet for later fixup. */
			txr->hn_agg_prevpkt = chim;

			txr->hn_agg_pktleft--;
			txr->hn_agg_szleft -= pktsize;
			if (txr->hn_agg_szleft <=
			    HN_PKTSIZE_MIN(txr->hn_agg_align)) {
				/*
				 * Probably can't aggregate more packets,
				 * flush this aggregating txdesc proactively.
				 */
				txr->hn_agg_pktleft = 0;
			}
			/* Done! */
			return (chim);
		}
		hn_flush_txagg(ifp, txr);
	}
	KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));

	txr->hn_tx_chimney_tried++;
	txd->chim_index = hn_chim_alloc(txr->hn_sc);
	if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID)
		return (NULL);
	txr->hn_tx_chimney++;

	chim = txr->hn_sc->hn_chim +
	    (txd->chim_index * txr->hn_sc->hn_chim_szmax);

	if (txr->hn_agg_pktmax > 1 &&
	    txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) {
		txr->hn_agg_txd = txd;
		txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1;
		txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize;
		txr->hn_agg_prevpkt = chim;
	}
	return (chim);
}
/*
 * NOTE:
 * If this function fails, then both txd and m_head0 will be freed.
 */
static int
hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head0)
{
	bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
	int error, nsegs, i;
	struct mbuf *m_head = *m_head0;
	struct rndis_packet_msg *pkt;
	uint32_t *pi_data;
	void *chim = NULL;
	int pkt_hlen, pkt_size;

	pkt = txd->rndis_pkt;
	pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align);
	if (pkt_size < txr->hn_chim_size) {
		chim = hn_try_txagg(ifp, txr, txd, pkt_size);
		if (chim != NULL)
			pkt = chim;
	} else {
		if (txr->hn_agg_txd != NULL)
			hn_flush_txagg(ifp, txr);
	}

	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
	pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
	pkt->rm_dataoffset = sizeof(*pkt);
	pkt->rm_datalen = m_head->m_pkthdr.len;
	pkt->rm_oobdataoffset = 0;
	pkt->rm_oobdatalen = 0;
	pkt->rm_oobdataelements = 0;
	pkt->rm_pktinfooffset = sizeof(*pkt);
	pkt->rm_pktinfolen = 0;
	pkt->rm_vchandle = 0;
	pkt->rm_reserved = 0;

	if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) {
		/*
		 * Set the hash value for this packet, so that the host could
		 * dispatch the TX done event for this packet back to this TX
		 * ring's channel.
		 */
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
		*pi_data = txr->hn_tx_idx;
	}

	if (m_head->m_flags & M_VLANTAG) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
		*pi_data = NDIS_VLAN_INFO_MAKE(
		    EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
		    EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
	}

	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
#ifdef INET
		if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
			*pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#if defined(INET6) && defined(INET)
		else
#endif
#ifdef INET6
		{
			*pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
			    m_head->m_pkthdr.tso_segsz);
		}
#endif
#endif	/* INET6 || INET */
	} else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
		pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
		if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP6_TCP | CSUM_IP6_UDP)) {
			*pi_data = NDIS_TXCSUM_INFO_IPV6;
		} else {
			*pi_data = NDIS_TXCSUM_INFO_IPV4;
			if (m_head->m_pkthdr.csum_flags & CSUM_IP)
				*pi_data |= NDIS_TXCSUM_INFO_IPCS;
		}

		if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP))
			*pi_data |= NDIS_TXCSUM_INFO_TCPCS;
		else if (m_head->m_pkthdr.csum_flags &
		    (CSUM_IP_UDP | CSUM_IP6_UDP))
			*pi_data |= NDIS_TXCSUM_INFO_UDPCS;
	}

	pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
	/* Convert RNDIS packet message offsets */
	pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
	pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);

	/*
	 * Fast path: Chimney sending.
	 */
	if (chim != NULL) {
		struct hn_txdesc *tgt_txd = txd;

		if (txr->hn_agg_txd != NULL) {
			tgt_txd = txr->hn_agg_txd;
#ifdef INVARIANTS
			*m_head0 = NULL;
#endif
		}

		KASSERT(pkt == chim,
		    ("RNDIS pkt not in chimney sending buffer"));
		KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID,
		    ("chimney sending buffer is not used"));
		tgt_txd->chim_size += pkt->rm_len;

		m_copydata(m_head, 0, m_head->m_pkthdr.len,
		    ((uint8_t *)chim) + pkt_hlen);

		txr->hn_gpa_cnt = 0;
		txr->hn_sendpkt = hn_txpkt_chim;
		goto done;
	}

	KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc"));
	KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID,
	    ("chimney buffer is used"));
	KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc"));

	error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
	if (__predict_false(error)) {
		int freed;

		/*
		 * This mbuf is not linked w/ the txd yet, so free it now.
		 */
		m_freem(m_head);
		*m_head0 = NULL;

		freed = hn_txdesc_put(txr, txd);
		KASSERT(freed != 0,
		    ("fail to free txd upon txdma error"));

		txr->hn_txdma_failed++;
		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
		return error;
	}
	*m_head0 = m_head;

	/* +1 RNDIS packet message */
	txr->hn_gpa_cnt = nsegs + 1;

	/* send packet with page buffer */
	txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
	txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
	txr->hn_gpa[0].gpa_len = pkt_hlen;

	/*
	 * Fill the page buffers with mbuf info after the page
	 * buffer for RNDIS packet message.
	 */
	for (i = 0; i < nsegs; ++i) {
		struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];

		gpa->gpa_page = atop(segs[i].ds_addr);
		gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
		gpa->gpa_len = segs[i].ds_len;
	}

	txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
	txd->chim_size = 0;
	txr->hn_sendpkt = hn_txpkt_sglist;
done:
	txd->m = m_head;

	/* Set the completion routine */
	hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd);

	/* Update temporary stats for later use. */
	txr->hn_stat_pkts++;
	txr->hn_stat_size += m_head->m_pkthdr.len;
	if (m_head->m_flags & M_MCAST)
		txr->hn_stat_mcasts++;

	return 0;
}
/*
 * NOTE:
 * If this function fails, then txd will be freed, but the mbuf
 * associated w/ the txd will _not_ be freed.
 */
static int
hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
	int error, send_failed = 0;

again:
	/*
	 * Make sure that this txd and any aggregated txds are not freed
	 * before ETHER_BPF_MTAP.
	 */
	hn_txdesc_hold(txd);
	error = txr->hn_sendpkt(txr, txd);
	if (!error) {
		if (bpf_peers_present(ifp->if_bpf)) {
			const struct hn_txdesc *tmp_txd;

			ETHER_BPF_MTAP(ifp, txd->m);
			STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link)
				ETHER_BPF_MTAP(ifp, tmp_txd->m);
		}

		if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts);
#ifdef HN_IFSTART_SUPPORT
		if (!hn_use_if_start)
#endif
		{
			if_inc_counter(ifp, IFCOUNTER_OBYTES,
			    txr->hn_stat_size);
			if (txr->hn_stat_mcasts != 0) {
				if_inc_counter(ifp, IFCOUNTER_OMCASTS,
				    txr->hn_stat_mcasts);
			}
		}
		txr->hn_pkts += txr->hn_stat_pkts;
	}

	hn_txdesc_put(txr, txd);

	if (__predict_false(error)) {
		int freed;

		/*
		 * This should "really rarely" happen.
		 *
		 * XXX Too many RX to be acked or too many sideband
		 * commands to run?  Ask netvsc_channel_rollup()
		 * to kick start later.
		 */
		txr->hn_has_txeof = 1;
		if (!send_failed) {
			txr->hn_send_failed++;
			send_failed = 1;
			/*
			 * Try sending again after setting hn_has_txeof,
			 * in case we missed the last
			 * netvsc_channel_rollup().
			 */
			goto again;
		}
		if_printf(ifp, "send failed\n");

		/*
		 * Caller will perform further processing on the
		 * associated mbuf, so don't free it in hn_txdesc_put();
		 * only unload it from the DMA map in hn_txdesc_put(),
		 * if it was loaded.
		 */
		txd->m = NULL;
		freed = hn_txdesc_put(txr, txd);
		KASSERT(freed != 0,
		    ("fail to free txd upon send error"));

		txr->hn_send_failed++;
	}

	/* Reset temporary stats, after this sending is done. */
	txr->hn_stat_size = 0;
	txr->hn_stat_pkts = 0;
	txr->hn_stat_mcasts = 0;

	return (error);
}
/*
 * Append the specified data to the indicated mbuf chain; extend the
 * mbuf chain if the new data does not fit in existing space.
 *
 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
 * There should be an equivalent in the kernel mbuf code,
 * but there does not appear to be one yet.
 *
 * Differs from m_append() in that additional mbufs are
 * allocated with cluster size MJUMPAGESIZE, and filled
 * accordingly.
 *
 * Return 1 if able to complete the job; otherwise 0.
 */
static int
hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
{
	struct mbuf *m, *n;
	int remainder, space;

	for (m = m0; m->m_next != NULL; m = m->m_next)
		;
	remainder = len;
	space = M_TRAILINGSPACE(m);
	if (space > 0) {
		/*
		 * Copy into available space.
		 */
		if (space > remainder)
			space = remainder;
		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
		m->m_len += space;
		cp += space;
		remainder -= space;
	}
	while (remainder > 0) {
		/*
		 * Allocate a new mbuf; could check space
		 * and allocate a cluster instead.
		 */
		n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
		if (n == NULL)
			break;
		n->m_len = min(MJUMPAGESIZE, remainder);
		bcopy(cp, mtod(n, caddr_t), n->m_len);
		cp += n->m_len;
		remainder -= n->m_len;
		m->m_next = n;
		m = n;
	}
	if (m0->m_flags & M_PKTHDR)
		m0->m_pkthdr.len += len - remainder;

	return (remainder == 0);
}
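/*
 * Typical usage (as in hn_rxpkt() below): copy a received packet that
 * does not fit in a single mbuf into a freshly allocated cluster chain,
 * assuming m_new was obtained from m_getjcl():
 *
 *	if (!hv_m_append(m_new, dlen, data))
 *		... drop the packet ...
 *
 * The return-value check is an illustrative sketch; the actual caller
 * below ignores the return value.
 */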
#if defined(INET) || defined(INET6)
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
	if (hn_lro_mbufq_depth) {
		tcp_lro_queue_mbuf(lc, m);
		return 0;
	}
#endif
	return tcp_lro_rx(lc, m, 0);
}
#endif
2094 hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
2095 const struct hn_rxinfo *info)
2097 struct ifnet *ifp = rxr->hn_ifp;
2099 int size, do_lro = 0, do_csum = 1;
2100 int hash_type = M_HASHTYPE_OPAQUE;
2102 if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
2106 * Bail out if packet contains more data than configured MTU.
2108 if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
2110 } else if (dlen <= MHLEN) {
2111 m_new = m_gethdr(M_NOWAIT, MT_DATA);
2112 if (m_new == NULL) {
2113 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2116 memcpy(mtod(m_new, void *), data, dlen);
2117 m_new->m_pkthdr.len = m_new->m_len = dlen;
2118 rxr->hn_small_pkts++;
2121 * Get an mbuf with a cluster. For packets 2K or less,
2122 * get a standard 2K cluster. For anything larger, get a
2123 * 4K cluster. Any buffers larger than 4K can cause problems
2124 * if looped around to the Hyper-V TX channel, so avoid them.
2127 if (dlen > MCLBYTES) {
2129 size = MJUMPAGESIZE;
2132 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
2133 if (m_new == NULL) {
2134 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
2138 hv_m_append(m_new, dlen, data);
2140 m_new->m_pkthdr.rcvif = ifp;
2142 if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
2145 /* receive side checksum offload */
2146 if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
2147 /* IP csum offload */
2148 if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
2149 m_new->m_pkthdr.csum_flags |=
2150 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2154 /* TCP/UDP csum offload */
2155 if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
2156 NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
2157 m_new->m_pkthdr.csum_flags |=
2158 (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2159 m_new->m_pkthdr.csum_data = 0xffff;
2160 if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
2168 * As of this write (Oct 28th, 2016), host side will turn
2169 * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so
2170 * the do_lro setting here is actually _not_ accurate. We
2171 * depend on the RSS hash type check to reset do_lro.
2173 if ((info->csum_info &
2174 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
2175 (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
2178 const struct ether_header *eh;
2183 if (m_new->m_len < hoff)
2185 eh = mtod(m_new, struct ether_header *);
2186 etype = ntohs(eh->ether_type);
2187 if (etype == ETHERTYPE_VLAN) {
2188 const struct ether_vlan_header *evl;
2190 hoff = sizeof(*evl);
2191 if (m_new->m_len < hoff)
2193 evl = mtod(m_new, struct ether_vlan_header *);
2194 etype = ntohs(evl->evl_proto);
2197 if (etype == ETHERTYPE_IP) {
2200 pr = hn_check_iplen(m_new, hoff);
2201 if (pr == IPPROTO_TCP) {
2203 (rxr->hn_trust_hcsum &
2204 HN_TRUST_HCSUM_TCP)) {
2205 rxr->hn_csum_trusted++;
2206 m_new->m_pkthdr.csum_flags |=
2207 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2208 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2209 m_new->m_pkthdr.csum_data = 0xffff;
2212 } else if (pr == IPPROTO_UDP) {
2214 (rxr->hn_trust_hcsum &
2215 HN_TRUST_HCSUM_UDP)) {
2216 rxr->hn_csum_trusted++;
2217 m_new->m_pkthdr.csum_flags |=
2218 (CSUM_IP_CHECKED | CSUM_IP_VALID |
2219 CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
2220 m_new->m_pkthdr.csum_data = 0xffff;
2222 } else if (pr != IPPROTO_DONE && do_csum &&
2223 (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
2224 rxr->hn_csum_trusted++;
2225 m_new->m_pkthdr.csum_flags |=
2226 (CSUM_IP_CHECKED | CSUM_IP_VALID);
2231 if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
2232 m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
2233 NDIS_VLAN_INFO_ID(info->vlan_info),
2234 NDIS_VLAN_INFO_PRI(info->vlan_info),
2235 NDIS_VLAN_INFO_CFI(info->vlan_info));
2236 m_new->m_flags |= M_VLANTAG;
2239 if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
2241 m_new->m_pkthdr.flowid = info->hash_value;
2242 if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
2243 NDIS_HASH_FUNCTION_TOEPLITZ) {
2244 uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);
2248 * do_lro is reset if the hash types are not TCP
2249 * related. See the comment in the above csum_flags
2253 case NDIS_HASH_IPV4:
2254 hash_type = M_HASHTYPE_RSS_IPV4;
2258 case NDIS_HASH_TCP_IPV4:
2259 hash_type = M_HASHTYPE_RSS_TCP_IPV4;
2262 case NDIS_HASH_IPV6:
2263 hash_type = M_HASHTYPE_RSS_IPV6;
2267 case NDIS_HASH_IPV6_EX:
2268 hash_type = M_HASHTYPE_RSS_IPV6_EX;
2272 case NDIS_HASH_TCP_IPV6:
2273 hash_type = M_HASHTYPE_RSS_TCP_IPV6;
2276 case NDIS_HASH_TCP_IPV6_EX:
2277 hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
2282 m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
2284 M_HASHTYPE_SET(m_new, hash_type);
2287 * Note: Moved RX completion back to hv_nv_on_receive() so all
2288 * messages (not just data messages) will trigger a response.
2294 if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
2295 #if defined(INET) || defined(INET6)
2296 struct lro_ctrl *lro = &rxr->hn_lro;
2299 rxr->hn_lro_tried++;
2300 if (hn_lro_rx(lro, m_new) == 0) {
2308 /* We're not holding the lock here, so don't release it */
2309 (*ifp->if_input)(ifp, m_new);
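/*
 * Informal recap of hn_rxpkt() above: small packets are copied into a
 * plain header mbuf, larger ones into a 2K or 4K cluster; host
 * checksum, VLAN and RSS hash metadata are translated into mbuf flags
 * and pkthdr fields; TCP-looking packets may be fed to LRO, everything
 * else goes straight to if_input().
 */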
2315 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
2317 struct hn_softc *sc = ifp->if_softc;
2318 struct ifreq *ifr = (struct ifreq *)data;
2319 int mask, error = 0;
2323 if (ifr->ifr_mtu > HN_MTU_MAX) {
2330 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2335 if ((sc->hn_caps & HN_CAP_MTU) == 0) {
2336 /* Can't change MTU */
2342 if (ifp->if_mtu == ifr->ifr_mtu) {
2348 * Suspend this interface before the synthetic parts
2349 * are detached.
2354 * Detach the synthetic parts, i.e. NVS and RNDIS.
2356 hn_synth_detach(sc);
2359 * Reattach the synthetic parts, i.e. NVS and RNDIS,
2360 * with the new MTU setting.
2362 error = hn_synth_attach(sc, ifr->ifr_mtu);
2369 * Commit the requested MTU, after the synthetic parts
2370 * have been successfully attached.
2372 ifp->if_mtu = ifr->ifr_mtu;
2375 * Make sure that various parameters based on MTU are
2376 * still valid, after the MTU change.
2378 if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
2379 hn_set_chim_size(sc, sc->hn_chim_szmax);
2380 hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
2381 #if __FreeBSD_version >= 1100099
2382 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
2383 HN_LRO_LENLIM_MIN(ifp))
2384 hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
2388 * All done! Resume the interface now.
2398 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2403 if (ifp->if_flags & IFF_UP) {
2404 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2406 * Caller might hold a mutex, e.g.
2407 * bpf; use busy-wait for the RNDIS reply.
2411 hn_set_rxfilter(sc);
2417 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2420 sc->hn_if_flags = ifp->if_flags;
2427 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2429 if (mask & IFCAP_TXCSUM) {
2430 ifp->if_capenable ^= IFCAP_TXCSUM;
2431 if (ifp->if_capenable & IFCAP_TXCSUM)
2432 ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc);
2434 ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc);
2436 if (mask & IFCAP_TXCSUM_IPV6) {
2437 ifp->if_capenable ^= IFCAP_TXCSUM_IPV6;
2438 if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
2439 ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc);
2441 ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc);
2444 /* TODO: flip RNDIS offload parameters for RXCSUM. */
2445 if (mask & IFCAP_RXCSUM)
2446 ifp->if_capenable ^= IFCAP_RXCSUM;
2448 /* We can't diff IPv6 packets from IPv4 packets on RX path. */
2449 if (mask & IFCAP_RXCSUM_IPV6)
2450 ifp->if_capenable ^= IFCAP_RXCSUM_IPV6;
2453 if (mask & IFCAP_LRO)
2454 ifp->if_capenable ^= IFCAP_LRO;
2456 if (mask & IFCAP_TSO4) {
2457 ifp->if_capenable ^= IFCAP_TSO4;
2458 if (ifp->if_capenable & IFCAP_TSO4)
2459 ifp->if_hwassist |= CSUM_IP_TSO;
2461 ifp->if_hwassist &= ~CSUM_IP_TSO;
2463 if (mask & IFCAP_TSO6) {
2464 ifp->if_capenable ^= IFCAP_TSO6;
2465 if (ifp->if_capenable & IFCAP_TSO6)
2466 ifp->if_hwassist |= CSUM_IP6_TSO;
2468 ifp->if_hwassist &= ~CSUM_IP6_TSO;
2478 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) {
2482 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2484 * Multicast uses mutex; use busy-wait for
2488 hn_set_rxfilter(sc);
2497 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
2501 error = ether_ioctl(ifp, cmd, data);
2508 hn_stop(struct hn_softc *sc)
2510 struct ifnet *ifp = sc->hn_ifp;
2515 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
2516 ("synthetic parts were not attached"));
2518 /* Clear RUNNING bit _before_ hn_suspend_data() */
2519 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2520 hn_suspend_data(sc);
2522 /* Clear OACTIVE bit. */
2523 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2524 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2525 sc->hn_tx_ring[i].hn_oactive = 0;
2529 hn_init_locked(struct hn_softc *sc)
2531 struct ifnet *ifp = sc->hn_ifp;
2536 if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
2539 if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2542 /* Configure RX filter */
2543 hn_set_rxfilter(sc);
2545 /* Clear OACTIVE bit. */
2546 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
2547 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2548 sc->hn_tx_ring[i].hn_oactive = 0;
2550 /* Clear TX 'suspended' bit. */
2551 hn_resume_tx(sc, sc->hn_tx_ring_inuse);
2553 /* Everything is ready; unleash! */
2554 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
2560 struct hn_softc *sc = xsc;
2567 #if __FreeBSD_version >= 1100099
2570 hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
2572 struct hn_softc *sc = arg1;
2573 unsigned int lenlim;
2576 lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
2577 error = sysctl_handle_int(oidp, &lenlim, 0, req);
2578 if (error || req->newptr == NULL)
2582 if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
2583 lenlim > TCP_LRO_LENGTH_MAX) {
2587 hn_set_lro_lenlim(sc, lenlim);
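/*
 * Example (hypothetical unit number and value): this handler backs the
 * dev.hn.UNIT.lro_length_lim node registered in hn_create_rx_data(),
 * so the LRO length limit can be tuned at runtime, e.g.:
 *
 *	# sysctl dev.hn.0.lro_length_lim=32768
 */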
2594 hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
2596 struct hn_softc *sc = arg1;
2597 int ackcnt, error, i;
2600 * lro_ackcnt_lim is the append count limit;
2601 * +1 turns it into the aggregation limit.
2603 ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
2604 error = sysctl_handle_int(oidp, &ackcnt, 0, req);
2605 if (error || req->newptr == NULL)
2608 if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
2612 * Convert aggregation limit back to append
2613 * count limit.
2615 --ackcnt;
2617 for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
2618 sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
2626 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
2628 struct hn_softc *sc = arg1;
2633 if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
2636 error = sysctl_handle_int(oidp, &on, 0, req);
2637 if (error || req->newptr == NULL)
2641 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2642 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
2645 rxr->hn_trust_hcsum |= hcsum;
2647 rxr->hn_trust_hcsum &= ~hcsum;
2654 hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
2656 struct hn_softc *sc = arg1;
2657 int chim_size, error;
2659 chim_size = sc->hn_tx_ring[0].hn_chim_size;
2660 error = sysctl_handle_int(oidp, &chim_size, 0, req);
2661 if (error || req->newptr == NULL)
2664 if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
2668 hn_set_chim_size(sc, chim_size);
2673 #if __FreeBSD_version < 1100095
2675 hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
2677 struct hn_softc *sc = arg1;
2678 int ofs = arg2, i, error;
2679 struct hn_rx_ring *rxr;
2683 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2684 rxr = &sc->hn_rx_ring[i];
2685 stat += *((int *)((uint8_t *)rxr + ofs));
2688 error = sysctl_handle_64(oidp, &stat, 0, req);
2689 if (error || req->newptr == NULL)
2692 /* Zero out this stat. */
2693 for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
2694 rxr = &sc->hn_rx_ring[i];
2695 *((int *)((uint8_t *)rxr + ofs)) = 0;
2701 hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
2703 struct hn_softc *sc = arg1;
2704 int ofs = arg2, i, error;
2705 struct hn_rx_ring *rxr;
2709 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2710 rxr = &sc->hn_rx_ring[i];
2711 stat += *((uint64_t *)((uint8_t *)rxr + ofs));
2714 error = sysctl_handle_64(oidp, &stat, 0, req);
2715 if (error || req->newptr == NULL)
2718 /* Zero out this stat. */
2719 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2720 rxr = &sc->hn_rx_ring[i];
2721 *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
2729 hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2731 struct hn_softc *sc = arg1;
2732 int ofs = arg2, i, error;
2733 struct hn_rx_ring *rxr;
2737 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2738 rxr = &sc->hn_rx_ring[i];
2739 stat += *((u_long *)((uint8_t *)rxr + ofs));
2742 error = sysctl_handle_long(oidp, &stat, 0, req);
2743 if (error || req->newptr == NULL)
2746 /* Zero out this stat. */
2747 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
2748 rxr = &sc->hn_rx_ring[i];
2749 *((u_long *)((uint8_t *)rxr + ofs)) = 0;
2755 hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
2757 struct hn_softc *sc = arg1;
2758 int ofs = arg2, i, error;
2759 struct hn_tx_ring *txr;
2763 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2764 txr = &sc->hn_tx_ring[i];
2765 stat += *((u_long *)((uint8_t *)txr + ofs));
2768 error = sysctl_handle_long(oidp, &stat, 0, req);
2769 if (error || req->newptr == NULL)
2772 /* Zero out this stat. */
2773 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2774 txr = &sc->hn_tx_ring[i];
2775 *((u_long *)((uint8_t *)txr + ofs)) = 0;
2781 hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
2783 struct hn_softc *sc = arg1;
2784 int ofs = arg2, i, error, conf;
2785 struct hn_tx_ring *txr;
2787 txr = &sc->hn_tx_ring[0];
2788 conf = *((int *)((uint8_t *)txr + ofs));
2790 error = sysctl_handle_int(oidp, &conf, 0, req);
2791 if (error || req->newptr == NULL)
2795 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2796 txr = &sc->hn_tx_ring[i];
2797 *((int *)((uint8_t *)txr + ofs)) = conf;
2805 hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS)
2807 struct hn_softc *sc = arg1;
2810 size = sc->hn_agg_size;
2811 error = sysctl_handle_int(oidp, &size, 0, req);
2812 if (error || req->newptr == NULL)
2816 sc->hn_agg_size = size;
2824 hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS)
2826 struct hn_softc *sc = arg1;
2829 pkts = sc->hn_agg_pkts;
2830 error = sysctl_handle_int(oidp, &pkts, 0, req);
2831 if (error || req->newptr == NULL)
2835 sc->hn_agg_pkts = pkts;
2843 hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS)
2845 struct hn_softc *sc = arg1;
2848 pkts = sc->hn_tx_ring[0].hn_agg_pktmax;
2849 return (sysctl_handle_int(oidp, &pkts, 0, req));
2853 hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS)
2855 struct hn_softc *sc = arg1;
2858 align = sc->hn_tx_ring[0].hn_agg_align;
2859 return (sysctl_handle_int(oidp, &align, 0, req));
2863 hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
2865 struct hn_softc *sc = arg1;
2868 snprintf(verstr, sizeof(verstr), "%u.%u",
2869 HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
2870 HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
2871 return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
2875 hn_caps_sysctl(SYSCTL_HANDLER_ARGS)
2877 struct hn_softc *sc = arg1;
2884 snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS);
2885 return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req);
2889 hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS)
2891 struct hn_softc *sc = arg1;
2892 char assist_str[128];
2896 hwassist = sc->hn_ifp->if_hwassist;
2898 snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS);
2899 return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req);
2903 hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS)
2905 struct hn_softc *sc = arg1;
2906 char filter_str[128];
2910 filter = sc->hn_rx_filter;
2912 snprintf(filter_str, sizeof(filter_str), "%b", filter,
2914 return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req);
2918 hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS)
2920 struct hn_softc *sc = arg1;
2925 error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2926 if (error || req->newptr == NULL)
2929 error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key));
2932 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
2934 if (sc->hn_rx_ring_inuse > 1) {
2935 error = hn_rss_reconfig(sc);
2937 /* Not RSS capable, at least for now; just save the RSS key. */
2946 hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS)
2948 struct hn_softc *sc = arg1;
2953 error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2954 if (error || req->newptr == NULL)
2958 * Don't allow RSS indirect table changes if this interface is not
2959 * currently RSS capable.
2961 if (sc->hn_rx_ring_inuse == 1) {
2966 error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind));
2969 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
2971 hn_rss_ind_fixup(sc, sc->hn_rx_ring_inuse);
2972 error = hn_rss_reconfig(sc);
2979 hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS)
2981 struct hn_softc *sc = arg1;
2986 hash = sc->hn_rss_hash;
2988 snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS);
2989 return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req);
2993 hn_check_iplen(const struct mbuf *m, int hoff)
2995 const struct ip *ip;
2996 int len, iphlen, iplen;
2997 const struct tcphdr *th;
2998 int thoff; /* TCP data offset */
3000 len = hoff + sizeof(struct ip);
3002 /* The packet must be at least the size of an IP header. */
3003 if (m->m_pkthdr.len < len)
3004 return IPPROTO_DONE;
3006 /* The fixed IP header must reside completely in the first mbuf. */
3008 return IPPROTO_DONE;
3010 ip = mtodo(m, hoff);
3012 /* Bound check the packet's stated IP header length. */
3013 iphlen = ip->ip_hl << 2;
3014 if (iphlen < sizeof(struct ip)) /* minimum header length */
3015 return IPPROTO_DONE;
3017 /* The full IP header must reside completely in the one mbuf. */
3018 if (m->m_len < hoff + iphlen)
3019 return IPPROTO_DONE;
3021 iplen = ntohs(ip->ip_len);
3024 * Check that the amount of data in the buffers is at least
3025 * as much as the IP header would have us expect.
3027 if (m->m_pkthdr.len < hoff + iplen)
3028 return IPPROTO_DONE;
3031 * Ignore IP fragments.
3033 if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
3034 return IPPROTO_DONE;
3037 * The TCP/IP or UDP/IP header must be entirely contained within
3038 * the first fragment of a packet.
3042 if (iplen < iphlen + sizeof(struct tcphdr))
3043 return IPPROTO_DONE;
3044 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
3045 return IPPROTO_DONE;
3046 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
3047 thoff = th->th_off << 2;
3048 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
3049 return IPPROTO_DONE;
3050 if (m->m_len < hoff + iphlen + thoff)
3051 return IPPROTO_DONE;
3054 if (iplen < iphlen + sizeof(struct udphdr))
3055 return IPPROTO_DONE;
3056 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
3057 return IPPROTO_DONE;
3061 return IPPROTO_DONE;
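/*
 * Summary: hn_check_iplen() returns IPPROTO_TCP or IPPROTO_UDP only
 * when the complete, sane protocol header sits in the first mbuf;
 * anything else yields IPPROTO_DONE.  hn_rxpkt() keys off that return
 * value to decide which host-checksum trust flags may apply.
 */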
3068 hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
3070 struct sysctl_oid_list *child;
3071 struct sysctl_ctx_list *ctx;
3072 device_t dev = sc->hn_dev;
3073 #if defined(INET) || defined(INET6)
3074 #if __FreeBSD_version >= 1100095
3081 * Create RXBUF for reception.
3084 * - It is shared by all channels.
3085 * - A large enough buffer is allocated; certain versions of NVS
3086 * may further limit the usable space.
3088 sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3089 PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma,
3090 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3091 if (sc->hn_rxbuf == NULL) {
3092 device_printf(sc->hn_dev, "allocate rxbuf failed\n");
3096 sc->hn_rx_ring_cnt = ring_cnt;
3097 sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;
3099 sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
3100 M_DEVBUF, M_WAITOK | M_ZERO);
3102 #if defined(INET) || defined(INET6)
3103 #if __FreeBSD_version >= 1100095
3104 lroent_cnt = hn_lro_entry_count;
3105 if (lroent_cnt < TCP_LRO_ENTRIES)
3106 lroent_cnt = TCP_LRO_ENTRIES;
3108 device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
3110 #endif /* INET || INET6 */
3112 ctx = device_get_sysctl_ctx(dev);
3113 child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
3115 /* Create dev.hn.UNIT.rx sysctl tree */
3116 sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
3117 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3119 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3120 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3122 rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
3123 PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE,
3124 &rxr->hn_br_dma, BUS_DMA_WAITOK);
3125 if (rxr->hn_br == NULL) {
3126 device_printf(dev, "allocate bufring failed\n");
3130 if (hn_trust_hosttcp)
3131 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
3132 if (hn_trust_hostudp)
3133 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
3134 if (hn_trust_hostip)
3135 rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
3136 rxr->hn_ifp = sc->hn_ifp;
3137 if (i < sc->hn_tx_ring_cnt)
3138 rxr->hn_txr = &sc->hn_tx_ring[i];
3139 rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF;
3140 rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK);
3142 rxr->hn_rxbuf = sc->hn_rxbuf;
3147 #if defined(INET) || defined(INET6)
3148 #if __FreeBSD_version >= 1100095
3149 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
3150 hn_lro_mbufq_depth);
3152 tcp_lro_init(&rxr->hn_lro);
3153 rxr->hn_lro.ifp = sc->hn_ifp;
3155 #if __FreeBSD_version >= 1100099
3156 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
3157 rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
3159 #endif /* INET || INET6 */
3161 if (sc->hn_rx_sysctl_tree != NULL) {
3165 * Create per RX ring sysctl tree:
3166 * dev.hn.UNIT.rx.RINGID
3168 snprintf(name, sizeof(name), "%d", i);
3169 rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
3170 SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
3171 OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3173 if (rxr->hn_rx_sysctl_tree != NULL) {
3174 SYSCTL_ADD_ULONG(ctx,
3175 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3176 OID_AUTO, "packets", CTLFLAG_RW,
3177 &rxr->hn_pkts, "# of packets received");
3178 SYSCTL_ADD_ULONG(ctx,
3179 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3180 OID_AUTO, "rss_pkts", CTLFLAG_RW,
3182 "# of packets w/ RSS info received");
3184 SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
3185 OID_AUTO, "pktbuf_len", CTLFLAG_RD,
3186 &rxr->hn_pktbuf_len, 0,
3187 "Temporary channel packet buffer length");
3192 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
3193 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3194 __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
3195 #if __FreeBSD_version < 1100095
3196 hn_rx_stat_int_sysctl,
3198 hn_rx_stat_u64_sysctl,
3200 "LU", "LRO queued");
3201 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
3202 CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3203 __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
3204 #if __FreeBSD_version < 1100095
3205 hn_rx_stat_int_sysctl,
3207 hn_rx_stat_u64_sysctl,
3209 "LU", "LRO flushed");
3210 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
3211 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3212 __offsetof(struct hn_rx_ring, hn_lro_tried),
3213 hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
3214 #if __FreeBSD_version >= 1100099
3215 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
3216 CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3217 hn_lro_lenlim_sysctl, "IU",
3218 "Max # of data bytes to be aggregated by LRO");
3219 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
3220 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3221 hn_lro_ackcnt_sysctl, "I",
3222 "Max # of ACKs to be aggregated by LRO");
3224 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
3225 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
3226 hn_trust_hcsum_sysctl, "I",
3227 "Trust tcp segement verification on host side, "
3228 "when csum info is missing");
3229 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
3230 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
3231 hn_trust_hcsum_sysctl, "I",
3232 "Trust udp datagram verification on host side, "
3233 "when csum info is missing");
3234 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
3235 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
3236 hn_trust_hcsum_sysctl, "I",
3237 "Trust ip packet verification on host side, "
3238 "when csum info is missing");
3239 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
3240 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3241 __offsetof(struct hn_rx_ring, hn_csum_ip),
3242 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
3243 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
3244 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3245 __offsetof(struct hn_rx_ring, hn_csum_tcp),
3246 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
3247 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
3248 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3249 __offsetof(struct hn_rx_ring, hn_csum_udp),
3250 hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
3251 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
3252 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3253 __offsetof(struct hn_rx_ring, hn_csum_trusted),
3254 hn_rx_stat_ulong_sysctl, "LU",
3255 "# of packets that we trust host's csum verification");
3256 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
3257 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3258 __offsetof(struct hn_rx_ring, hn_small_pkts),
3259 hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
3260 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed",
3261 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3262 __offsetof(struct hn_rx_ring, hn_ack_failed),
3263 hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures");
3264 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
3265 CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
3266 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
3267 CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");
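/*
 * For reference, the tree built above looks like the following (unit 0
 * used as an example):
 *
 *	dev.hn.0.rx.0.packets		per-ring RX packet count
 *	dev.hn.0.rx.0.pktbuf_len	channel packet buffer length
 *	dev.hn.0.lro_queued		aggregated LRO queued stat
 *	dev.hn.0.trust_hosttcp		host TCP csum trust knob
 */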
3273 hn_destroy_rx_data(struct hn_softc *sc)
3277 if (sc->hn_rxbuf != NULL) {
3278 hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
3279 sc->hn_rxbuf = NULL;
3282 if (sc->hn_rx_ring_cnt == 0)
3285 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3286 struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];
3288 if (rxr->hn_br == NULL)
3290 hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br);
3293 #if defined(INET) || defined(INET6)
3294 tcp_lro_free(&rxr->hn_lro);
3296 free(rxr->hn_pktbuf, M_DEVBUF);
3298 free(sc->hn_rx_ring, M_DEVBUF);
3299 sc->hn_rx_ring = NULL;
3301 sc->hn_rx_ring_cnt = 0;
3302 sc->hn_rx_ring_inuse = 0;
3306 hn_tx_ring_create(struct hn_softc *sc, int id)
3308 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
3309 device_t dev = sc->hn_dev;
3310 bus_dma_tag_t parent_dtag;
3314 txr->hn_tx_idx = id;
3316 #ifndef HN_USE_TXDESC_BUFRING
3317 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
3319 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
3321 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
3322 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
3323 M_DEVBUF, M_WAITOK | M_ZERO);
3324 #ifndef HN_USE_TXDESC_BUFRING
3325 SLIST_INIT(&txr->hn_txlist);
3327 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF,
3328 M_WAITOK, &txr->hn_tx_lock);
3331 txr->hn_tx_taskq = sc->hn_tx_taskq;
3333 #ifdef HN_IFSTART_SUPPORT
3334 if (hn_use_if_start) {
3335 txr->hn_txeof = hn_start_txeof;
3336 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
3337 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
3343 txr->hn_txeof = hn_xmit_txeof;
3344 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
3345 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
3347 br_depth = hn_get_txswq_depth(txr);
3348 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF,
3349 M_WAITOK, &txr->hn_tx_lock);
3352 txr->hn_direct_tx_size = hn_direct_tx_size;
3355 * Always schedule transmission instead of trying to do direct
3356 * transmission. This one gives the best performance so far.
3358 txr->hn_sched_tx = 1;
3360 parent_dtag = bus_get_dma_tag(dev);
3362 /* DMA tag for RNDIS packet messages. */
3363 error = bus_dma_tag_create(parent_dtag, /* parent */
3364 HN_RNDIS_PKT_ALIGN, /* alignment */
3365 HN_RNDIS_PKT_BOUNDARY, /* boundary */
3366 BUS_SPACE_MAXADDR, /* lowaddr */
3367 BUS_SPACE_MAXADDR, /* highaddr */
3368 NULL, NULL, /* filter, filterarg */
3369 HN_RNDIS_PKT_LEN, /* maxsize */
3371 HN_RNDIS_PKT_LEN, /* maxsegsize */
3373 NULL, /* lockfunc */
3374 NULL, /* lockfuncarg */
3375 &txr->hn_tx_rndis_dtag);
3377 device_printf(dev, "failed to create rndis dmatag\n");
3381 /* DMA tag for data. */
3382 error = bus_dma_tag_create(parent_dtag, /* parent */
3384 HN_TX_DATA_BOUNDARY, /* boundary */
3385 BUS_SPACE_MAXADDR, /* lowaddr */
3386 BUS_SPACE_MAXADDR, /* highaddr */
3387 NULL, NULL, /* filter, filterarg */
3388 HN_TX_DATA_MAXSIZE, /* maxsize */
3389 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
3390 HN_TX_DATA_SEGSIZE, /* maxsegsize */
3392 NULL, /* lockfunc */
3393 NULL, /* lockfuncarg */
3394 &txr->hn_tx_data_dtag);
3396 device_printf(dev, "failed to create data dmatag\n");
3400 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
3401 struct hn_txdesc *txd = &txr->hn_txdesc[i];
3404 txd->chim_index = HN_NVS_CHIM_IDX_INVALID;
3405 STAILQ_INIT(&txd->agg_list);
3408 * Allocate and load RNDIS packet message.
3410 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
3411 (void **)&txd->rndis_pkt,
3412 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
3413 &txd->rndis_pkt_dmap);
3416 "failed to allocate rndis_packet_msg, %d\n", i);
3420 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
3421 txd->rndis_pkt_dmap,
3422 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
3423 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
3427 "failed to load rndis_packet_msg, %d\n", i);
3428 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3429 txd->rndis_pkt, txd->rndis_pkt_dmap);
3433 /* DMA map for TX data. */
3434 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
3438 "failed to allocate tx data dmamap\n");
3439 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
3440 txd->rndis_pkt_dmap);
3441 bus_dmamem_free(txr->hn_tx_rndis_dtag,
3442 txd->rndis_pkt, txd->rndis_pkt_dmap);
3446 /* All set; put it on the list. */
3447 txd->flags |= HN_TXD_FLAG_ONLIST;
3448 #ifndef HN_USE_TXDESC_BUFRING
3449 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
3451 buf_ring_enqueue(txr->hn_txdesc_br, txd);
3454 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
3456 if (sc->hn_tx_sysctl_tree != NULL) {
3457 struct sysctl_oid_list *child;
3458 struct sysctl_ctx_list *ctx;
3462 * Create per TX ring sysctl tree:
3463 * dev.hn.UNIT.tx.RINGID
3465 ctx = device_get_sysctl_ctx(dev);
3466 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
3468 snprintf(name, sizeof(name), "%d", id);
3469 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
3470 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3472 if (txr->hn_tx_sysctl_tree != NULL) {
3473 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
3475 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
3476 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
3477 "# of available TX descs");
3478 #ifdef HN_IFSTART_SUPPORT
3479 if (!hn_use_if_start)
3482 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
3483 CTLFLAG_RD, &txr->hn_oactive, 0,
3486 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
3487 CTLFLAG_RW, &txr->hn_pkts,
3488 "# of packets transmitted");
3489 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends",
3490 CTLFLAG_RW, &txr->hn_sends, "# of sends");
3498 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
3500 struct hn_tx_ring *txr = txd->txr;
3502 KASSERT(txd->m == NULL, ("still has mbuf installed"));
3503 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
3505 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
3506 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
3507 txd->rndis_pkt_dmap);
3508 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
3512 hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd)
3515 KASSERT(txd->refs == 0 || txd->refs == 1,
3516 ("invalid txd refs %d", txd->refs));
3518 /* Aggregated txds will be freed by their aggregating txd. */
3519 if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) {
3522 freed = hn_txdesc_put(txr, txd);
3523 KASSERT(freed, ("can't free txdesc"));
3528 hn_tx_ring_destroy(struct hn_tx_ring *txr)
3532 if (txr->hn_txdesc == NULL)
3537 * Because the freeing of aggregated txds will be deferred
3538 * to the aggregating txd, two passes are used here:
3539 * - The first pass GCes any pending txds. This GC is necessary,
3540 * since if the channels are revoked, the hypervisor will not
3541 * deliver send-done for all pending txds.
3542 * - The second pass frees the busdma resources, i.e. after all txds
3543 * were freed.
3545 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3546 hn_txdesc_gc(txr, &txr->hn_txdesc[i]);
3547 for (i = 0; i < txr->hn_txdesc_cnt; ++i)
3548 hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]);
3550 if (txr->hn_tx_data_dtag != NULL)
3551 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
3552 if (txr->hn_tx_rndis_dtag != NULL)
3553 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
3555 #ifdef HN_USE_TXDESC_BUFRING
3556 buf_ring_free(txr->hn_txdesc_br, M_DEVBUF);
3559 free(txr->hn_txdesc, M_DEVBUF);
3560 txr->hn_txdesc = NULL;
3562 if (txr->hn_mbuf_br != NULL)
3563 buf_ring_free(txr->hn_mbuf_br, M_DEVBUF);
3565 #ifndef HN_USE_TXDESC_BUFRING
3566 mtx_destroy(&txr->hn_txlist_spin);
3568 mtx_destroy(&txr->hn_tx_lock);
3572 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
3574 struct sysctl_oid_list *child;
3575 struct sysctl_ctx_list *ctx;
3579 * Create TXBUF for chimney sending.
3581 * NOTE: It is shared by all channels.
3583 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
3584 PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma,
3585 BUS_DMA_WAITOK | BUS_DMA_ZERO);
3586 if (sc->hn_chim == NULL) {
3587 device_printf(sc->hn_dev, "allocate txbuf failed\n");
3591 sc->hn_tx_ring_cnt = ring_cnt;
3592 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3594 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
3595 M_DEVBUF, M_WAITOK | M_ZERO);
3597 ctx = device_get_sysctl_ctx(sc->hn_dev);
3598 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
3600 /* Create dev.hn.UNIT.tx sysctl tree */
3601 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
3602 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
3604 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3607 error = hn_tx_ring_create(sc, i);
3612 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
3613 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3614 __offsetof(struct hn_tx_ring, hn_no_txdescs),
3615 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
3616 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
3617 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3618 __offsetof(struct hn_tx_ring, hn_send_failed),
3619 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
3620 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
3621 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3622 __offsetof(struct hn_tx_ring, hn_txdma_failed),
3623 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
3624 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed",
3625 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3626 __offsetof(struct hn_tx_ring, hn_flush_failed),
3627 hn_tx_stat_ulong_sysctl, "LU",
3628 "# of packet transmission aggregation flush failure");
3629 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
3630 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3631 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
3632 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed");
3633 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
3634 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3635 __offsetof(struct hn_tx_ring, hn_tx_chimney),
3636 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send");
3637 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
3638 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3639 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
3640 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
3641 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
3642 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
3643 "# of total TX descs");
3644 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
3645 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
3646 "Chimney send packet size upper boundary");
3647 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
3648 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
3649 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
3650 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
3651 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3652 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
3653 hn_tx_conf_int_sysctl, "I",
3654 "Size of the packet for direct transmission");
3655 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
3656 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
3657 __offsetof(struct hn_tx_ring, hn_sched_tx),
3658 hn_tx_conf_int_sysctl, "I",
3659 "Always schedule transmission "
3660 "instead of doing direct transmission");
3661 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
3662 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
3663 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
3664 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
3665 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax",
3666 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0,
3667 "Applied packet transmission aggregation size");
3668 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax",
3669 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3670 hn_txagg_pktmax_sysctl, "I",
3671 "Applied packet transmission aggregation packets");
3672 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align",
3673 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
3674 hn_txagg_align_sysctl, "I",
3675 "Applied packet transmission aggregation alignment");
3681 hn_set_chim_size(struct hn_softc *sc, int chim_size)
3685 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3686 sc->hn_tx_ring[i].hn_chim_size = chim_size;
3690 hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
3692 struct ifnet *ifp = sc->hn_ifp;
3695 if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
3698 KASSERT(sc->hn_ndis_tso_sgmin >= 2,
3699 ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin));
3700 tso_minlen = sc->hn_ndis_tso_sgmin * mtu;
3702 KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen &&
3703 sc->hn_ndis_tso_szmax <= IP_MAXPACKET,
3704 ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax));
3706 if (tso_maxlen < tso_minlen)
3707 tso_maxlen = tso_minlen;
3708 else if (tso_maxlen > IP_MAXPACKET)
3709 tso_maxlen = IP_MAXPACKET;
3710 if (tso_maxlen > sc->hn_ndis_tso_szmax)
3711 tso_maxlen = sc->hn_ndis_tso_szmax;
3712 ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
3714 if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
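/*
 * Worked example with made-up values: hn_ndis_tso_sgmin == 2 and an
 * MTU of 1500 give tso_minlen == 3000; a requested tso_maxlen of
 * 131072 is clamped to IP_MAXPACKET (65535) and then to
 * hn_ndis_tso_szmax, after which the Ethernet + VLAN header room
 * (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN == 18 bytes) is subtracted
 * before the result lands in if_hw_tsomax.
 */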
3718 hn_fixup_tx_data(struct hn_softc *sc)
3720 uint64_t csum_assist;
3723 hn_set_chim_size(sc, sc->hn_chim_szmax);
3724 if (hn_tx_chimney_size > 0 &&
3725 hn_tx_chimney_size < sc->hn_chim_szmax)
3726 hn_set_chim_size(sc, hn_tx_chimney_size);
3729 if (sc->hn_caps & HN_CAP_IPCS)
3730 csum_assist |= CSUM_IP;
3731 if (sc->hn_caps & HN_CAP_TCP4CS)
3732 csum_assist |= CSUM_IP_TCP;
3733 if (sc->hn_caps & HN_CAP_UDP4CS)
3734 csum_assist |= CSUM_IP_UDP;
3735 if (sc->hn_caps & HN_CAP_TCP6CS)
3736 csum_assist |= CSUM_IP6_TCP;
3737 if (sc->hn_caps & HN_CAP_UDP6CS)
3738 csum_assist |= CSUM_IP6_UDP;
3739 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3740 sc->hn_tx_ring[i].hn_csum_assist = csum_assist;
3742 if (sc->hn_caps & HN_CAP_HASHVAL) {
3744 * Support HASHVAL pktinfo on TX path.
3747 if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n");
3748 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3749 sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL;
3754 hn_destroy_tx_data(struct hn_softc *sc)
3758 if (sc->hn_chim != NULL) {
3759 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
3763 if (sc->hn_tx_ring_cnt == 0)
3766 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
3767 hn_tx_ring_destroy(&sc->hn_tx_ring[i]);
3769 free(sc->hn_tx_ring, M_DEVBUF);
3770 sc->hn_tx_ring = NULL;
3772 sc->hn_tx_ring_cnt = 0;
3773 sc->hn_tx_ring_inuse = 0;
3776 #ifdef HN_IFSTART_SUPPORT
3779 hn_start_taskfunc(void *xtxr, int pending __unused)
3781 struct hn_tx_ring *txr = xtxr;
3783 mtx_lock(&txr->hn_tx_lock);
3784 hn_start_locked(txr, 0);
3785 mtx_unlock(&txr->hn_tx_lock);
3789 hn_start_locked(struct hn_tx_ring *txr, int len)
3791 struct hn_softc *sc = txr->hn_sc;
3792 struct ifnet *ifp = sc->hn_ifp;
3795 KASSERT(hn_use_if_start,
3796 ("hn_start_locked is called, when if_start is disabled"));
3797 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3798 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3799 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3801 if (__predict_false(txr->hn_suspended))
3804 if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
3808 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
3809 struct hn_txdesc *txd;
3810 struct mbuf *m_head;
3813 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
3817 if (len > 0 && m_head->m_pkthdr.len > len) {
3819 * This sending could be time-consuming; let callers
3820 * dispatch this packet sending (and sending of any
3821 * follow-up packets) to the tx taskqueue.
3823 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3828 #if defined(INET6) || defined(INET)
3829 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3830 m_head = hn_tso_fixup(m_head);
3831 if (__predict_false(m_head == NULL)) {
3832 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
3838 txd = hn_txdesc_get(txr);
3840 txr->hn_no_txdescs++;
3841 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3842 atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3846 error = hn_encap(ifp, txr, txd, &m_head);
3848 /* Both txd and m_head are freed */
3849 KASSERT(txr->hn_agg_txd == NULL,
3850 ("encap failed w/ pending aggregating txdesc"));
3854 if (txr->hn_agg_pktleft == 0) {
3855 if (txr->hn_agg_txd != NULL) {
3856 KASSERT(m_head == NULL,
3857 ("pending mbuf for aggregating txdesc"));
3858 error = hn_flush_txagg(ifp, txr);
3859 if (__predict_false(error)) {
3860 atomic_set_int(&ifp->if_drv_flags,
3865 KASSERT(m_head != NULL, ("mbuf was freed"));
3866 error = hn_txpkt(ifp, txr, txd);
3867 if (__predict_false(error)) {
3868 /* txd is freed, but m_head is not */
3869 IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
3870 atomic_set_int(&ifp->if_drv_flags,
3878 KASSERT(txr->hn_agg_txd != NULL,
3879 ("no aggregating txdesc"));
3880 KASSERT(m_head == NULL,
3881 ("pending mbuf for aggregating txdesc"));
3886 /* Flush pending aggregated transmission. */
3887 if (txr->hn_agg_txd != NULL)
3888 hn_flush_txagg(ifp, txr);
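/*
 * Note on the aggregation flow above (informal summary): hn_encap()
 * may park a packet on the aggregating txdesc instead of sending it
 * directly; when hn_agg_pktleft reaches zero, any pending aggregate is
 * flushed through hn_flush_txagg(), while a non-aggregated packet is
 * sent directly with hn_txpkt().  hn_xmit() below follows the same
 * pattern for the multi-queue transmit path.
 */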
3893 hn_start(struct ifnet *ifp)
3895 struct hn_softc *sc = ifp->if_softc;
3896 struct hn_tx_ring *txr = &sc->hn_tx_ring[0];
3898 if (txr->hn_sched_tx)
3901 if (mtx_trylock(&txr->hn_tx_lock)) {
3904 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3905 mtx_unlock(&txr->hn_tx_lock);
3910 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
3914 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
3916 struct hn_tx_ring *txr = xtxr;
3918 mtx_lock(&txr->hn_tx_lock);
3919 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
3920 hn_start_locked(txr, 0);
3921 mtx_unlock(&txr->hn_tx_lock);
3925 hn_start_txeof(struct hn_tx_ring *txr)
3927 struct hn_softc *sc = txr->hn_sc;
3928 struct ifnet *ifp = sc->hn_ifp;
3930 KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
3932 if (txr->hn_sched_tx)
3935 if (mtx_trylock(&txr->hn_tx_lock)) {
3938 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3939 sched = hn_start_locked(txr, txr->hn_direct_tx_size);
3940 mtx_unlock(&txr->hn_tx_lock);
3942 taskqueue_enqueue(txr->hn_tx_taskq,
3948 * Release the OACTIVE bit earlier, in the hope that
3949 * others can catch up. The task will clear the
3950 * flag again with the hn_tx_lock held, to avoid possible
3951 * races.
3953 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
3954 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
3958 #endif /* HN_IFSTART_SUPPORT */
3961 hn_xmit(struct hn_tx_ring *txr, int len)
3963 struct hn_softc *sc = txr->hn_sc;
3964 struct ifnet *ifp = sc->hn_ifp;
3965 struct mbuf *m_head;
3968 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
3969 #ifdef HN_IFSTART_SUPPORT
3970 KASSERT(hn_use_if_start == 0,
3971 ("hn_xmit is called, when if_start is enabled"));
3973 KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc"));
3975 if (__predict_false(txr->hn_suspended))
3978 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
3981 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
3982 struct hn_txdesc *txd;
3985 if (len > 0 && m_head->m_pkthdr.len > len) {
3987 * This sending could be time-consuming; let callers
3988 * dispatch this packet sending (and sending of any
3989 * follow-up packets) to the tx taskqueue.
3991 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
3996 txd = hn_txdesc_get(txr);
3998 txr->hn_no_txdescs++;
3999 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
4000 txr->hn_oactive = 1;
4004 error = hn_encap(ifp, txr, txd, &m_head);
4006 /* Both txd and m_head are freed; discard */
4007 KASSERT(txr->hn_agg_txd == NULL,
4008 ("encap failed w/ pending aggregating txdesc"));
4009 drbr_advance(ifp, txr->hn_mbuf_br);
4013 if (txr->hn_agg_pktleft == 0) {
4014 if (txr->hn_agg_txd != NULL) {
4015 KASSERT(m_head == NULL,
4016 ("pending mbuf for aggregating txdesc"));
4017 error = hn_flush_txagg(ifp, txr);
4018 if (__predict_false(error)) {
4019 txr->hn_oactive = 1;
4023 KASSERT(m_head != NULL, ("mbuf was freed"));
4024 error = hn_txpkt(ifp, txr, txd);
4025 if (__predict_false(error)) {
4026 /* txd is freed, but m_head is not */
4027 drbr_putback(ifp, txr->hn_mbuf_br,
4029 txr->hn_oactive = 1;
4036 KASSERT(txr->hn_agg_txd != NULL,
4037 ("no aggregating txdesc"));
4038 KASSERT(m_head == NULL,
4039 ("pending mbuf for aggregating txdesc"));
4044 drbr_advance(ifp, txr->hn_mbuf_br);
4047 /* Flush pending aggregated transmission. */
4048 if (txr->hn_agg_txd != NULL)
4049 hn_flush_txagg(ifp, txr);
4054 hn_transmit(struct ifnet *ifp, struct mbuf *m)
4056 struct hn_softc *sc = ifp->if_softc;
4057 struct hn_tx_ring *txr;
4060 #if defined(INET6) || defined(INET)
4062 * Perform TSO packet header fixup now, since the TSO
4063 * packet header should be cache-hot.
4065 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
4066 m = hn_tso_fixup(m);
4067 if (__predict_false(m == NULL)) {
4068 if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
4075 * Select the TX ring based on flowid
4077 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
4078 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
4079 txr = &sc->hn_tx_ring[idx];
4081 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
4083 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
4087 if (txr->hn_oactive)
4090 if (txr->hn_sched_tx)
4093 if (mtx_trylock(&txr->hn_tx_lock)) {
4096 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4097 mtx_unlock(&txr->hn_tx_lock);
4102 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
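/*
 * Ring selection example (made-up numbers): with hn_tx_ring_inuse ==
 * 4, an mbuf whose RSS flowid is 4661 is enqueued on TX ring 1
 * (4661 % 4), keeping packets of one flow ordered on a single ring.
 */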
4107 hn_tx_ring_qflush(struct hn_tx_ring *txr)
4111 mtx_lock(&txr->hn_tx_lock);
4112 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
4114 mtx_unlock(&txr->hn_tx_lock);
4118 hn_xmit_qflush(struct ifnet *ifp)
4120 struct hn_softc *sc = ifp->if_softc;
4123 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
4124 hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
4129 hn_xmit_txeof(struct hn_tx_ring *txr)
4132 if (txr->hn_sched_tx)
4135 if (mtx_trylock(&txr->hn_tx_lock)) {
4138 txr->hn_oactive = 0;
4139 sched = hn_xmit(txr, txr->hn_direct_tx_size);
4140 mtx_unlock(&txr->hn_tx_lock);
4142 taskqueue_enqueue(txr->hn_tx_taskq,
4148 * Release the oactive flag earlier, in the hope that
4149 * others can catch up. The task will clear the
4150 * oactive again with the hn_tx_lock held, to avoid possible
4151 * races.
4153 txr->hn_oactive = 0;
4154 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
4159 hn_xmit_taskfunc(void *xtxr, int pending __unused)
4161 struct hn_tx_ring *txr = xtxr;
4163 mtx_lock(&txr->hn_tx_lock);
4165 mtx_unlock(&txr->hn_tx_lock);
4169 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
4171 struct hn_tx_ring *txr = xtxr;
4173 mtx_lock(&txr->hn_tx_lock);
4174 txr->hn_oactive = 0;
4176 mtx_unlock(&txr->hn_tx_lock);
4180 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
4182 struct vmbus_chan_br cbr;
4183 struct hn_rx_ring *rxr;
4184 struct hn_tx_ring *txr = NULL;
4187 idx = vmbus_chan_subidx(chan);
4190 * Link this channel to RX/TX ring.
4192 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4193 ("invalid channel index %d, should > 0 && < %d",
4194 idx, sc->hn_rx_ring_inuse));
4195 rxr = &sc->hn_rx_ring[idx];
4196 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
4197 ("RX ring %d already attached", idx));
4198 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
4201 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
4202 idx, vmbus_chan_id(chan));
4205 if (idx < sc->hn_tx_ring_inuse) {
4206 txr = &sc->hn_tx_ring[idx];
4207 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
4208 ("TX ring %d already attached", idx));
4209 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
4211 txr->hn_chan = chan;
4213 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
4214 idx, vmbus_chan_id(chan));
4218 /* Bind this channel to a proper CPU. */
4219 vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus);
4224 cbr.cbr = rxr->hn_br;
4225 cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr;
4226 cbr.cbr_txsz = HN_TXBR_SIZE;
4227 cbr.cbr_rxsz = HN_RXBR_SIZE;
4228 error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr);
4230 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
4231 vmbus_chan_id(chan), error);
4232 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4234 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4240 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
4242 struct hn_rx_ring *rxr;
4245 idx = vmbus_chan_subidx(chan);
4248 * Link this channel to RX/TX ring.
4250 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
4251 ("invalid channel index %d, should > 0 && < %d",
4252 idx, sc->hn_rx_ring_inuse));
4253 rxr = &sc->hn_rx_ring[idx];
4254 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
4255 ("RX ring %d is not attached", idx));
4256 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
4258 if (idx < sc->hn_tx_ring_inuse) {
4259 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
4261 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
4262 ("TX ring %d is not attached attached", idx));
4263 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
4267 * Close this channel.
4270 * Channel closing does _not_ destroy the target channel.
4272 vmbus_chan_close(chan);
4276 hn_attach_subchans(struct hn_softc *sc)
4278 struct vmbus_channel **subchans;
4279 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4282 if (subchan_cnt == 0)
4285 /* Attach the sub-channels. */
4286 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4287 for (i = 0; i < subchan_cnt; ++i) {
4288 error = hn_chan_attach(sc, subchans[i]);
4292 vmbus_subchan_rel(subchans, subchan_cnt);
4295 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
4298 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
4306 hn_detach_allchans(struct hn_softc *sc)
4308 struct vmbus_channel **subchans;
4309 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
4312 if (subchan_cnt == 0)
4315 /* Detach the sub-channels. */
4316 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
4317 for (i = 0; i < subchan_cnt; ++i)
4318 hn_chan_detach(sc, subchans[i]);
4319 vmbus_subchan_rel(subchans, subchan_cnt);
4323 * Detach the primary channel, _after_ all sub-channels
4324 * are detached.
4326 hn_chan_detach(sc, sc->hn_prichan);
4328 /* Wait for sub-channels to be destroyed, if any. */
4329 vmbus_subchan_drain(sc->hn_prichan);
4332 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
4333 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
4334 HN_RX_FLAG_ATTACHED) == 0,
4335 ("%dth RX ring is still attached", i));
4337 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
4338 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
4339 HN_TX_FLAG_ATTACHED) == 0,
4340 ("%dth TX ring is still attached", i));
4346 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
4348 struct vmbus_channel **subchans;
4349 int nchan, rxr_cnt, error;
4351 nchan = *nsubch + 1;
4354 * Multiple RX/TX rings are not requested.
4361 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
4364 error = hn_rndis_query_rsscaps(sc, &rxr_cnt);
4366 /* No RSS; this is benign. */
4371 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
4375 if (nchan > rxr_cnt)
4378 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
4384 * Allocate sub-channels from NVS.
4386 *nsubch = nchan - 1;
4387 error = hn_nvs_alloc_subchans(sc, nsubch);
4388 if (error || *nsubch == 0) {
4389 /* Failed to allocate sub-channels. */
4395 * Wait for all sub-channels to become ready before moving on.
4397 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
4398 vmbus_subchan_rel(subchans, *nsubch);
4403 hn_synth_attach(struct hn_softc *sc, int mtu)
4405 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
4406 int error, nsubch, nchan, i;
4409 KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0,
4410 ("synthetic parts were attached"));
4412 /* Save capabilities for later verification. */
4413 old_caps = sc->hn_caps;
4416 /* Clear RSS stuffs. */
4417 sc->hn_rss_ind_size = 0;
4418 sc->hn_rss_hash = 0;
4421 * Attach the primary channel _before_ attaching NVS and RNDIS.
4423 error = hn_chan_attach(sc, sc->hn_prichan);
4430 error = hn_nvs_attach(sc, mtu);
4435 * Attach RNDIS _after_ NVS is attached.
4437 error = hn_rndis_attach(sc, mtu);
4442 * Make sure capabilities are not changed.
4444 if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) {
4445 if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n",
4446 old_caps, sc->hn_caps);
4447 /* Restore old capabilities and abort. */
4448 sc->hn_caps = old_caps;
4453 * Allocate sub-channels for multi-TX/RX rings.
4456 * The # of RX rings that can be used is equivalent to the # of
4457 * channels to be requested.
4459 nsubch = sc->hn_rx_ring_cnt - 1;
4460 error = hn_synth_alloc_subchans(sc, &nsubch);
4466 /* Only the primary channel can be used; done */
4471 * Configure RSS key and indirect table _after_ all sub-channels
4475 if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) {
4477 * RSS key is not set yet; set it to the default RSS key.
4480 if_printf(sc->hn_ifp, "setup default RSS key\n");
4481 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
4482 sc->hn_flags |= HN_FLAG_HAS_RSSKEY;
4485 if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) {
4487 * RSS indirect table is not set yet; set it up in round-
4488 * robin fashion.
4491 if_printf(sc->hn_ifp, "setup default RSS indirect "
4494 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
4495 rss->rss_ind[i] = i % nchan;
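/*
 * For example, with nchan == 4 the indirect table becomes
 * 0, 1, 2, 3, 0, 1, 2, 3, ... across all NDIS_HASH_INDCNT entries.
 */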
4496 sc->hn_flags |= HN_FLAG_HAS_RSSIND;
4499 * The # of usable channels may have changed, so we have to
4500 * make sure that all entries in the RSS indirect table
4501 * are valid.
4503 hn_rss_ind_fixup(sc, nchan);
4506 error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE);
4509 * Failed to configure RSS key or indirect table; only
4510 * the primary channel can be used.
4516 * Set the # of TX/RX rings that could be used according to
4517 * the # of channels that NVS offered.
4519 hn_set_ring_inuse(sc, nchan);
4522 * Attach the sub-channels, if any.
4524 error = hn_attach_subchans(sc);
4529 * Fixup transmission aggregation setup.
4533 sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED;
4539 * The interface must have been suspended through hn_suspend(), before
4540 * this function gets called.
4543 hn_synth_detach(struct hn_softc *sc)
4547 KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
4548 ("synthetic parts were not attached"));
4550 /* Detach the RNDIS first. */
4551 hn_rndis_detach(sc);
4556 /* Detach all of the channels. */
4557 hn_detach_allchans(sc);
4559 sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED;
4563 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
4565 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
4566 ("invalid ring count %d", ring_cnt));
4568 if (sc->hn_tx_ring_cnt > ring_cnt)
4569 sc->hn_tx_ring_inuse = ring_cnt;
4571 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
4572 sc->hn_rx_ring_inuse = ring_cnt;
4575 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
4576 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
4581 hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan)
4586 * The TX bufring will not be drained by the hypervisor
4587 * if the primary channel is revoked.
4589 while (!vmbus_chan_rx_empty(chan) ||
4590 (!vmbus_chan_is_revoked(sc->hn_prichan) &&
4591 !vmbus_chan_tx_empty(chan)))
4593 vmbus_chan_intr_drain(chan);
4597 hn_suspend_data(struct hn_softc *sc)
4599 struct vmbus_channel **subch = NULL;
4600 struct hn_tx_ring *txr;
4608 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4609 txr = &sc->hn_tx_ring[i];
4611 mtx_lock(&txr->hn_tx_lock);
4612 txr->hn_suspended = 1;
4613 mtx_unlock(&txr->hn_tx_lock);
4614 /* No one is able to send more packets now. */
4617 * Wait for all pending sends to finish.
4620 * We will _not_ receive all pending send-done notifications if the
4621 * primary channel is revoked.
4623 while (hn_tx_ring_pending(txr) &&
4624 !vmbus_chan_is_revoked(sc->hn_prichan))
4625 pause("hnwtx", 1 /* 1 tick */);
4629 * Disable RX by clearing RX filter.
4631 sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE;
4632 hn_rndis_set_rxfilter(sc, sc->hn_rx_filter);
4635 * Give RNDIS enough time to flush all pending data packets.
4637 pause("waitrx", (200 * hz) / 1000);
4640 * Drain RX/TX bufrings and interrupts.
4642 nsubch = sc->hn_rx_ring_inuse - 1;
4644 subch = vmbus_subchan_get(sc->hn_prichan, nsubch);
4646 if (subch != NULL) {
4647 for (i = 0; i < nsubch; ++i)
4648 hn_chan_drain(sc, subch[i]);
4650 hn_chan_drain(sc, sc->hn_prichan);
4653 vmbus_subchan_rel(subch, nsubch);
4656 * Drain any pending TX tasks.
4659 * The above hn_chan_drain() can dispatch TX tasks, so the TX
4660 * tasks will have to be drained _after_ the above hn_chan_drain()
4661 * calls.
4663 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
4664 txr = &sc->hn_tx_ring[i];
4666 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
4667 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
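/*
 * Recap of the suspend-data sequence above: 1) mark every in-use TX
 * ring suspended and wait for in-flight sends, 2) clear the RX filter
 * and give RNDIS time to drain, 3) drain the sub-channel and primary
 * channel bufrings, 4) drain the TX taskqueues, which the channel
 * draining may still have fed.
 */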
4672 hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused)
4675 ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL;
4679 hn_suspend_mgmt(struct hn_softc *sc)
4686 * Make sure that hn_mgmt_taskq0 can no longer be accessed
4687 * through hn_mgmt_taskq.
4689 TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc);
4690 vmbus_chan_run_task(sc->hn_prichan, &task);
4693 * Make sure that all pending management tasks are completed.
4695 taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init);
4696 taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status);
4697 taskqueue_drain_all(sc->hn_mgmt_taskq0);
4701 hn_suspend(struct hn_softc *sc)
4704 if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
4705 hn_suspend_data(sc);
4706 hn_suspend_mgmt(sc);
4710 hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt)
4714 KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt,
4715 ("invalid TX ring count %d", tx_ring_cnt));
4717 for (i = 0; i < tx_ring_cnt; ++i) {
4718 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
4720 mtx_lock(&txr->hn_tx_lock);
4721 txr->hn_suspended = 0;
4722 mtx_unlock(&txr->hn_tx_lock);
static void
hn_resume_data(struct hn_softc *sc)
{
	int i;

	HN_LOCK_ASSERT(sc);

	/* Re-enable RX. */
	hn_set_rxfilter(sc);

	/*
	 * Make sure to clear suspend status on "all" TX rings,
	 * since hn_tx_ring_inuse can be changed after
	 * hn_suspend_data().
	 */
	hn_resume_tx(sc, sc->hn_tx_ring_cnt);

#ifdef HN_IFSTART_SUPPORT
	if (!hn_use_if_start)
#endif
	{
		/*
		 * Flush unused drbrs, since hn_tx_ring_inuse may be
		 * changed.
		 */
		for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i)
			hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
	}

	/* Kick start TX. */
	for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
		struct hn_tx_ring *txr = &sc->hn_tx_ring[i];

		/*
		 * Use txeof task, so that any pending oactive can be
		 * cleared properly.
		 */
		taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
	}
}

static void
hn_resume_mgmt(struct hn_softc *sc)
{

	sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;

	/*
	 * Kick off network change detection, if it was pending.
	 * If no network change was pending, start link status
	 * checks, which are more lightweight than network change
	 * detection.
	 */
	if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG)
		hn_change_network(sc);
	else
		hn_update_link_status(sc);
}

static void
hn_resume(struct hn_softc *sc)
{

	if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING)
		hn_resume_data(sc);
	hn_resume_mgmt(sc);
}
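
/*
 * Handle an unsolicited RNDIS status message, e.g. media
 * connect/disconnect and network change indications from the host.
 */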
static void
hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen)
{
	const struct rndis_status_msg *msg;
	int ofs;

	if (dlen < sizeof(*msg)) {
		if_printf(sc->hn_ifp, "invalid RNDIS status\n");
		return;
	}
	msg = data;

	switch (msg->rm_status) {
	case RNDIS_STATUS_MEDIA_CONNECT:
	case RNDIS_STATUS_MEDIA_DISCONNECT:
		hn_update_link_status(sc);
		break;

	case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG:
		/* Not really useful; ignore. */
		break;

	case RNDIS_STATUS_NETWORK_CHANGE:
		ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset);
		if (dlen < ofs + msg->rm_stbuflen ||
		    msg->rm_stbuflen < sizeof(uint32_t)) {
			if_printf(sc->hn_ifp, "network changed\n");
		} else {
			uint32_t change;

			memcpy(&change, ((const uint8_t *)msg) + ofs,
			    sizeof(change));
			if_printf(sc->hn_ifp, "network changed, change %u\n",
			    change);
		}
		hn_change_network(sc);
		break;

	default:
		if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n",
		    msg->rm_status);
		break;
	}
}
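
/*
 * Walk the per-packet-info elements attached to an RNDIS data
 * message.  Each element occupies rm_size bytes, with its payload
 * located rm_pktinfooffset bytes from the element header; the
 * elements of interest (VLAN, checksum, hash value and hash info)
 * are recorded in 'info'.  Returns 0 on success, EINVAL on a
 * malformed element.
 */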
static int
hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info)
{
	const struct rndis_pktinfo *pi = info_data;
	uint32_t mask = 0;

	while (info_dlen != 0) {
		const void *data;
		uint32_t dlen;

		if (__predict_false(info_dlen < sizeof(*pi)))
			return (EINVAL);
		if (__predict_false(info_dlen < pi->rm_size))
			return (EINVAL);
		info_dlen -= pi->rm_size;

		if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK))
			return (EINVAL);
		if (__predict_false(pi->rm_size < pi->rm_pktinfooffset))
			return (EINVAL);
		dlen = pi->rm_size - pi->rm_pktinfooffset;
		data = pi->rm_data;

		switch (pi->rm_type) {
		case NDIS_PKTINFO_TYPE_VLAN:
			if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE))
				return (EINVAL);
			info->vlan_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_VLAN;
			break;

		case NDIS_PKTINFO_TYPE_CSUM:
			if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE))
				return (EINVAL);
			info->csum_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_CSUM;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHVAL:
			if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE))
				return (EINVAL);
			info->hash_value = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHVAL;
			break;

		case HN_NDIS_PKTINFO_TYPE_HASHINF:
			if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE))
				return (EINVAL);
			info->hash_info = *((const uint32_t *)data);
			mask |= HN_RXINFO_HASHINF;
			break;

		default:
			goto next;
		}

		if (mask == HN_RXINFO_ALL) {
			/* All found; done */
			break;
		}
next:
		pi = (const struct rndis_pktinfo *)
		    ((const uint8_t *)pi + pi->rm_size);
	}

	/*
	 * Final fixup.
	 * - If there is no hash value, invalidate the hash info.
	 */
	if ((mask & HN_RXINFO_HASHVAL) == 0)
		info->hash_info = HN_NDIS_HASH_INFO_INVALID;
	return (0);
}
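
/*
 * Returns true if [off, off + len) overlaps
 * [check_off, check_off + check_len).
 */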
static __inline bool
hn_rndis_check_overlap(int off, int len, int check_off, int check_len)
{

	if (off < check_off) {
		if (__predict_true(off + len <= check_off))
			return (false);
	} else if (off > check_off) {
		if (__predict_true(check_off + check_len <= off))
			return (false);
	}
	return (true);
}
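
/*
 * Validate one RNDIS data message: the data, OOB-data and
 * per-packet-info regions are range-checked against the message
 * length and against each other before the payload is handed to
 * hn_rxpkt().
 */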
static void
hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_packet_msg *pkt;
	struct hn_rxinfo info;
	int data_off, pktinfo_off, data_len, pktinfo_len;

	/*
	 * Check length.
	 */
	if (__predict_false(dlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n");
		return;
	}
	pkt = data;

	if (__predict_false(dlen < pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, "
		    "dlen %d, msglen %u\n", dlen, pkt->rm_len);
		return;
	}
	if (__predict_false(pkt->rm_len <
	    pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, "
		    "msglen %u, data %u, oob %u, pktinfo %u\n",
		    pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen,
		    pkt->rm_pktinfolen);
		return;
	}
	if (__predict_false(pkt->rm_datalen == 0)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n");
		return;
	}

	/*
	 * Check offsets.
	 */
#define IS_OFFSET_INVALID(ofs)			\
	((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN ||	\
	 ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK))

	/* XXX Hyper-V does not meet data offset alignment requirement */
	if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data offset %u\n", pkt->rm_dataoffset);
		return;
	}
	if (__predict_false(pkt->rm_oobdataoffset > 0 &&
	    IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "oob offset %u\n", pkt->rm_oobdataoffset);
		return;
	}
	if (__predict_true(pkt->rm_pktinfooffset > 0) &&
	    __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "pktinfo offset %u\n", pkt->rm_pktinfooffset);
		return;
	}

#undef IS_OFFSET_INVALID

	data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset);
	data_len = pkt->rm_datalen;
	pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset);
	pktinfo_len = pkt->rm_pktinfolen;

	/*
	 * Check OOB coverage.
	 */
	if (__predict_false(pkt->rm_oobdatalen != 0)) {
		int oob_off, oob_len;

		if_printf(rxr->hn_ifp, "got oobdata\n");
		oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset);
		oob_len = pkt->rm_oobdatalen;

		if (__predict_false(oob_off + oob_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overflow, msglen %u, oob abs %d len %d\n",
			    pkt->rm_len, oob_off, oob_len);
			return;
		}

		/*
		 * Check against data.
		 */
		if (hn_rndis_check_overlap(oob_off, oob_len,
		    data_off, data_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps data, oob abs %d len %d, "
			    "data abs %d len %d\n",
			    oob_off, oob_len, data_off, data_len);
			return;
		}

		/*
		 * Check against pktinfo.
		 */
		if (pktinfo_len != 0 &&
		    hn_rndis_check_overlap(oob_off, oob_len,
		    pktinfo_off, pktinfo_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "oob overlaps pktinfo, oob abs %d len %d, "
			    "pktinfo abs %d len %d\n",
			    oob_off, oob_len, pktinfo_off, pktinfo_len);
			return;
		}
	}

	/*
	 * Check per-packet-info coverage and find useful per-packet-info.
	 */
	info.vlan_info = HN_NDIS_VLAN_INFO_INVALID;
	info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID;
	info.hash_info = HN_NDIS_HASH_INFO_INVALID;
	if (__predict_true(pktinfo_len != 0)) {
		bool overlap;
		int error;

		if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overflow, msglen %u, "
			    "pktinfo abs %d len %d\n",
			    pkt->rm_len, pktinfo_off, pktinfo_len);
			return;
		}

		/*
		 * Check packet info coverage.
		 */
		overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len,
		    data_off, data_len);
		if (__predict_false(overlap)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
			    "pktinfo overlaps data, pktinfo abs %d len %d, "
			    "data abs %d len %d\n",
			    pktinfo_off, pktinfo_len, data_off, data_len);
			return;
		}

		/*
		 * Find useful per-packet-info.
		 */
		error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off,
		    pktinfo_len, &info);
		if (__predict_false(error)) {
			if_printf(rxr->hn_ifp, "invalid RNDIS packet msg "
			    "pktinfo\n");
			return;
		}
	}

	if (__predict_false(data_off + data_len > pkt->rm_len)) {
		if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, "
		    "data overflow, msglen %u, data abs %d len %d\n",
		    pkt->rm_len, data_off, data_len);
		return;
	}
	hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info);
}

static __inline void
hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen)
{
	const struct rndis_msghdr *hdr;

	if (__predict_false(dlen < sizeof(*hdr))) {
		if_printf(rxr->hn_ifp, "invalid RNDIS msg\n");
		return;
	}
	hdr = data;

	if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) {
		/* Hot data path. */
		hn_rndis_rx_data(rxr, data, dlen);
		/* Done! */
		return;
	}

	if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG)
		hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen);
	else
		hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen);
}

static void
hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
{
	const struct hn_nvs_hdr *hdr;

	if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
		if_printf(sc->hn_ifp, "invalid nvs notify\n");
		return;
	}
	hdr = VMBUS_CHANPKT_CONST_DATA(pkt);

	if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
		/* Useless; ignore */
		return;
	}
	if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
}

static void
hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt)
{
	struct hn_nvs_sendctx *sndc;

	sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid;
	sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
	    VMBUS_CHANPKT_DATALEN(pkt));
	/*
	 * NOTE:
	 * 'sndc' CAN NOT be accessed anymore, since it can be freed by
	 * its callback.
	 */
}
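
/*
 * An RXBUF channel packet carries an array of {offset, length}
 * ranges into the channel's shared RX buffer; each range holds one
 * RNDIS message, which in turn carries one Ethernet frame.
 */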
static void
hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr)
{
	const struct vmbus_chanpkt_rxbuf *pkt;
	const struct hn_nvs_hdr *nvs_hdr;
	int count, i, hlen;

	if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
		if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
		return;
	}
	nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);

	/* Make sure that this is a RNDIS message. */
	if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
		if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
		    nvs_hdr->nvs_type);
		return;
	}

	hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
	if (__predict_false(hlen < sizeof(*pkt))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
		return;
	}
	pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;

	if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
		    pkt->cp_rxbuf_id);
		return;
	}

	count = pkt->cp_rxbuf_cnt;
	if (__predict_false(hlen <
	    __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
		if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
		return;
	}

	/* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
	for (i = 0; i < count; ++i) {
		int ofs, len;

		ofs = pkt->cp_rxbuf[i].rb_ofs;
		len = pkt->cp_rxbuf[i].rb_len;
		if (__predict_false(ofs + len > HN_RXBUF_SIZE)) {
			if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
			    "ofs %d, len %d\n", i, ofs, len);
			continue;
		}
		hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len);
	}

	/*
	 * Ack the consumed RXBUF associated w/ this channel packet,
	 * so that this RXBUF can be recycled by the hypervisor.
	 */
	hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid);
}

static void
hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan,
    uint64_t tid)
{
	struct hn_nvs_rndis_ack ack;
	int retries, error;

	ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
	ack.nvs_status = HN_NVS_STATUS_OK;

	retries = 0;
again:
	error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
	    VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
	if (__predict_false(error == EAGAIN)) {
		/*
		 * NOTE: this should _not_ happen in the real world,
		 * since the consumption of the TX bufring from the
		 * TX path is controlled.
		 */
		if (rxr->hn_ack_failed == 0)
			if_printf(rxr->hn_ifp, "RXBUF ack retry\n");
		rxr->hn_ack_failed++;
		retries++;
		if (retries < 10) {
			DELAY(100);
			goto again;
		}
		/* RXBUF leaks! */
		if_printf(rxr->hn_ifp, "RXBUF ack failed\n");
	}
}
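
/*
 * Per-channel interrupt handler: receive channel packets, growing
 * the per-ring packet buffer on demand, and dispatch each packet by
 * its type, until the bufring is exhausted.
 */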
static void
hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
{
	struct hn_rx_ring *rxr = xrxr;
	struct hn_softc *sc = rxr->hn_ifp->if_softc;

	for (;;) {
		struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf;
		int error, pktlen;

		pktlen = rxr->hn_pktbuf_len;
		error = vmbus_chan_recv_pkt(chan, pkt, &pktlen);
		if (__predict_false(error == ENOBUFS)) {
			void *nbuf;
			int nlen;

			/*
			 * Expand channel packet buffer.
			 *
			 * XXX
			 * Use M_WAITOK here, since allocation failure
			 * is fatal.
			 */
			nlen = rxr->hn_pktbuf_len * 2;
			while (nlen < pktlen)
				nlen *= 2;
			nbuf = malloc(nlen, M_DEVBUF, M_WAITOK);

			if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n",
			    rxr->hn_pktbuf_len, nlen);

			free(rxr->hn_pktbuf, M_DEVBUF);
			rxr->hn_pktbuf = nbuf;
			rxr->hn_pktbuf_len = nlen;
			/* Retry! */
			continue;
		} else if (__predict_false(error == EAGAIN)) {
			/* No more channel packets; done! */
			break;
		}
		KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error));

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			hn_nvs_handle_comp(sc, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_RXBUF:
			hn_nvs_handle_rxbuf(rxr, chan, pkt);
			break;

		case VMBUS_CHANPKT_TYPE_INBAND:
			hn_nvs_handle_notify(sc, pkt);
			break;

		default:
			if_printf(rxr->hn_ifp, "unknown chan pkt %u\n",
			    pkt->cph_type);
			break;
		}
	}
	hn_chan_rollup(rxr, rxr->hn_txr);
}
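
/*
 * Create the shared TX taskqueue during driver initialization and,
 * if hn_bind_tx_taskq is set, bind its thread to the given CPU.
 */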
static void
hn_tx_taskq_create(void *arg __unused)
{

	if (vm_guest != VM_GUEST_HV)
		return;

	if (!hn_share_tx_taskq)
		return;

	hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
	    taskqueue_thread_enqueue, &hn_tx_taskq);
	taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
	if (hn_bind_tx_taskq >= 0) {
		int cpu = hn_bind_tx_taskq;
		struct task cpuset_task;
		cpuset_t cpu_set;

		if (cpu > mp_ncpus - 1)
			cpu = mp_ncpus - 1;
		CPU_SETOF(cpu, &cpu_set);
		TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, &cpu_set);
		taskqueue_enqueue(hn_tx_taskq, &cpuset_task);
		taskqueue_drain(hn_tx_taskq, &cpuset_task);
	}
}
SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_create, NULL);

static void
hn_tx_taskq_destroy(void *arg __unused)
{

	if (hn_tx_taskq != NULL)
		taskqueue_free(hn_tx_taskq);
}
SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND,
    hn_tx_taskq_destroy, NULL);