/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012,2016 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"
#include "opt_inet.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sockio.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
#include <sys/buf_ring.h>

#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>
#include <net/rndis.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/if_ether.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/ip6.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>

#include <machine/bus.h>
#include <machine/resource.h>
#include <machine/frame.h>
#include <machine/vmparam.h>

#include <sys/rman.h>
#include <sys/mutex.h>
#include <sys/errno.h>
#include <sys/types.h>
#include <machine/atomic.h>
#include <machine/intr_machdep.h>
#include <machine/in_cksum.h>

#include <dev/hyperv/include/hyperv.h>
#include <dev/hyperv/include/hyperv_busdma.h>
#include <dev/hyperv/include/vmbus_xact.h>
#include <dev/hyperv/netvsc/hv_net_vsc.h>
#include <dev/hyperv/netvsc/hv_rndis_filter.h>
#include <dev/hyperv/netvsc/ndis.h>

#include "vmbus_if.h"
/* Short for Hyper-V network interface */
#define NETVSC_DEVNAME    "hn"
/*
 * It looks like offset 0 of buf is reserved to hold the softc pointer.
 * The sc pointer is evidently not needed, and is not presently populated.
 * The packet offset is where the netvsc_packet starts in the buffer.
 */
#define HV_NV_SC_PTR_OFFSET_IN_BUF    0
#define HV_NV_PACKET_OFFSET_IN_BUF    16
/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT        512

#define HN_LROENT_CNT_DEF    128

#define HN_RING_CNT_DEF_MAX    8

#define HN_RNDIS_PKT_LEN                    \
    (sizeof(struct rndis_packet_msg) +            \
     HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) +    \
     HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) +        \
     HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) +        \
     HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE))
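/*
 * A sizing sketch, assuming HN_RNDIS_PKTINFO_SIZE(dlen) expands to the
 * per-packet-info header plus dlen bytes of payload: the macro above
 * reserves worst-case room for the RNDIS packet message header plus the
 * four per-packet-info records this driver may attach to one packet
 * (hash value, VLAN tag, LSO v2, TX checksum), so a single pre-allocated
 * buffer of HN_RNDIS_PKT_LEN bytes always suffices.
 */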
#define HN_RNDIS_PKT_BOUNDARY    PAGE_SIZE
#define HN_RNDIS_PKT_ALIGN    CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY    PAGE_SIZE
#define HN_TX_DATA_MAXSIZE    IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE    PAGE_SIZE
/* -1 for RNDIS packet message */
#define HN_TX_DATA_SEGCNT_MAX    (NETVSC_PACKET_MAXPAGE - 1)

#define HN_DIRECT_TX_SIZE_DEF    128

#define HN_EARLY_TXEOF_THRESH    8
struct hn_txdesc {
#ifndef HN_USE_TXDESC_BUFRING
    SLIST_ENTRY(hn_txdesc)    link;
#endif
    struct mbuf        *m;
    struct hn_tx_ring    *txr;
    int            refs;
    uint32_t        flags;    /* HN_TXD_FLAG_ */
    struct hn_send_ctx    send_ctx;

    bus_dmamap_t        data_dmap;

    bus_addr_t        rndis_pkt_paddr;
    struct rndis_packet_msg    *rndis_pkt;
    bus_dmamap_t        rndis_pkt_dmap;
};

#define HN_TXD_FLAG_ONLIST    0x1
#define HN_TXD_FLAG_DMAMAP    0x2
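/*
 * Flag semantics, as used by hn_txdesc_get()/hn_txdesc_put() below:
 * ONLIST is set while the descriptor sits on the free list (or free
 * buf_ring) and is cleared while a sender owns it; DMAMAP is set only
 * while data_dmap has an mbuf chain loaded, so the put path knows
 * whether a bus_dmamap_unload() is required.
 */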
/*
 * Only enable UDP checksum offloading when it is on 2012R2 or
 * later.  UDP checksum offloading doesn't work on earlier
 * Windows releases.
 */
#define HN_CSUM_ASSIST_WIN8    (CSUM_IP | CSUM_TCP)
#define HN_CSUM_ASSIST        (CSUM_IP | CSUM_UDP | CSUM_TCP)
#define HN_LRO_LENLIM_MULTIRX_DEF    (12 * ETHERMTU)
#define HN_LRO_LENLIM_DEF        (25 * ETHERMTU)
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_LENLIM_MIN(ifp)        (2 * (ifp)->if_mtu)

#define HN_LRO_ACKCNT_DEF    1

#define HN_LOCK_INIT(sc)        \
    sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev))
#define HN_LOCK_ASSERT(sc)    sx_assert(&(sc)->hn_lock, SA_XLOCKED)
#define HN_LOCK_DESTROY(sc)    sx_destroy(&(sc)->hn_lock)
#define HN_LOCK(sc)        sx_xlock(&(sc)->hn_lock)
#define HN_UNLOCK(sc)        sx_xunlock(&(sc)->hn_lock)
int hv_promisc_mode = 0;    /* normal mode by default */

SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "Hyper-V network interface");

/* Trust tcp segment verification on host side. */
static int hn_trust_hosttcp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN,
    &hn_trust_hosttcp, 0,
    "Trust tcp segment verification on host side, "
    "when csum info is missing (global setting)");

/* Trust udp datagram verification on host side. */
static int hn_trust_hostudp = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN,
    &hn_trust_hostudp, 0,
    "Trust udp datagram verification on host side, "
    "when csum info is missing (global setting)");

/* Trust ip packet verification on host side. */
static int hn_trust_hostip = 1;
SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN,
    &hn_trust_hostip, 0,
    "Trust ip packet verification on host side, "
    "when csum info is missing (global setting)");
/* Limit TSO burst size */
static int hn_tso_maxlen = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
    &hn_tso_maxlen, 0, "TSO burst limit");

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN,
    &hn_tx_chimney_size, 0, "Chimney send packet size limit");

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN,
    &hn_direct_tx_size, 0, "Size of the packet for direct transmission");

#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
static int hn_lro_entry_count = HN_LROENT_CNT_DEF;
SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
    &hn_lro_entry_count, 0, "LRO entry count");
#endif
#endif

static int hn_share_tx_taskq = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN,
    &hn_share_tx_taskq, 0, "Enable shared TX taskqueue");

static struct taskqueue *hn_tx_taskq;

#ifndef HN_USE_TXDESC_BUFRING
static int hn_use_txdesc_bufring = 0;
#else
static int hn_use_txdesc_bufring = 1;
#endif
SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD,
    &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors");

static int hn_bind_tx_taskq = -1;
SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN,
    &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu");

static int hn_use_if_start = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN,
    &hn_use_if_start, 0, "Use if_start TX method");

static int hn_chan_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN,
    &hn_chan_cnt, 0,
    "# of channels to use; each channel has one RX ring and one TX ring");

static int hn_tx_ring_cnt = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN,
    &hn_tx_ring_cnt, 0, "# of TX rings to use");

static int hn_tx_swq_depth = 0;
SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN,
    &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING");

#if __FreeBSD_version >= 1100095
static u_int hn_lro_mbufq_depth = 0;
SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
    &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue");
#endif

static u_int hn_cpu_index;
/*
 * Forward declarations
 */
static void hn_stop(struct hn_softc *sc);
static void hn_init_locked(struct hn_softc *sc);
static void hn_init(void *xsc);
static int  hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
static int  hn_start_locked(struct hn_tx_ring *txr, int len);
static void hn_start(struct ifnet *ifp);
static void hn_start_txeof(struct hn_tx_ring *);
static int  hn_ifmedia_upd(struct ifnet *ifp);
static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr);
#if __FreeBSD_version >= 1100099
static int  hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS);
static int  hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int  hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int  hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS);
#if __FreeBSD_version < 1100095
static int  hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS);
#else
static int  hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int  hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int  hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS);
static int  hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS);
static int  hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS);
static int  hn_check_iplen(const struct mbuf *, int);
static int  hn_create_tx_ring(struct hn_softc *, int);
static void hn_destroy_tx_ring(struct hn_tx_ring *);
static int  hn_create_tx_data(struct hn_softc *, int);
static void hn_destroy_tx_data(struct hn_softc *);
static void hn_start_taskfunc(void *, int);
static void hn_start_txeof_taskfunc(void *, int);
static void hn_stop_tx_tasks(struct hn_softc *);
static int  hn_encap(struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **);
static int  hn_create_rx_data(struct hn_softc *sc, int);
static void hn_destroy_rx_data(struct hn_softc *sc);
static void hn_set_chim_size(struct hn_softc *, int);
static int  hn_chan_attach(struct hn_softc *, struct vmbus_channel *);
static void hn_chan_detach(struct hn_softc *, struct vmbus_channel *);
static int  hn_attach_subchans(struct hn_softc *);
static void hn_detach_allchans(struct hn_softc *);
static void hn_chan_callback(struct vmbus_channel *chan, void *xrxr);
static void hn_set_ring_inuse(struct hn_softc *, int);
static int  hn_synth_attach(struct hn_softc *, int);

static void hn_nvs_handle_notify(struct hn_softc *sc,
    const struct vmbus_chanpkt_hdr *pkt);
static void hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkt);
static void hn_nvs_handle_rxbuf(struct hn_softc *sc, struct hn_rx_ring *rxr,
    struct vmbus_channel *chan,
    const struct vmbus_chanpkt_hdr *pkthdr);
static void hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid);

static int  hn_transmit(struct ifnet *, struct mbuf *);
static void hn_xmit_qflush(struct ifnet *);
static int  hn_xmit(struct hn_tx_ring *, int);
static void hn_xmit_txeof(struct hn_tx_ring *);
static void hn_xmit_taskfunc(void *, int);
static void hn_xmit_txeof_taskfunc(void *, int);
static const uint8_t hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
    0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
    0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
    0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
    0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
    0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
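/*
 * This 40-byte sequence is the sample Toeplitz key published in
 * Microsoft's RSS documentation (the same key appears in many other
 * drivers), so the resulting hash values are deterministic and can be
 * checked against the spec's test vectors.
 */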
#if __FreeBSD_version >= 1100099
static void
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim)
{
    int i;

    for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
        sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim;
}
#endif

static int
hn_get_txswq_depth(const struct hn_tx_ring *txr)
{
    KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet"));
    if (hn_tx_swq_depth < txr->hn_txdesc_cnt)
        return txr->hn_txdesc_cnt;
    return hn_tx_swq_depth;
}
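/*
 * A small behavioral note: since hn_tx_swq_depth defaults to 0, the
 * software TX queue depth effectively defaults to hn_txdesc_cnt, i.e.
 * the queue is never shallower than the descriptor ring backing it;
 * tuning hw.hn.tx_swq_depth only takes effect once the value exceeds
 * the per-ring descriptor count.
 */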
static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{
    return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
    struct hn_softc *sc = ifp->if_softc;

    ifmr->ifm_status = IFM_AVALID;
    ifmr->ifm_active = IFM_ETHER;
    if (!sc->hn_carrier) {
        ifmr->ifm_active |= IFM_NONE;
        return;
    }
    ifmr->ifm_status |= IFM_ACTIVE;
    ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}
/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const struct hyperv_guid g_net_vsc_device_type = {
    .hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
        0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};
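/*
 * Byte-order note: the first three GUID fields are stored little-endian,
 * which is why the textual form F8615163-DF3E-46c5-... above appears
 * byte-swapped (0x63, 0x51, 0x61, 0xF8, ...) in the array, while the
 * final eight bytes are stored in textual order.
 */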
/*
 * Standard probe entry point.
 */
static int
netvsc_probe(device_t dev)
{
    if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
        &g_net_vsc_device_type) == 0) {
        device_set_desc(dev, "Hyper-V Network Interface");
        return BUS_PROBE_DEFAULT;
    }
    return ENXIO;
}

static void
hn_cpuset_setthread_task(void *xmask, int pending __unused)
{
    cpuset_t *mask = xmask;
    int error;

    error = cpuset_setthread(curthread->td_tid, mask);
    if (error)
        panic("curthread=%ju: can't pin; error=%d",
            (uintmax_t)curthread->td_tid, error);
}
/*
 * Standard attach entry point.
 *
 * Called when the driver is loaded.  It allocates needed resources,
 * and initializes the "hardware" and software.
 */
static int
netvsc_attach(device_t dev)
{
    struct hn_softc *sc = device_get_softc(dev);
    struct sysctl_oid_list *child;
    struct sysctl_ctx_list *ctx;
    uint8_t eaddr[ETHER_ADDR_LEN];
    uint32_t link_status;
    struct ifnet *ifp = NULL;
    int error, ring_cnt, tx_ring_cnt;
    int tso_maxlen;

    sc->hn_dev = dev;
    sc->hn_prichan = vmbus_get_channel(dev);

    /*
     * Setup taskqueue for transmission.
     */
    if (hn_tx_taskq == NULL) {
        sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
            taskqueue_thread_enqueue, &sc->hn_tx_taskq);
        taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx",
            device_get_nameunit(dev));
        if (hn_bind_tx_taskq >= 0) {
            int cpu = hn_bind_tx_taskq;
            struct task cpuset_task;
            cpuset_t cpu_set;

            if (cpu > mp_ncpus - 1)
                cpu = mp_ncpus - 1;
            CPU_SETOF(cpu, &cpu_set);
            TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task,
                &cpu_set);
            taskqueue_enqueue(sc->hn_tx_taskq, &cpuset_task);
            taskqueue_drain(sc->hn_tx_taskq, &cpuset_task);
        }
    } else {
        sc->hn_tx_taskq = hn_tx_taskq;
    }

    /*
     * Allocate ifnet and setup its name earlier, so that if_printf
     * can be used by functions, which will be called after
     * ether_ifattach().
     */
    ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
    ifp->if_softc = sc;
    if_initname(ifp, device_get_name(dev), device_get_unit(dev));

    /*
     * Initialize ifmedia earlier so that it can be unconditionally
     * destroyed, if an error happens later on.
     */
    ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);

    /*
     * Figure out the # of RX rings (ring_cnt) and the # of TX rings
     * to use (tx_ring_cnt).
     *
     * NOTE:
     * The # of RX rings to use is the same as the # of channels to use.
     */
    ring_cnt = hn_chan_cnt;
    if (ring_cnt <= 0) {
        ring_cnt = mp_ncpus;
        if (ring_cnt > HN_RING_CNT_DEF_MAX)
            ring_cnt = HN_RING_CNT_DEF_MAX;
    } else if (ring_cnt > mp_ncpus) {
        ring_cnt = mp_ncpus;
    }

    tx_ring_cnt = hn_tx_ring_cnt;
    if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
        tx_ring_cnt = ring_cnt;
    if (hn_use_if_start) {
        /* ifnet.if_start only needs one TX ring. */
        tx_ring_cnt = 1;
    }

    /*
     * Set the leader CPU for channels.
     */
    sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus;
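    /*
     * An example of the round-robin effect, assuming 8 online CPUs:
     * with ring_cnt 8 (the HN_RING_CNT_DEF_MAX default), the first
     * device leads at CPU 0 and the next wraps back to CPU 0 (8 % 8);
     * with ring_cnt 2, successive devices lead at CPUs 0, 2, 4, ...,
     * spreading channel interrupt load across the machine.
     */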
    /*
     * Create enough TX/RX rings, even if only a limited number of
     * channels can be allocated.
     */
    error = hn_create_tx_data(sc, tx_ring_cnt);
    if (error)
        goto failed;
    error = hn_create_rx_data(sc, ring_cnt);
    if (error)
        goto failed;

    /*
     * Create transaction context for NVS and RNDIS transactions.
     */
    sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev),
        HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0);
    if (sc->hn_xact == NULL)
        goto failed;

    /*
     * Attach the synthetic parts, i.e. NVS and RNDIS.
     */
    error = hn_synth_attach(sc, ETHERMTU);
    if (error)
        goto failed;

    error = hn_rndis_get_linkstatus(sc, &link_status);
    if (error)
        goto failed;
    if (link_status == NDIS_MEDIA_STATE_CONNECTED)
        sc->hn_carrier = 1;

    error = hn_rndis_get_eaddr(sc, eaddr);
    if (error)
        goto failed;

#if __FreeBSD_version >= 1100099
    if (sc->hn_rx_ring_inuse > 1) {
        /*
         * Reduce TCP segment aggregation limit for multiple
         * RX rings to increase ACK timeliness.
         */
        hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF);
    }
#endif

    hn_set_chim_size(sc, sc->hn_chim_szmax);
    if (hn_tx_chimney_size > 0 &&
        hn_tx_chimney_size < sc->hn_chim_szmax)
        hn_set_chim_size(sc, hn_tx_chimney_size);

    ctx = device_get_sysctl_ctx(dev);
    child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
    SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD,
        &sc->hn_nvs_ver, 0, "NVS version");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version",
        CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
        hn_ndis_version_sysctl, "A", "NDIS version");

    /*
     * Setup the ifmedia, which has been initialized earlier.
     */
    ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
    ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
    /* XXX ifmedia_set really should do this for us */
    sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

    /*
     * Setup the ifnet for this interface.
     */
    ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
    ifp->if_ioctl = hn_ioctl;
    ifp->if_init = hn_init;
    if (hn_use_if_start) {
        int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]);

        ifp->if_start = hn_start;
        IFQ_SET_MAXLEN(&ifp->if_snd, qdepth);
        ifp->if_snd.ifq_drv_maxlen = qdepth - 1;
        IFQ_SET_READY(&ifp->if_snd);
    } else {
        ifp->if_transmit = hn_transmit;
        ifp->if_qflush = hn_xmit_qflush;
    }

    ifp->if_capabilities |=
        IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO |
        IFCAP_LRO;
    ifp->if_capenable |=
        IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO |
        IFCAP_LRO;
    ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist | CSUM_TSO;

    tso_maxlen = hn_tso_maxlen;
    if (tso_maxlen <= 0 || tso_maxlen > IP_MAXPACKET)
        tso_maxlen = IP_MAXPACKET;
    ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
    ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
    ifp->if_hw_tsomax = tso_maxlen -
        (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
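    /*
     * Worked example, with the default tso_maxlen of 0: tso_maxlen is
     * clamped to IP_MAXPACKET (65535), so if_hw_tsomax becomes
     * 65535 - 18 = 65517 bytes of payload per TSO burst, leaving room
     * for the Ethernet header plus an optional 802.1Q tag so the full
     * frame never exceeds the 64KB limit.
     */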
    ether_ifattach(ifp, eaddr);

    if (bootverbose) {
        if_printf(ifp, "TSO: %u/%u/%u\n", ifp->if_hw_tsomax,
            ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
    }

    /* Inform the upper layer about long frame support. */
    ifp->if_hdrlen = sizeof(struct ether_vlan_header);

    return (0);
failed:
    /* TODO: reuse netvsc_detach() */
    hn_destroy_tx_data(sc);
    return (error);
}
/*
 * Standard detach entry point
 */
static int
netvsc_detach(device_t dev)
{
    struct hn_softc *sc = device_get_softc(dev);

    if (bootverbose)
        printf("netvsc_detach\n");

    /*
     * XXXKYS: Need to clean up all our
     * driver state; this is the driver
     * unloading.
     */

    /*
     * XXXKYS: Need to stop outgoing traffic and unregister
     * the netdevice.
     */

    hv_rf_on_device_remove(sc);
    hn_detach_allchans(sc);

    hn_stop_tx_tasks(sc);

    ifmedia_removeall(&sc->hn_media);
    hn_destroy_rx_data(sc);
    hn_destroy_tx_data(sc);

    if (sc->hn_tx_taskq != hn_tx_taskq)
        taskqueue_free(sc->hn_tx_taskq);

    vmbus_xact_ctx_destroy(sc->hn_xact);
    return (0);
}

/*
 * Standard shutdown entry point
 */
static int
netvsc_shutdown(device_t dev)
{
    return (0);
}
static __inline int
hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
    struct mbuf *m = *m_head;
    int error;

    error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap,
        m, segs, nsegs, BUS_DMA_NOWAIT);
    if (error == EFBIG) {
        struct mbuf *m_new;

        m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
        if (m_new == NULL)
            return ENOBUFS;
        else
            *m_head = m = m_new;
        txr->hn_tx_collapsed++;

        error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag,
            txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
    }
    if (!error) {
        bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap,
            BUS_DMASYNC_PREWRITE);
        txd->flags |= HN_TXD_FLAG_DMAMAP;
    }
    return error;
}
static __inline void
hn_txdesc_dmamap_unload(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
    if (txd->flags & HN_TXD_FLAG_DMAMAP) {
        bus_dmamap_sync(txr->hn_tx_data_dtag,
            txd->data_dmap, BUS_DMASYNC_POSTWRITE);
        bus_dmamap_unload(txr->hn_tx_data_dtag,
            txd->data_dmap);
        txd->flags &= ~HN_TXD_FLAG_DMAMAP;
    }
}
static __inline int
hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
    KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
        ("put an onlist txd %#x", txd->flags));

    KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
    if (atomic_fetchadd_int(&txd->refs, -1) != 1)
        return 0;

    hn_txdesc_dmamap_unload(txr, txd);
    if (txd->m != NULL) {
        m_freem(txd->m);
        txd->m = NULL;
    }

    txd->flags |= HN_TXD_FLAG_ONLIST;

#ifndef HN_USE_TXDESC_BUFRING
    mtx_lock_spin(&txr->hn_txlist_spin);
    KASSERT(txr->hn_txdesc_avail >= 0 &&
        txr->hn_txdesc_avail < txr->hn_txdesc_cnt,
        ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail));
    txr->hn_txdesc_avail++;
    SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
    mtx_unlock_spin(&txr->hn_txlist_spin);
#else
    atomic_add_int(&txr->hn_txdesc_avail, 1);
    buf_ring_enqueue(txr->hn_txdesc_br, txd);
#endif

    return 1;
}
static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_tx_ring *txr)
{
    struct hn_txdesc *txd;

#ifndef HN_USE_TXDESC_BUFRING
    mtx_lock_spin(&txr->hn_txlist_spin);
    txd = SLIST_FIRST(&txr->hn_txlist);
    if (txd != NULL) {
        KASSERT(txr->hn_txdesc_avail > 0,
            ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail));
        txr->hn_txdesc_avail--;
        SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
    }
    mtx_unlock_spin(&txr->hn_txlist_spin);
#else
    txd = buf_ring_dequeue_sc(txr->hn_txdesc_br);
#endif

    if (txd != NULL) {
#ifdef HN_USE_TXDESC_BUFRING
        atomic_subtract_int(&txr->hn_txdesc_avail, 1);
#endif
        KASSERT(txd->m == NULL && txd->refs == 0 &&
            (txd->flags & HN_TXD_FLAG_ONLIST), ("invalid txd"));
        txd->flags &= ~HN_TXD_FLAG_ONLIST;
        txd->refs = 1;
    }
    return txd;
}

static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{
    /* 0->1 transition will never work */
    KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
    atomic_add_int(&txd->refs, 1);
}
static void
hn_txeof(struct hn_tx_ring *txr)
{
    txr->hn_has_txeof = 0;
    txr->hn_txeof(txr);
}

static void
hn_tx_done(struct hn_send_ctx *sndc, struct hn_softc *sc,
    struct vmbus_channel *chan, const void *data __unused, int dlen __unused)
{
    struct hn_txdesc *txd = sndc->hn_cbarg;
    struct hn_tx_ring *txr;

    if (sndc->hn_chim_idx != HN_NVS_CHIM_IDX_INVALID)
        hn_chim_free(sc, sndc->hn_chim_idx);

    txr = txd->txr;
    KASSERT(txr->hn_chan == chan,
        ("channel mismatch, on chan%u, should be chan%u",
         vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan)));

    txr->hn_has_txeof = 1;
    hn_txdesc_put(txr, txd);

    ++txr->hn_txdone_cnt;
    if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) {
        txr->hn_txdone_cnt = 0;
        if (txr->hn_oactive)
            hn_txeof(txr);
    }
}
void
hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr)
{
#if defined(INET) || defined(INET6)
    struct lro_ctrl *lro = &rxr->hn_lro;
    struct lro_entry *queued;

    while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
        SLIST_REMOVE_HEAD(&lro->lro_active, next);
        tcp_lro_flush(lro, queued);
    }
#endif

    /*
     * NOTE:
     * 'txr' could be NULL, if multiple channels and the
     * ifnet.if_start method are used.
     */
    if (txr == NULL || !txr->hn_has_txeof)
        return;

    txr->hn_txdone_cnt = 0;
    hn_txeof(txr);
}
static __inline uint32_t
hn_rndis_pktmsg_offset(uint32_t ofs)
{
    KASSERT(ofs >= sizeof(struct rndis_packet_msg),
        ("invalid RNDIS packet msg offset %u", ofs));
    return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset));
}
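/*
 * A worked example, assuming rm_type and rm_len are the two 32-bit
 * fields preceding rm_dataoffset: offsets inside an RNDIS packet
 * message are expressed relative to the rm_dataoffset field rather
 * than the start of the message, so data placed right after the full
 * header at byte sizeof(struct rndis_packet_msg) ends up with an
 * on-wire offset of sizeof(struct rndis_packet_msg) - 8.
 */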
/*
 * NOTE:
 * If this function fails, then both txd and m_head0 will be freed.
 */
static int
hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0)
{
    bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
    int error, nsegs, i;
    struct mbuf *m_head = *m_head0;
    struct rndis_packet_msg *pkt;
    uint32_t *pi_data;
    uint32_t send_buf_section_idx;
    int send_buf_section_size, pktlen;

    /*
     * extension points to the area reserved for the
     * rndis_filter_packet, which is placed just after
     * the netvsc_packet (and rppi struct, if present;
     * length is updated later).
     */
    pkt = txd->rndis_pkt;
    pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
    pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len;
    pkt->rm_dataoffset = sizeof(*pkt);
    pkt->rm_datalen = m_head->m_pkthdr.len;
    pkt->rm_pktinfooffset = sizeof(*pkt);
    pkt->rm_pktinfolen = 0;
    /*
     * Set the hash value for this packet, so that the host could
     * dispatch the TX done event for this packet back to this TX
     * ring's channel.
     */
    pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
        HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL);
    *pi_data = txr->hn_tx_idx;

    if (m_head->m_flags & M_VLANTAG) {
        pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
            NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
        *pi_data = NDIS_VLAN_INFO_MAKE(
            EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag),
            EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag),
            EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag));
    }
    if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
#if defined(INET6) || defined(INET)
        struct ether_vlan_header *eh;
        int ether_len;

        /*
         * XXX need m_pullup and use mtodo
         */
        eh = mtod(m_head, struct ether_vlan_header*);
        if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN))
            ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
        else
            ether_len = ETHER_HDR_LEN;

        pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
            NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO);
#ifdef INET
        if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
            struct ip *ip =
                (struct ip *)(m_head->m_data + ether_len);
            unsigned long iph_len = ip->ip_hl << 2;
            struct tcphdr *th =
                (struct tcphdr *)((caddr_t)ip + iph_len);

            ip->ip_len = 0;
            ip->ip_sum = 0;
            th->th_sum = in_pseudo(ip->ip_src.s_addr,
                ip->ip_dst.s_addr, htons(IPPROTO_TCP));
            *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0,
                m_head->m_pkthdr.tso_segsz);
        }
#endif
#if defined(INET6) && defined(INET)
        else
#endif
#ifdef INET6
        {
            struct ip6_hdr *ip6 = (struct ip6_hdr *)
                (m_head->m_data + ether_len);
            struct tcphdr *th = (struct tcphdr *)(ip6 + 1);

            ip6->ip6_plen = 0;
            th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
            *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0,
                m_head->m_pkthdr.tso_segsz);
        }
#endif
#endif /* INET6 || INET */
    } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) {
        pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN,
            NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
        *pi_data = NDIS_TXCSUM_INFO_IPV4;

        if (m_head->m_pkthdr.csum_flags & CSUM_IP)
            *pi_data |= NDIS_TXCSUM_INFO_IPCS;

        if (m_head->m_pkthdr.csum_flags & CSUM_TCP)
            *pi_data |= NDIS_TXCSUM_INFO_TCPCS;
        else if (m_head->m_pkthdr.csum_flags & CSUM_UDP)
            *pi_data |= NDIS_TXCSUM_INFO_UDPCS;
    }
    pktlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
    /* Convert RNDIS packet message offsets */
    pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset);
    pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset);

    /*
     * Chimney send, if the packet could fit into one chimney buffer.
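     *
     * A sketch of why, grounded in the code below: the chimney path
     * memcpy()s the RNDIS header and m_copydata()s the payload into a
     * pre-posted send-buffer section and sets hn_gpa_cnt to 0, so the
     * host can fetch the whole packet from shared memory without any
     * per-packet scatter/gather (GPA) descriptors -- a win only while
     * rm_len stays under the per-ring hn_chim_size threshold.
     */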
    if (pkt->rm_len < txr->hn_chim_size) {
        txr->hn_tx_chimney_tried++;
        send_buf_section_idx = hn_chim_alloc(txr->hn_sc);
        if (send_buf_section_idx != HN_NVS_CHIM_IDX_INVALID) {
            uint8_t *dest = txr->hn_sc->hn_chim +
                (send_buf_section_idx * txr->hn_sc->hn_chim_szmax);

            memcpy(dest, pkt, pktlen);
            dest += pktlen;
            m_copydata(m_head, 0, m_head->m_pkthdr.len, dest);

            send_buf_section_size = pkt->rm_len;
            txr->hn_gpa_cnt = 0;
            txr->hn_tx_chimney++;
            goto done;
        }
    }

    error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs);
    if (__predict_false(error)) {
        int freed;

        /*
         * This mbuf is not linked w/ the txd yet, so free it now.
         */
        m_freem(m_head);
        *m_head0 = NULL;

        freed = hn_txdesc_put(txr, txd);
        KASSERT(freed != 0,
            ("fail to free txd upon txdma error"));

        txr->hn_txdma_failed++;
        if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1);
        return error;
    }
    *m_head0 = m_head;

    /* +1 RNDIS packet message */
    txr->hn_gpa_cnt = nsegs + 1;

    /* send packet with page buffer */
    txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr);
    txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK;
    txr->hn_gpa[0].gpa_len = pktlen;

    /*
     * Fill the page buffers with mbuf info after the page
     * buffer for RNDIS packet message.
     */
    for (i = 0; i < nsegs; ++i) {
        struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1];

        gpa->gpa_page = atop(segs[i].ds_addr);
        gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK;
        gpa->gpa_len = segs[i].ds_len;
    }

    send_buf_section_idx = HN_NVS_CHIM_IDX_INVALID;
    send_buf_section_size = 0;
done:
    txd->m = m_head;

    /* Set the completion routine */
    hn_send_ctx_init(&txd->send_ctx, hn_tx_done, txd,
        send_buf_section_idx, send_buf_section_size);

    return 0;
}
/*
 * NOTE:
 * If this function fails, then txd will be freed, but the mbuf
 * associated w/ the txd will _not_ be freed.
 */
static int
hn_send_pkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd)
{
    int error, send_failed = 0;

again:
    /*
     * Make sure that txd is not freed before ETHER_BPF_MTAP.
     */
    hn_txdesc_hold(txd);
    error = hv_nv_on_send(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA,
        &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt);
    if (!error) {
        ETHER_BPF_MTAP(ifp, txd->m);
        if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
        if (!hn_use_if_start) {
            if_inc_counter(ifp, IFCOUNTER_OBYTES,
                txd->m->m_pkthdr.len);
            if (txd->m->m_flags & M_MCAST)
                if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
        }
    }
    hn_txdesc_put(txr, txd);

    if (__predict_false(error)) {
        int freed;

        /*
         * This should "really rarely" happen.
         *
         * XXX Too many RX to be acked or too many sideband
         * commands to run?  Ask netvsc_channel_rollup()
         * to kick start later.
         */
        txr->hn_has_txeof = 1;
        if (!send_failed) {
            txr->hn_send_failed++;
            send_failed = 1;
            /*
             * Try sending again after setting hn_has_txeof,
             * in case that we missed the last
             * netvsc_channel_rollup().
             */
            goto again;
        }
        if_printf(ifp, "send failed\n");

        /*
         * Caller will perform further processing on the
         * associated mbuf, so don't free it in hn_txdesc_put();
         * only unload it from the DMA map in hn_txdesc_put(),
         * if it was loaded.
         */
        txd->m = NULL;
        freed = hn_txdesc_put(txr, txd);
        KASSERT(freed != 0,
            ("fail to free txd upon send error"));

        txr->hn_send_failed++;
    }
    return error;
}
/*
 * Start a transmit of one or more packets
 */
static int
hn_start_locked(struct hn_tx_ring *txr, int len)
{
    struct hn_softc *sc = txr->hn_sc;
    struct ifnet *ifp = sc->hn_ifp;

    KASSERT(hn_use_if_start,
        ("hn_start_locked is called, when if_start is disabled"));
    KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));
    mtx_assert(&txr->hn_tx_lock, MA_OWNED);

    if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
        IFF_DRV_RUNNING)
        return 0;

    while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
        struct hn_txdesc *txd;
        struct mbuf *m_head;
        int error;

        IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
        if (m_head == NULL)
            break;

        if (len > 0 && m_head->m_pkthdr.len > len) {
            /*
             * This sending could be time consuming; let callers
             * dispatch this packet sending (and sending of any
             * follow-up packets) to tx taskqueue.
             */
            IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
            return 1;
        }

        txd = hn_txdesc_get(txr);
        if (txd == NULL) {
            txr->hn_no_txdescs++;
            IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
            atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
            break;
        }

        error = hn_encap(txr, txd, &m_head);
        if (error) {
            /* Both txd and m_head are freed */
            continue;
        }

        error = hn_send_pkt(ifp, txr, txd);
        if (__predict_false(error)) {
            /* txd is freed, but m_head is not */
            IFQ_DRV_PREPEND(&ifp->if_snd, m_head);
            atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
            break;
        }
    }
    return 0;
}
/*
 * Link up/down notification
 */
void
netvsc_linkstatus_callback(struct hn_softc *sc, uint32_t status)
{
    if (status == 1)
        sc->hn_carrier = 1;
    else
        sc->hn_carrier = 0;
}

/*
 * Append the specified data to the indicated mbuf chain.
 * Extend the mbuf chain if the new data does not fit in
 * existing space.
 *
 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
 * There should be an equivalent in the kernel mbuf code,
 * but there does not appear to be one yet.
 *
 * Differs from m_append() in that additional mbufs are
 * allocated with cluster size MJUMPAGESIZE, and filled
 * accordingly.
 *
 * Return 1 if able to complete the job; otherwise 0.
 */
static int
hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
{
    struct mbuf *m, *n;
    int remainder, space;

    for (m = m0; m->m_next != NULL; m = m->m_next)
        ;
    remainder = len;
    space = M_TRAILINGSPACE(m);
    if (space > 0) {
        /*
         * Copy into available space.
         */
        if (space > remainder)
            space = remainder;
        bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
        m->m_len += space;
        cp += space;
        remainder -= space;
    }
    while (remainder > 0) {
        /*
         * Allocate a new mbuf; could check space
         * and allocate a cluster instead.
         */
        n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
        if (n == NULL)
            break;
        n->m_len = min(MJUMPAGESIZE, remainder);
        bcopy(cp, mtod(n, caddr_t), n->m_len);
        cp += n->m_len;
        remainder -= n->m_len;
        m->m_next = n;
        m = n;
    }
    if (m0->m_flags & M_PKTHDR)
        m0->m_pkthdr.len += len - remainder;

    return (remainder == 0);
}
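/*
 * Usage in this file: the RX path hands host data straight to this
 * helper, e.g. hv_m_append(m_new, dlen, data) in hn_rxpkt() below, so
 * a receive larger than the initial cluster simply grows the chain in
 * MJUMPAGESIZE chunks.
 */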
#if defined(INET) || defined(INET6)
static __inline int
hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m)
{
#if __FreeBSD_version >= 1100095
    if (hn_lro_mbufq_depth) {
        tcp_lro_queue_mbuf(lc, m);
        return 0;
    }
#endif
    return tcp_lro_rx(lc, m, 0);
}
#endif
/*
 * Called when we receive a data packet from the "wire" on the
 * specified device
 *
 * Note: This is no longer used as a callback
 */
int
hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
    const struct hn_recvinfo *info)
{
    struct ifnet *ifp = rxr->hn_ifp;
    struct mbuf *m_new;
    int size, do_lro = 0, do_csum = 1;
    int hash_type = M_HASHTYPE_OPAQUE;

    if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
        return (0);

    /*
     * Bail out if packet contains more data than configured MTU.
     */
    if (dlen > (ifp->if_mtu + ETHER_HDR_LEN)) {
        return (0);
    } else if (dlen <= MHLEN) {
        m_new = m_gethdr(M_NOWAIT, MT_DATA);
        if (m_new == NULL) {
            if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
            return (0);
        }
        memcpy(mtod(m_new, void *), data, dlen);
        m_new->m_pkthdr.len = m_new->m_len = dlen;
        rxr->hn_small_pkts++;
    } else {
        /*
         * Get an mbuf with a cluster.  For packets 2K or less,
         * get a standard 2K cluster.  For anything larger, get a
         * 4K cluster.  Any buffers larger than 4K can cause problems
         * if looped around to the Hyper-V TX channel, so avoid them.
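         *
         * For example (assuming the usual 2K MCLBYTES and 4K
         * MJUMPAGESIZE values): a 1514-byte full-size Ethernet frame
         * lands in a 2K cluster, while a host-coalesced 4000-byte
         * receive gets a single 4K page-sized cluster.
         */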
        if (dlen > MCLBYTES) {
            /* 4096 */
            size = MJUMPAGESIZE;
        } else {
            size = MCLBYTES;
        }

        m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
        if (m_new == NULL) {
            if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
            return (0);
        }

        hv_m_append(m_new, dlen, data);
    }
    m_new->m_pkthdr.rcvif = ifp;
    if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
        do_csum = 0;

    /* receive side checksum offload */
    if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) {
        /* IP csum offload */
        if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) {
            m_new->m_pkthdr.csum_flags |=
                (CSUM_IP_CHECKED | CSUM_IP_VALID);
            rxr->hn_csum_ip++;
        }

        /* TCP/UDP csum offload */
        if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK |
             NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) {
            m_new->m_pkthdr.csum_flags |=
                (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
            m_new->m_pkthdr.csum_data = 0xffff;
            if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK)
                rxr->hn_csum_tcp++;
            else
                rxr->hn_csum_udp++;
        }

        if ((info->csum_info &
             (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) ==
            (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK))
            do_lro = 1;
    } else {
        const struct ether_header *eh;
        uint16_t etype;
        int hoff;

        hoff = sizeof(*eh);
        if (m_new->m_len < hoff)
            goto skip;
        eh = mtod(m_new, struct ether_header *);
        etype = ntohs(eh->ether_type);
        if (etype == ETHERTYPE_VLAN) {
            const struct ether_vlan_header *evl;

            hoff = sizeof(*evl);
            if (m_new->m_len < hoff)
                goto skip;
            evl = mtod(m_new, struct ether_vlan_header *);
            etype = ntohs(evl->evl_proto);
        }

        if (etype == ETHERTYPE_IP) {
            int pr;

            pr = hn_check_iplen(m_new, hoff);
            if (pr == IPPROTO_TCP) {
                if (do_csum &&
                    (rxr->hn_trust_hcsum &
                     HN_TRUST_HCSUM_TCP)) {
                    rxr->hn_csum_trusted++;
                    m_new->m_pkthdr.csum_flags |=
                       (CSUM_IP_CHECKED | CSUM_IP_VALID |
                        CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
                    m_new->m_pkthdr.csum_data = 0xffff;
                }
                do_lro = 1;
            } else if (pr == IPPROTO_UDP) {
                if (do_csum &&
                    (rxr->hn_trust_hcsum &
                     HN_TRUST_HCSUM_UDP)) {
                    rxr->hn_csum_trusted++;
                    m_new->m_pkthdr.csum_flags |=
                       (CSUM_IP_CHECKED | CSUM_IP_VALID |
                        CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
                    m_new->m_pkthdr.csum_data = 0xffff;
                }
            } else if (pr != IPPROTO_DONE && do_csum &&
                (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
                rxr->hn_csum_trusted++;
                m_new->m_pkthdr.csum_flags |=
                    (CSUM_IP_CHECKED | CSUM_IP_VALID);
            }
        }
    }
skip:
    if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) {
        m_new->m_pkthdr.ether_vtag = EVL_MAKETAG(
            NDIS_VLAN_INFO_ID(info->vlan_info),
            NDIS_VLAN_INFO_PRI(info->vlan_info),
            NDIS_VLAN_INFO_CFI(info->vlan_info));
        m_new->m_flags |= M_VLANTAG;
    }

    if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
        rxr->hn_rss_pkts++;
        m_new->m_pkthdr.flowid = info->hash_value;
        if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) ==
            NDIS_HASH_FUNCTION_TOEPLITZ) {
            uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK);

            switch (type) {
            case NDIS_HASH_IPV4:
                hash_type = M_HASHTYPE_RSS_IPV4;
                break;

            case NDIS_HASH_TCP_IPV4:
                hash_type = M_HASHTYPE_RSS_TCP_IPV4;
                break;

            case NDIS_HASH_IPV6:
                hash_type = M_HASHTYPE_RSS_IPV6;
                break;

            case NDIS_HASH_IPV6_EX:
                hash_type = M_HASHTYPE_RSS_IPV6_EX;
                break;

            case NDIS_HASH_TCP_IPV6:
                hash_type = M_HASHTYPE_RSS_TCP_IPV6;
                break;

            case NDIS_HASH_TCP_IPV6_EX:
                hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX;
                break;
            }
        }
    } else {
        m_new->m_pkthdr.flowid = rxr->hn_rx_idx;
    }
    M_HASHTYPE_SET(m_new, hash_type);
    /*
     * Note:  Moved RX completion back to hv_nv_on_receive() so all
     * messages (not just data messages) will trigger a response.
     */

    if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
#if defined(INET) || defined(INET6)
        struct lro_ctrl *lro = &rxr->hn_lro;

        if (lro->lro_cnt) {
            rxr->hn_lro_tried++;
            if (hn_lro_rx(lro, m_new) == 0)
                return 0;
        }
#endif
    }

    /* We're not holding the lock here, so don't release it */
    (*ifp->if_input)(ifp, m_new);
    return (0);
}
static int
hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
    struct hn_softc *sc = ifp->if_softc;
    struct ifreq *ifr = (struct ifreq *)data;
    int mask, error = 0;

    switch (cmd) {
    case SIOCSIFMTU:
        if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) {
            error = EINVAL;
            break;
        }

        /* Check MTU value change */
        if (ifp->if_mtu == ifr->ifr_mtu) {
            error = 0;
            break;
        }

        /* Obtain and record requested MTU */
        ifp->if_mtu = ifr->ifr_mtu;

#if __FreeBSD_version >= 1100099
        /*
         * Make sure that LRO aggregation length limit is still
         * valid, after the MTU change.
         */
        if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
            HN_LRO_LENLIM_MIN(ifp))
            hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
#endif

        /*
         * We must remove and add back the device to cause the new
         * MTU to take effect.  This includes tearing down, but not
         * deleting the channel, then bringing it back up.
         */
        error = hv_rf_on_device_remove(sc);

        /*
         * Detach all of the channels.
         */
        hn_detach_allchans(sc);

        /*
         * Attach the synthetic parts, i.e. NVS and RNDIS.
         */
        hn_synth_attach(sc, ifr->ifr_mtu);

        if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
            hn_set_chim_size(sc, sc->hn_chim_szmax);

        hn_init_locked(sc);
        break;

    case SIOCSIFFLAGS:
        if (ifp->if_flags & IFF_UP) {
            /*
             * If only the state of the PROMISC flag changed,
             * then just use the 'set promisc mode' command
             * instead of reinitializing the entire NIC.  Doing
             * a full re-init means reloading the firmware and
             * waiting for it to start up, which may take a
             * second or two.
             */
            /* Fixme:  Promiscuous mode? */
            if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
                ifp->if_flags & IFF_PROMISC &&
                !(sc->hn_if_flags & IFF_PROMISC)) {
                /* do something here for Hyper-V */
            } else if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
                !(ifp->if_flags & IFF_PROMISC) &&
                sc->hn_if_flags & IFF_PROMISC) {
                /* do something here for Hyper-V */
            } else {
                hn_init_locked(sc);
            }
        } else {
            if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
                hn_stop(sc);
            }
        }
        sc->hn_if_flags = ifp->if_flags;
        error = 0;
        break;

    case SIOCSIFCAP:
        mask = ifr->ifr_reqcap ^ ifp->if_capenable;
        if (mask & IFCAP_TXCSUM) {
            ifp->if_capenable ^= IFCAP_TXCSUM;
            if (ifp->if_capenable & IFCAP_TXCSUM) {
                ifp->if_hwassist |=
                    sc->hn_tx_ring[0].hn_csum_assist;
            } else {
                ifp->if_hwassist &=
                    ~sc->hn_tx_ring[0].hn_csum_assist;
            }
        }

        if (mask & IFCAP_RXCSUM)
            ifp->if_capenable ^= IFCAP_RXCSUM;

        if (mask & IFCAP_LRO)
            ifp->if_capenable ^= IFCAP_LRO;

        if (mask & IFCAP_TSO4) {
            ifp->if_capenable ^= IFCAP_TSO4;
            if (ifp->if_capenable & IFCAP_TSO4)
                ifp->if_hwassist |= CSUM_IP_TSO;
            else
                ifp->if_hwassist &= ~CSUM_IP_TSO;
        }

        if (mask & IFCAP_TSO6) {
            ifp->if_capenable ^= IFCAP_TSO6;
            if (ifp->if_capenable & IFCAP_TSO6)
                ifp->if_hwassist |= CSUM_IP6_TSO;
            else
                ifp->if_hwassist &= ~CSUM_IP6_TSO;
        }

        error = 0;
        break;

    case SIOCADDMULTI:
    case SIOCDELMULTI:
        /* Always all-multi */
        /*
         * TODO:
         * Enable/disable all-multi according to the emptiness of
         * the mcast address list.
         */
        error = 0;
        break;

    case SIOCGIFMEDIA:
    case SIOCSIFMEDIA:
        error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
        break;

    default:
        error = ether_ioctl(ifp, cmd, data);
        break;
    }
    return (error);
}
static void
hn_stop(struct hn_softc *sc)
{
    struct ifnet *ifp;
    int ret, i;

    ifp = sc->hn_ifp;

    if (bootverbose)
        printf(" Closing Device ...\n");

    atomic_clear_int(&ifp->if_drv_flags,
        (IFF_DRV_RUNNING | IFF_DRV_OACTIVE));
    for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
        sc->hn_tx_ring[i].hn_oactive = 0;

    if_link_state_change(ifp, LINK_STATE_DOWN);

    ret = hv_rf_on_close(sc);
}

/*
 * FreeBSD transmit entry point
 */
static void
hn_start(struct ifnet *ifp)
{
    struct hn_softc *sc = ifp->if_softc;
    struct hn_tx_ring *txr = &sc->hn_tx_ring[0];

    if (txr->hn_sched_tx)
        goto do_sched;

    if (mtx_trylock(&txr->hn_tx_lock)) {
        int sched;

        sched = hn_start_locked(txr, txr->hn_direct_tx_size);
        mtx_unlock(&txr->hn_tx_lock);
        if (!sched)
            return;
    }
do_sched:
    taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
}
static void
hn_start_txeof(struct hn_tx_ring *txr)
{
    struct hn_softc *sc = txr->hn_sc;
    struct ifnet *ifp = sc->hn_ifp;

    KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring"));

    if (txr->hn_sched_tx)
        goto do_sched;

    if (mtx_trylock(&txr->hn_tx_lock)) {
        int sched;

        atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
        sched = hn_start_locked(txr, txr->hn_direct_tx_size);
        mtx_unlock(&txr->hn_tx_lock);
        if (sched) {
            taskqueue_enqueue(txr->hn_tx_taskq,
                &txr->hn_tx_task);
        }
    } else {
do_sched:
        /*
         * Release the OACTIVE earlier, with the hope that
         * others could catch up.  The task will clear the
         * flag again with the hn_tx_lock to avoid possible
         * races.
         */
        atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
        taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
    }
}
static void
hn_init_locked(struct hn_softc *sc)
{
    struct ifnet *ifp = sc->hn_ifp;
    int ret, i;

    if (ifp->if_drv_flags & IFF_DRV_RUNNING)
        return;

    hv_promisc_mode = 1;

    ret = hv_rf_on_open(sc);
    if (ret != 0)
        return;

    atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
    for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
        sc->hn_tx_ring[i].hn_oactive = 0;

    atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
    if_link_state_change(ifp, LINK_STATE_UP);
}

static void
hn_init(void *xsc)
{
    struct hn_softc *sc = xsc;

    hn_init_locked(sc);
}

#ifdef LATER
static void
hn_watchdog(struct ifnet *ifp)
{
    if_printf(ifp, "watchdog timeout -- resetting\n");
    hn_init(ifp->if_softc);    /* XXX */
}
#endif
#if __FreeBSD_version >= 1100099

static int
hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    unsigned int lenlim;
    int error;

    lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim;
    error = sysctl_handle_int(oidp, &lenlim, 0, req);
    if (error || req->newptr == NULL)
        return error;

    if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) ||
        lenlim > TCP_LRO_LENGTH_MAX)
        return EINVAL;

    hn_set_lro_lenlim(sc, lenlim);
    return 0;
}

static int
hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ackcnt, error, i;

    /*
     * lro_ackcnt_lim is the append count limit;
     * +1 turns it into the aggregation limit.
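     *
     * For instance, with the default HN_LRO_ACKCNT_DEF of 1 this
     * sysctl reports 2: at most two consecutive pure ACKs are merged
     * before the LRO entry is flushed.
     */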
    ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1;
    error = sysctl_handle_int(oidp, &ackcnt, 0, req);
    if (error || req->newptr == NULL)
        return error;

    if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1))
        return EINVAL;

    /*
     * Convert aggregation limit back to append
     * count limit.
     */
    --ackcnt;
    for (i = 0; i < sc->hn_rx_ring_inuse; ++i)
        sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt;
    return 0;
}

#endif

static int
hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int hcsum = arg2;
    int on, error, i;

    on = 0;
    if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum)
        on = 1;

    error = sysctl_handle_int(oidp, &on, 0, req);
    if (error || req->newptr == NULL)
        return error;

    for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
        struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];

        if (on)
            rxr->hn_trust_hcsum |= hcsum;
        else
            rxr->hn_trust_hcsum &= ~hcsum;
    }
    return 0;
}

static int
hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int chim_size, error;

    chim_size = sc->hn_tx_ring[0].hn_chim_size;
    error = sysctl_handle_int(oidp, &chim_size, 0, req);
    if (error || req->newptr == NULL)
        return error;

    if (chim_size > sc->hn_chim_szmax || chim_size <= 0)
        return EINVAL;

    hn_set_chim_size(sc, chim_size);
    return 0;
}
#if __FreeBSD_version < 1100095
static int
hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ofs = arg2, i, error;
    struct hn_rx_ring *rxr;
    uint64_t stat;

    stat = 0;
    for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
        rxr = &sc->hn_rx_ring[i];
        stat += *((int *)((uint8_t *)rxr + ofs));
    }

    error = sysctl_handle_64(oidp, &stat, 0, req);
    if (error || req->newptr == NULL)
        return error;

    /* Zero out this stat. */
    for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
        rxr = &sc->hn_rx_ring[i];
        *((int *)((uint8_t *)rxr + ofs)) = 0;
    }
    return 0;
}
#else
static int
hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ofs = arg2, i, error;
    struct hn_rx_ring *rxr;
    uint64_t stat;

    stat = 0;
    for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
        rxr = &sc->hn_rx_ring[i];
        stat += *((uint64_t *)((uint8_t *)rxr + ofs));
    }

    error = sysctl_handle_64(oidp, &stat, 0, req);
    if (error || req->newptr == NULL)
        return error;

    /* Zero out this stat. */
    for (i = 0; i < sc->hn_rx_ring_inuse; ++i) {
        rxr = &sc->hn_rx_ring[i];
        *((uint64_t *)((uint8_t *)rxr + ofs)) = 0;
    }
    return 0;
}
#endif

static int
hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ofs = arg2, i, error;
    struct hn_rx_ring *rxr;
    u_long stat;

    stat = 0;
    for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
        rxr = &sc->hn_rx_ring[i];
        stat += *((u_long *)((uint8_t *)rxr + ofs));
    }

    error = sysctl_handle_long(oidp, &stat, 0, req);
    if (error || req->newptr == NULL)
        return error;

    /* Zero out this stat. */
    for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
        rxr = &sc->hn_rx_ring[i];
        *((u_long *)((uint8_t *)rxr + ofs)) = 0;
    }
    return 0;
}

static int
hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ofs = arg2, i, error;
    struct hn_tx_ring *txr;
    u_long stat;

    stat = 0;
    for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
        txr = &sc->hn_tx_ring[i];
        stat += *((u_long *)((uint8_t *)txr + ofs));
    }

    error = sysctl_handle_long(oidp, &stat, 0, req);
    if (error || req->newptr == NULL)
        return error;

    /* Zero out this stat. */
    for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
        txr = &sc->hn_tx_ring[i];
        *((u_long *)((uint8_t *)txr + ofs)) = 0;
    }
    return 0;
}

static int
hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    int ofs = arg2, i, error, conf;
    struct hn_tx_ring *txr;

    txr = &sc->hn_tx_ring[0];
    conf = *((int *)((uint8_t *)txr + ofs));

    error = sysctl_handle_int(oidp, &conf, 0, req);
    if (error || req->newptr == NULL)
        return error;

    for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
        txr = &sc->hn_tx_ring[i];
        *((int *)((uint8_t *)txr + ofs)) = conf;
    }
    return 0;
}

static int
hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS)
{
    struct hn_softc *sc = arg1;
    char verstr[16];

    snprintf(verstr, sizeof(verstr), "%u.%u",
        HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver),
        HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver));
    return sysctl_handle_string(oidp, verstr, sizeof(verstr), req);
}
static int
hn_check_iplen(const struct mbuf *m, int hoff)
{
    const struct ip *ip;
    int len, iphlen, iplen;
    const struct tcphdr *th;
    int thoff;                /* TCP data offset */

    len = hoff + sizeof(struct ip);

    /* The packet must be at least the size of an IP header. */
    if (m->m_pkthdr.len < len)
        return IPPROTO_DONE;

    /* The fixed IP header must reside completely in the first mbuf. */
    if (m->m_len < len)
        return IPPROTO_DONE;

    ip = mtodo(m, hoff);

    /* Bound check the packet's stated IP header length. */
    iphlen = ip->ip_hl << 2;
    if (iphlen < sizeof(struct ip))        /* minimum header length */
        return IPPROTO_DONE;

    /* The full IP header must reside completely in the one mbuf. */
    if (m->m_len < hoff + iphlen)
        return IPPROTO_DONE;

    iplen = ntohs(ip->ip_len);

    /*
     * Check that the amount of data in the buffers is at least as much
     * as the IP header would have us expect.
     */
    if (m->m_pkthdr.len < hoff + iplen)
        return IPPROTO_DONE;

    /*
     * Ignore IP fragments.
     */
    if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
        return IPPROTO_DONE;

    /*
     * The TCP/IP or UDP/IP header must be entirely contained within
     * the first fragment of a packet.
     */
    switch (ip->ip_p) {
    case IPPROTO_TCP:
        if (iplen < iphlen + sizeof(struct tcphdr))
            return IPPROTO_DONE;
        if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
            return IPPROTO_DONE;
        th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
        thoff = th->th_off << 2;
        if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
            return IPPROTO_DONE;
        if (m->m_len < hoff + iphlen + thoff)
            return IPPROTO_DONE;
        break;
    case IPPROTO_UDP:
        if (iplen < iphlen + sizeof(struct udphdr))
            return IPPROTO_DONE;
        if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
            return IPPROTO_DONE;
        break;
    default:
        if (iplen < iphlen)
            return IPPROTO_DONE;
        break;
    }
    return ip->ip_p;
}
static int
hn_create_rx_data(struct hn_softc *sc, int ring_cnt)
{
    struct sysctl_oid_list *child;
    struct sysctl_ctx_list *ctx;
    device_t dev = sc->hn_dev;
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
    int lroent_cnt;
#endif
#endif
    int i;

    /*
     * Create RXBUF for reception.
     *
     * NOTE:
     * - It is shared by all channels.
     * - A large enough buffer is allocated, certain versions of the
     *   NVS may further limit the usable space.
     */
    sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev),
        PAGE_SIZE, 0, NETVSC_RECEIVE_BUFFER_SIZE, &sc->hn_rxbuf_dma,
        BUS_DMA_WAITOK | BUS_DMA_ZERO);
    if (sc->hn_rxbuf == NULL) {
        device_printf(sc->hn_dev, "allocate rxbuf failed\n");
        return (ENOMEM);
    }

    sc->hn_rx_ring_cnt = ring_cnt;
    sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt;

    sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt,
        M_NETVSC, M_WAITOK | M_ZERO);

#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
    lroent_cnt = hn_lro_entry_count;
    if (lroent_cnt < TCP_LRO_ENTRIES)
        lroent_cnt = TCP_LRO_ENTRIES;
    device_printf(dev, "LRO: entry count %d\n", lroent_cnt);
#endif
#endif /* INET || INET6 */
    ctx = device_get_sysctl_ctx(dev);
    child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));

    /* Create dev.hn.UNIT.rx sysctl tree */
    sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx",
        CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

    for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
        struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];

        if (hn_trust_hosttcp)
            rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
        if (hn_trust_hostudp)
            rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
        if (hn_trust_hostip)
            rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;
        rxr->hn_ifp = sc->hn_ifp;
        if (i < sc->hn_tx_ring_cnt)
            rxr->hn_txr = &sc->hn_tx_ring[i];
        rxr->hn_rdbuf = malloc(NETVSC_PACKET_SIZE, M_NETVSC, M_WAITOK);
        rxr->hn_rx_idx = i;
        rxr->hn_rxbuf = sc->hn_rxbuf;

        /*
         * Initialize LRO.
         */
#if defined(INET) || defined(INET6)
#if __FreeBSD_version >= 1100095
        tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt,
            hn_lro_mbufq_depth);
#else
        tcp_lro_init(&rxr->hn_lro);
        rxr->hn_lro.ifp = sc->hn_ifp;
#endif
#if __FreeBSD_version >= 1100099
        rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF;
        rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF;
#endif
#endif /* INET || INET6 */

        if (sc->hn_rx_sysctl_tree != NULL) {
            char name[16];

            /*
             * Create per RX ring sysctl tree:
             * dev.hn.UNIT.rx.RINGID
             */
            snprintf(name, sizeof(name), "%d", i);
            rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx,
                SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree),
                OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");

            if (rxr->hn_rx_sysctl_tree != NULL) {
                SYSCTL_ADD_ULONG(ctx,
                    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
                    OID_AUTO, "packets", CTLFLAG_RW,
                    &rxr->hn_pkts, "# of packets received");
                SYSCTL_ADD_ULONG(ctx,
                    SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree),
                    OID_AUTO, "rss_pkts", CTLFLAG_RW,
                    &rxr->hn_rss_pkts,
                    "# of packets w/ RSS info received");
            }
        }
    }
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued",
        CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
        __offsetof(struct hn_rx_ring, hn_lro.lro_queued),
#if __FreeBSD_version < 1100095
        hn_rx_stat_int_sysctl,
#else
        hn_rx_stat_u64_sysctl,
#endif
        "LU", "LRO queued");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed",
        CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
        __offsetof(struct hn_rx_ring, hn_lro.lro_flushed),
#if __FreeBSD_version < 1100095
        hn_rx_stat_int_sysctl,
#else
        hn_rx_stat_u64_sysctl,
#endif
        "LU", "LRO flushed");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried",
        CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
        __offsetof(struct hn_rx_ring, hn_lro_tried),
        hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries");
#if __FreeBSD_version >= 1100099
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim",
        CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
        hn_lro_lenlim_sysctl, "IU",
        "Max # of data bytes to be aggregated by LRO");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim",
        CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
        hn_lro_ackcnt_sysctl, "I",
        "Max # of ACKs to be aggregated by LRO");
#endif
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
        CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP,
        hn_trust_hcsum_sysctl, "I",
        "Trust tcp segment verification on host side, "
        "when csum info is missing");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
        CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP,
        hn_trust_hcsum_sysctl, "I",
        "Trust udp datagram verification on host side, "
        "when csum info is missing");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
        CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP,
        hn_trust_hcsum_sysctl, "I",
        "Trust ip packet verification on host side, "
        "when csum info is missing");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip",
        CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
        __offsetof(struct hn_rx_ring, hn_csum_ip),
        hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp",
        CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
        __offsetof(struct hn_rx_ring, hn_csum_tcp),
        hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp",
        CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
        __offsetof(struct hn_rx_ring, hn_csum_udp),
        hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted",
        CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
        __offsetof(struct hn_rx_ring, hn_csum_trusted),
        hn_rx_stat_ulong_sysctl, "LU",
        "# of packets that we trust host's csum verification");
    SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts",
        CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
        __offsetof(struct hn_rx_ring, hn_small_pkts),
        hn_rx_stat_ulong_sysctl, "LU", "# of small packets received");
    SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt",
        CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings");
    SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse",
        CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings");

    return (0);
}
static void
hn_destroy_rx_data(struct hn_softc *sc)
{
    int i;

    if (sc->hn_rxbuf != NULL) {
        hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf);
        sc->hn_rxbuf = NULL;
    }

    if (sc->hn_rx_ring_cnt == 0)
        return;

    for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
        struct hn_rx_ring *rxr = &sc->hn_rx_ring[i];

#if defined(INET) || defined(INET6)
        tcp_lro_free(&rxr->hn_lro);
#endif
        free(rxr->hn_rdbuf, M_NETVSC);
    }
    free(sc->hn_rx_ring, M_NETVSC);
    sc->hn_rx_ring = NULL;

    sc->hn_rx_ring_cnt = 0;
    sc->hn_rx_ring_inuse = 0;
}
2343 hn_create_tx_ring(struct hn_softc *sc, int id)
2345 struct hn_tx_ring *txr = &sc->hn_tx_ring[id];
2346 device_t dev = sc->hn_dev;
2347 bus_dma_tag_t parent_dtag;
2352 txr->hn_tx_idx = id;
2354 #ifndef HN_USE_TXDESC_BUFRING
2355 mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
2357 mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF);
2359 txr->hn_txdesc_cnt = HN_TX_DESC_CNT;
2360 txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt,
2361 M_NETVSC, M_WAITOK | M_ZERO);
2362 #ifndef HN_USE_TXDESC_BUFRING
2363 SLIST_INIT(&txr->hn_txlist);
2365 txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_NETVSC,
2366 M_WAITOK, &txr->hn_tx_lock);
2369 txr->hn_tx_taskq = sc->hn_tx_taskq;
2371 if (hn_use_if_start) {
2372 txr->hn_txeof = hn_start_txeof;
2373 TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr);
2374 TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr);
2378 txr->hn_txeof = hn_xmit_txeof;
2379 TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr);
2380 TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr);
2382 br_depth = hn_get_txswq_depth(txr);
2383 txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_NETVSC,
2384 M_WAITOK, &txr->hn_tx_lock);
2387 txr->hn_direct_tx_size = hn_direct_tx_size;
2388 version = VMBUS_GET_VERSION(device_get_parent(dev), dev);
2389 if (version >= VMBUS_VERSION_WIN8_1) {
2390 txr->hn_csum_assist = HN_CSUM_ASSIST;
2392 txr->hn_csum_assist = HN_CSUM_ASSIST_WIN8;
2394 device_printf(dev, "bus version %u.%u, "
2395 "no UDP checksum offloading\n",
2396 VMBUS_VERSION_MAJOR(version),
2397 VMBUS_VERSION_MINOR(version));
2402 * Always schedule transmission instead of attempting direct
2403 * transmission; this has given the best performance so far.
2405 txr->hn_sched_tx = 1;
2407 parent_dtag = bus_get_dma_tag(dev);
2409 /* DMA tag for RNDIS packet messages. */
2410 error = bus_dma_tag_create(parent_dtag, /* parent */
2411 HN_RNDIS_PKT_ALIGN, /* alignment */
2412 HN_RNDIS_PKT_BOUNDARY, /* boundary */
2413 BUS_SPACE_MAXADDR, /* lowaddr */
2414 BUS_SPACE_MAXADDR, /* highaddr */
2415 NULL, NULL, /* filter, filterarg */
2416 HN_RNDIS_PKT_LEN, /* maxsize */
2418 HN_RNDIS_PKT_LEN, /* maxsegsize */
2420 NULL, /* lockfunc */
2421 NULL, /* lockfuncarg */
2422 &txr->hn_tx_rndis_dtag);
2424 device_printf(dev, "failed to create rndis dmatag\n");
2428 /* DMA tag for data. */
2429 error = bus_dma_tag_create(parent_dtag, /* parent */
2431 HN_TX_DATA_BOUNDARY, /* boundary */
2432 BUS_SPACE_MAXADDR, /* lowaddr */
2433 BUS_SPACE_MAXADDR, /* highaddr */
2434 NULL, NULL, /* filter, filterarg */
2435 HN_TX_DATA_MAXSIZE, /* maxsize */
2436 HN_TX_DATA_SEGCNT_MAX, /* nsegments */
2437 HN_TX_DATA_SEGSIZE, /* maxsegsize */
2439 NULL, /* lockfunc */
2440 NULL, /* lockfuncarg */
2441 &txr->hn_tx_data_dtag);
2443 device_printf(dev, "failed to create data dmatag\n");
2447 for (i = 0; i < txr->hn_txdesc_cnt; ++i) {
2448 struct hn_txdesc *txd = &txr->hn_txdesc[i];
2453 * Allocate and load RNDIS packet message.
2455 error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag,
2456 (void **)&txd->rndis_pkt,
2457 BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO,
2458 &txd->rndis_pkt_dmap);
2461 "failed to allocate rndis_packet_msg, %d\n", i);
2465 error = bus_dmamap_load(txr->hn_tx_rndis_dtag,
2466 txd->rndis_pkt_dmap,
2467 txd->rndis_pkt, HN_RNDIS_PKT_LEN,
2468 hyperv_dma_map_paddr, &txd->rndis_pkt_paddr,
2472 "failed to load rndis_packet_msg, %d\n", i);
2473 bus_dmamem_free(txr->hn_tx_rndis_dtag,
2474 txd->rndis_pkt, txd->rndis_pkt_dmap);
2478 /* DMA map for TX data. */
2479 error = bus_dmamap_create(txr->hn_tx_data_dtag, 0,
2483 "failed to allocate tx data dmamap\n");
2484 bus_dmamap_unload(txr->hn_tx_rndis_dtag,
2485 txd->rndis_pkt_dmap);
2486 bus_dmamem_free(txr->hn_tx_rndis_dtag,
2487 txd->rndis_pkt, txd->rndis_pkt_dmap);
2491 /* All set; put it on the list. */
2492 txd->flags |= HN_TXD_FLAG_ONLIST;
2493 #ifndef HN_USE_TXDESC_BUFRING
2494 SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link);
2496 buf_ring_enqueue(txr->hn_txdesc_br, txd);
2499 txr->hn_txdesc_avail = txr->hn_txdesc_cnt;
2501 if (sc->hn_tx_sysctl_tree != NULL) {
2502 struct sysctl_oid_list *child;
2503 struct sysctl_ctx_list *ctx;
2507 * Create per TX ring sysctl tree:
2508 * dev.hn.UNIT.tx.RINGID
2510 ctx = device_get_sysctl_ctx(dev);
2511 child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree);
2513 snprintf(name, sizeof(name), "%d", id);
2514 txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
2515 name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
2517 if (txr->hn_tx_sysctl_tree != NULL) {
2518 child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree);
2520 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
2521 CTLFLAG_RD, &txr->hn_txdesc_avail, 0,
2522 "# of available TX descs");
2523 if (!hn_use_if_start) {
2524 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive",
2525 CTLFLAG_RD, &txr->hn_oactive, 0,
2528 SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets",
2529 CTLFLAG_RW, &txr->hn_pkts,
2530 "# of packets transmitted");
2538 hn_txdesc_dmamap_destroy(struct hn_txdesc *txd)
2540 struct hn_tx_ring *txr = txd->txr;
2542 KASSERT(txd->m == NULL, ("still has mbuf installed"));
2543 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped"));
2545 bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap);
2546 bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt,
2547 txd->rndis_pkt_dmap);
2548 bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap);
2552 hn_destroy_tx_ring(struct hn_tx_ring *txr)
2554 struct hn_txdesc *txd;
2556 if (txr->hn_txdesc == NULL)
2559 #ifndef HN_USE_TXDESC_BUFRING
2560 while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) {
2561 SLIST_REMOVE_HEAD(&txr->hn_txlist, link);
2562 hn_txdesc_dmamap_destroy(txd);
2565 mtx_lock(&txr->hn_tx_lock);
2566 while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL)
2567 hn_txdesc_dmamap_destroy(txd);
2568 mtx_unlock(&txr->hn_tx_lock);
2571 if (txr->hn_tx_data_dtag != NULL)
2572 bus_dma_tag_destroy(txr->hn_tx_data_dtag);
2573 if (txr->hn_tx_rndis_dtag != NULL)
2574 bus_dma_tag_destroy(txr->hn_tx_rndis_dtag);
2576 #ifdef HN_USE_TXDESC_BUFRING
2577 buf_ring_free(txr->hn_txdesc_br, M_NETVSC);
2580 free(txr->hn_txdesc, M_NETVSC);
2581 txr->hn_txdesc = NULL;
2583 if (txr->hn_mbuf_br != NULL)
2584 buf_ring_free(txr->hn_mbuf_br, M_NETVSC);
2586 #ifndef HN_USE_TXDESC_BUFRING
2587 mtx_destroy(&txr->hn_txlist_spin);
2589 mtx_destroy(&txr->hn_tx_lock);
2593 hn_create_tx_data(struct hn_softc *sc, int ring_cnt)
2595 struct sysctl_oid_list *child;
2596 struct sysctl_ctx_list *ctx;
2600 * Create TXBUF for chimney sending.
2602 * NOTE: It is shared by all channels.
2604 sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev),
2605 PAGE_SIZE, 0, NETVSC_SEND_BUFFER_SIZE, &sc->hn_chim_dma,
2606 BUS_DMA_WAITOK | BUS_DMA_ZERO);
2607 if (sc->hn_chim == NULL) {
2608 device_printf(sc->hn_dev, "failed to allocate txbuf\n");
2612 sc->hn_tx_ring_cnt = ring_cnt;
2613 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
2615 sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt,
2616 M_NETVSC, M_WAITOK | M_ZERO);
2618 ctx = device_get_sysctl_ctx(sc->hn_dev);
2619 child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));
2621 /* Create dev.hn.UNIT.tx sysctl tree */
2622 sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx",
2623 CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
2625 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
2628 error = hn_create_tx_ring(sc, i);
2633 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs",
2634 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2635 __offsetof(struct hn_tx_ring, hn_no_txdescs),
2636 hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs");
2637 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed",
2638 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2639 __offsetof(struct hn_tx_ring, hn_send_failed),
2640 hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failures");
2641 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed",
2642 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2643 __offsetof(struct hn_tx_ring, hn_txdma_failed),
2644 hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failures");
2645 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed",
2646 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2647 __offsetof(struct hn_tx_ring, hn_tx_collapsed),
2648 hn_tx_stat_ulong_sysctl, "LU", "# of TX mbufs collapsed");
2649 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney",
2650 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2651 __offsetof(struct hn_tx_ring, hn_tx_chimney),
2652 hn_tx_stat_ulong_sysctl, "LU", "# of chimney sends");
2653 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried",
2654 CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2655 __offsetof(struct hn_tx_ring, hn_tx_chimney_tried),
2656 hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries");
2657 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
2658 CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0,
2659 "# of total TX descs");
2660 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
2661 CTLFLAG_RD, &sc->hn_chim_szmax, 0,
2662 "Chimney send packet size upper boundary");
2663 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
2664 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
2665 hn_chim_size_sysctl, "I", "Chimney send packet size limit");
2666 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size",
2667 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2668 __offsetof(struct hn_tx_ring, hn_direct_tx_size),
2669 hn_tx_conf_int_sysctl, "I",
2670 "Size of the packet for direct transmission");
2671 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx",
2672 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc,
2673 __offsetof(struct hn_tx_ring, hn_sched_tx),
2674 hn_tx_conf_int_sysctl, "I",
2675 "Always schedule transmission "
2676 "instead of doing direct transmission");
2677 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt",
2678 CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings");
2679 SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse",
2680 CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings");
2686 hn_set_chim_size(struct hn_softc *sc, int chim_size)
2690 for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
2691 sc->hn_tx_ring[i].hn_chim_size = chim_size;
2695 hn_destroy_tx_data(struct hn_softc *sc)
2699 if (sc->hn_chim != NULL) {
2700 hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim);
2704 if (sc->hn_tx_ring_cnt == 0)
2707 for (i = 0; i < sc->hn_tx_ring_cnt; ++i)
2708 hn_destroy_tx_ring(&sc->hn_tx_ring[i]);
2710 free(sc->hn_tx_ring, M_NETVSC);
2711 sc->hn_tx_ring = NULL;
2713 sc->hn_tx_ring_cnt = 0;
2714 sc->hn_tx_ring_inuse = 0;
2718 hn_start_taskfunc(void *xtxr, int pending __unused)
2720 struct hn_tx_ring *txr = xtxr;
2722 mtx_lock(&txr->hn_tx_lock);
2723 hn_start_locked(txr, 0);
2724 mtx_unlock(&txr->hn_tx_lock);
2728 hn_start_txeof_taskfunc(void *xtxr, int pending __unused)
2730 struct hn_tx_ring *txr = xtxr;
2732 mtx_lock(&txr->hn_tx_lock);
2733 atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE);
2734 hn_start_locked(txr, 0);
2735 mtx_unlock(&txr->hn_tx_lock);
2739 hn_stop_tx_tasks(struct hn_softc *sc)
2743 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2744 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
2746 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task);
2747 taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task);
2752 hn_xmit(struct hn_tx_ring *txr, int len)
2754 struct hn_softc *sc = txr->hn_sc;
2755 struct ifnet *ifp = sc->hn_ifp;
2756 struct mbuf *m_head;
2758 mtx_assert(&txr->hn_tx_lock, MA_OWNED);
2759 KASSERT(hn_use_if_start == 0,
2760 ("hn_xmit is called, when if_start is enabled"));
2762 if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive)
2765 while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) {
2766 struct hn_txdesc *txd;
2769 if (len > 0 && m_head->m_pkthdr.len > len) {
2771 * Sending this packet could be time consuming; let
2772 * callers dispatch it (and any follow-up packets)
2773 * to the TX taskqueue.
2775 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
2779 txd = hn_txdesc_get(txr);
2781 txr->hn_no_txdescs++;
2782 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
2783 txr->hn_oactive = 1;
2787 error = hn_encap(txr, txd, &m_head);
2789 /* Both txd and m_head are freed; discard */
2790 drbr_advance(ifp, txr->hn_mbuf_br);
2794 error = hn_send_pkt(ifp, txr, txd);
2795 if (__predict_false(error)) {
2796 /* txd is freed, but m_head is not */
2797 drbr_putback(ifp, txr->hn_mbuf_br, m_head);
2798 txr->hn_oactive = 1;
2803 drbr_advance(ifp, txr->hn_mbuf_br);
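/*
 * The loop above follows the usual drbr contract: drbr_peek() leaves
 * the mbuf on the ring, drbr_putback() restores it after a transient
 * failure (the mbuf must not be touched afterwards), and
 * drbr_advance() consumes it once the send has been handed off.
 */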
2809 hn_transmit(struct ifnet *ifp, struct mbuf *m)
2811 struct hn_softc *sc = ifp->if_softc;
2812 struct hn_tx_ring *txr;
2816 * Select the TX ring based on flowid
2818 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
2819 idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse;
2820 txr = &sc->hn_tx_ring[idx];
2822 error = drbr_enqueue(ifp, txr->hn_mbuf_br, m);
2824 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
2828 if (txr->hn_oactive)
2831 if (txr->hn_sched_tx)
2834 if (mtx_trylock(&txr->hn_tx_lock)) {
2837 sched = hn_xmit(txr, txr->hn_direct_tx_size);
2838 mtx_unlock(&txr->hn_tx_lock);
2843 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task);
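/*
 * For example (hypothetical values): with hn_tx_ring_inuse == 4, an
 * mbuf carrying flowid 6 lands on TX ring 6 % 4 == 2, so packets of
 * the same flow stay on the same ring and are not reordered.
 */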
2848 hn_xmit_qflush(struct ifnet *ifp)
2850 struct hn_softc *sc = ifp->if_softc;
2853 for (i = 0; i < sc->hn_tx_ring_inuse; ++i) {
2854 struct hn_tx_ring *txr = &sc->hn_tx_ring[i];
2857 mtx_lock(&txr->hn_tx_lock);
2858 while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL)
2860 mtx_unlock(&txr->hn_tx_lock);
2866 hn_xmit_txeof(struct hn_tx_ring *txr)
2869 if (txr->hn_sched_tx)
2872 if (mtx_trylock(&txr->hn_tx_lock)) {
2875 txr->hn_oactive = 0;
2876 sched = hn_xmit(txr, txr->hn_direct_tx_size);
2877 mtx_unlock(&txr->hn_tx_lock);
2879 taskqueue_enqueue(txr->hn_tx_taskq,
2885 * Release oactive earlier, in the hope that others can
2886 * catch up.  The txeof task will clear oactive again while
2887 * holding hn_tx_lock, to avoid possible races.
2890 txr->hn_oactive = 0;
2891 taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task);
2896 hn_xmit_taskfunc(void *xtxr, int pending __unused)
2898 struct hn_tx_ring *txr = xtxr;
2900 mtx_lock(&txr->hn_tx_lock);
2902 mtx_unlock(&txr->hn_tx_lock);
2906 hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused)
2908 struct hn_tx_ring *txr = xtxr;
2910 mtx_lock(&txr->hn_tx_lock);
2911 txr->hn_oactive = 0;
2913 mtx_unlock(&txr->hn_tx_lock);
2917 hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan)
2919 struct hn_rx_ring *rxr;
2920 struct hn_tx_ring *txr = NULL;
2923 idx = vmbus_chan_subidx(chan);
2926 * Link this channel to RX/TX ring.
2928 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
2929 ("invalid channel index %d, should > 0 && < %d",
2930 idx, sc->hn_rx_ring_inuse));
2931 rxr = &sc->hn_rx_ring[idx];
2932 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0,
2933 ("RX ring %d already attached", idx));
2934 rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED;
2937 if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n",
2938 idx, vmbus_chan_id(chan));
2941 if (idx < sc->hn_tx_ring_inuse) {
2942 txr = &sc->hn_tx_ring[idx];
2943 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0,
2944 ("TX ring %d already attached", idx));
2945 txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED;
2947 txr->hn_chan = chan;
2949 if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n",
2950 idx, vmbus_chan_id(chan));
2954 /* Bind this channel to a proper CPU. */
2955 vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus);
2957 /* Open this channel */
2958 error = vmbus_chan_open(chan, NETVSC_DEVICE_RING_BUFFER_SIZE,
2959 NETVSC_DEVICE_RING_BUFFER_SIZE, NULL, 0, hn_chan_callback, rxr);
2961 if_printf(sc->hn_ifp, "open chan%u failed: %d\n",
2962 vmbus_chan_id(chan), error);
2963 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
2965 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
2971 hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan)
2973 struct hn_rx_ring *rxr;
2976 idx = vmbus_chan_subidx(chan);
2979 * Unlink this channel from the RX/TX ring.
2981 KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse,
2982 ("invalid channel index %d, should > 0 && < %d",
2983 idx, sc->hn_rx_ring_inuse));
2984 rxr = &sc->hn_rx_ring[idx];
2985 KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED),
2986 ("RX ring %d is not attached", idx));
2987 rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED;
2989 if (idx < sc->hn_tx_ring_inuse) {
2990 struct hn_tx_ring *txr = &sc->hn_tx_ring[idx];
2992 KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED),
2993 ("TX ring %d is not attached attached", idx));
2994 txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED;
2998 * Close this channel.
3001 * Channel closing does _not_ destroy the target channel.
3003 vmbus_chan_close(chan);
3007 hn_attach_subchans(struct hn_softc *sc)
3009 struct vmbus_channel **subchans;
3010 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
3013 if (subchan_cnt == 0)
3016 /* Attach the sub-channels. */
3017 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
3018 for (i = 0; i < subchan_cnt; ++i) {
3019 error = hn_chan_attach(sc, subchans[i]);
3023 vmbus_subchan_rel(subchans, subchan_cnt);
3026 if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error);
3029 if_printf(sc->hn_ifp, "%d sub-channels attached\n",
3037 hn_detach_allchans(struct hn_softc *sc)
3039 struct vmbus_channel **subchans;
3040 int subchan_cnt = sc->hn_rx_ring_inuse - 1;
3043 if (subchan_cnt == 0)
3046 /* Detach the sub-channels. */
3047 subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt);
3048 for (i = 0; i < subchan_cnt; ++i)
3049 hn_chan_detach(sc, subchans[i]);
3050 vmbus_subchan_rel(subchans, subchan_cnt);
3054 * Detach the primary channel, _after_ all sub-channels are detached.
3057 hn_chan_detach(sc, sc->hn_prichan);
3059 /* Wait for sub-channels to be destroyed, if any. */
3060 vmbus_subchan_drain(sc->hn_prichan);
3063 for (i = 0; i < sc->hn_rx_ring_cnt; ++i) {
3064 KASSERT((sc->hn_rx_ring[i].hn_rx_flags &
3065 HN_RX_FLAG_ATTACHED) == 0,
3066 ("%dth RX ring is still attached", i));
3068 for (i = 0; i < sc->hn_tx_ring_cnt; ++i) {
3069 KASSERT((sc->hn_tx_ring[i].hn_tx_flags &
3070 HN_TX_FLAG_ATTACHED) == 0,
3071 ("%dth TX ring is still attached", i));
3077 hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch)
3079 struct vmbus_channel **subchans;
3080 int nchan, rxr_cnt, error;
3082 nchan = *nsubch + 1;
3083 if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_30 || nchan == 1) {
3085 * Either RSS is not supported, or multiple RX/TX rings
3086 * are not requested.
3093 * Get RSS capabilities, e.g. # of RX rings and the size of the indirect table.
3096 error = hn_rndis_get_rsscaps(sc, &rxr_cnt);
3098 /* No RSS; this is benign. */
3102 if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n",
3105 if (nchan > rxr_cnt)
3108 if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n");
3114 * Allocate sub-channels from NVS.
3116 *nsubch = nchan - 1;
3117 error = hn_nvs_alloc_subchans(sc, nsubch);
3118 if (error || *nsubch == 0) {
3119 /* Failed to allocate sub-channels. */
3125 * Wait for all sub-channels to become ready before moving on.
3127 subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch);
3128 vmbus_subchan_rel(subchans, *nsubch);
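/*
 * A usage sketch (counts are hypothetical): a caller that created 8
 * RX rings passes *nsubch == 7; if NVS only grants 3 sub-channels,
 * *nsubch is updated to 3 on return, and the caller must size the
 * rings in use from the granted count, as hn_synth_attach() does.
 */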
3133 hn_synth_attach(struct hn_softc *sc, int mtu)
3135 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss;
3136 int error, nsubch, nchan, i;
3139 * Attach the primary channel _before_ attaching NVS and RNDIS.
3141 error = hn_chan_attach(sc, sc->hn_prichan);
3148 error = hn_nvs_attach(sc, mtu);
3153 * Attach RNDIS _after_ NVS is attached.
3155 error = hn_rndis_attach(sc);
3160 * Allocate sub-channels for multi-TX/RX rings.
3163 * The # of RX rings that can be used is equivalent to the # of
3164 * channels to be requested.
3166 nsubch = sc->hn_rx_ring_cnt - 1;
3167 error = hn_synth_alloc_subchans(sc, &nsubch);
3173 /* Only the primary channel can be used; done */
3178 * Configure RSS key and indirect table _after_ all sub-channels are allocated.
3182 /* Setup default RSS key. */
3183 memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key));
3185 /* Setup default RSS indirect table. */
3186 /* TODO: Take ndis_rss_caps.ndis_nind into account. */
3187 for (i = 0; i < NDIS_HASH_INDCNT; ++i)
3188 rss->rss_ind[i] = i % nchan;
3190 error = hn_rndis_conf_rss(sc);
3193 * Failed to configure RSS key or indirect table; only
3194 * the primary channel can be used.
3200 * Set the # of TX/RX rings that could be used according to
3201 * the # of channels that NVS offered.
3203 hn_set_ring_inuse(sc, nchan);
3206 * Attach the sub-channels, if any.
3208 error = hn_attach_subchans(sc);
3215 hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt)
3217 KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt,
3218 ("invalid ring count %d", ring_cnt));
3220 if (sc->hn_tx_ring_cnt > ring_cnt)
3221 sc->hn_tx_ring_inuse = ring_cnt;
3223 sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt;
3224 sc->hn_rx_ring_inuse = ring_cnt;
3227 if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n",
3228 sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse);
3233 hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt)
3235 const struct hn_nvs_hdr *hdr;
3237 if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) {
3238 if_printf(sc->hn_ifp, "invalid nvs notify\n");
3241 hdr = VMBUS_CHANPKT_CONST_DATA(pkt);
3243 if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) {
3244 /* Useless; ignore */
3247 if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type);
3251 hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan,
3252 const struct vmbus_chanpkt_hdr *pkt)
3254 struct hn_send_ctx *sndc;
3256 sndc = (struct hn_send_ctx *)(uintptr_t)pkt->cph_xactid;
3257 sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt),
3258 VMBUS_CHANPKT_DATALEN(pkt));
3261 * 'sndc' CAN NOT be accessed anymore, since it may have been freed by its callback.
3267 hn_nvs_handle_rxbuf(struct hn_softc *sc, struct hn_rx_ring *rxr,
3268 struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr)
3270 const struct vmbus_chanpkt_rxbuf *pkt;
3271 const struct hn_nvs_hdr *nvs_hdr;
3274 if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) {
3275 if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n");
3278 nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr);
3280 /* Make sure that this is an RNDIS message. */
3281 if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) {
3282 if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n",
3287 hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen);
3288 if (__predict_false(hlen < sizeof(*pkt))) {
3289 if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n");
3292 pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr;
3294 if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) {
3295 if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n",
3300 count = pkt->cp_rxbuf_cnt;
3301 if (__predict_false(hlen <
3302 __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) {
3303 if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count);
3307 /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */
3308 for (i = 0; i < count; ++i) {
3311 ofs = pkt->cp_rxbuf[i].rb_ofs;
3312 len = pkt->cp_rxbuf[i].rb_len;
3313 if (__predict_false(ofs + len > NETVSC_RECEIVE_BUFFER_SIZE)) {
3314 if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, "
3315 "ofs %d, len %d\n", i, ofs, len);
3318 hv_rf_on_receive(sc, rxr, rxr->hn_rxbuf + ofs, len);
3322 * The completion call was moved here so that all received
3323 * messages (not just data messages) trigger a response
3324 * message back to the host.
3326 hn_nvs_ack_rxbuf(chan, pkt->cp_hdr.cph_xactid);
3330 * NetVSC receive completion:
3332 * send a receive completion packet to the RNDIS device (i.e., NetVSP).
3335 hn_nvs_ack_rxbuf(struct vmbus_channel *chan, uint64_t tid)
3337 struct hn_nvs_rndis_ack ack;
3341 ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK;
3342 ack.nvs_status = HN_NVS_STATUS_OK;
3345 /* Send the completion */
3346 ret = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP,
3347 VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid);
3351 } else if (ret == EAGAIN) {
3352 /* No more room; wait a bit and retry (up to 3 times). */
3357 goto retry_send_cmplt;
3363 hn_chan_callback(struct vmbus_channel *chan, void *xrxr)
3365 struct hn_rx_ring *rxr = xrxr;
3366 struct hn_softc *sc = rxr->hn_ifp->if_softc;
3368 int bufferlen = NETVSC_PACKET_SIZE;
3370 buffer = rxr->hn_rdbuf;
3372 struct vmbus_chanpkt_hdr *pkt = buffer;
3373 uint32_t bytes_rxed;
3376 bytes_rxed = bufferlen;
3377 ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed);
3379 switch (pkt->cph_type) {
3380 case VMBUS_CHANPKT_TYPE_COMP:
3381 hn_nvs_handle_comp(sc, chan, pkt);
3383 case VMBUS_CHANPKT_TYPE_RXBUF:
3384 hn_nvs_handle_rxbuf(sc, rxr, chan, pkt);
3386 case VMBUS_CHANPKT_TYPE_INBAND:
3387 hn_nvs_handle_notify(sc, pkt);
3390 if_printf(rxr->hn_ifp,
3391 "unknown chan pkt %u\n",
3395 } else if (ret == ENOBUFS) {
3396 /* Handle large packet */
3397 if (bufferlen > NETVSC_PACKET_SIZE) {
3398 free(buffer, M_NETVSC);
3402 /* alloc new buffer */
3403 buffer = malloc(bytes_rxed, M_NETVSC, M_NOWAIT);
3404 if (buffer == NULL) {
3405 if_printf(rxr->hn_ifp,
3406 "hv_cb malloc buffer failed, len=%u\n",
3411 bufferlen = bytes_rxed;
3413 /* No more packets */
3418 if (bufferlen > NETVSC_PACKET_SIZE)
3419 free(buffer, M_NETVSC);
3421 hv_rf_channel_rollup(rxr, rxr->hn_txr);
3425 hn_tx_taskq_create(void *arg __unused)
3427 if (!hn_share_tx_taskq)
3430 hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK,
3431 taskqueue_thread_enqueue, &hn_tx_taskq);
3432 taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx");
3433 if (hn_bind_tx_taskq >= 0) {
3434 int cpu = hn_bind_tx_taskq;
3435 struct task cpuset_task;
3438 if (cpu > mp_ncpus - 1)
3440 CPU_SETOF(cpu, &cpu_set);
3441 TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, &cpu_set);
3442 taskqueue_enqueue(hn_tx_taskq, &cpuset_task);
3443 taskqueue_drain(hn_tx_taskq, &cpuset_task);
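/*
 * A configuration sketch (tunable names assumed from the hw.hn.*
 * convention used by this driver's other knobs; verify against the
 * declarations near the top of the file): to share a single TX
 * taskqueue bound to CPU 2, one might set in /boot/loader.conf:
 *
 *   hw.hn.share_tx_taskq=1
 *   hw.hn.bind_tx_taskq=2
 */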
3446 SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_FIRST,
3447 hn_tx_taskq_create, NULL);
3450 hn_tx_taskq_destroy(void *arg __unused)
3452 if (hn_tx_taskq != NULL)
3453 taskqueue_free(hn_tx_taskq);
3455 SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_FIRST,
3456 hn_tx_taskq_destroy, NULL);
3458 static device_method_t netvsc_methods[] = {
3459 /* Device interface */
3460 DEVMETHOD(device_probe, netvsc_probe),
3461 DEVMETHOD(device_attach, netvsc_attach),
3462 DEVMETHOD(device_detach, netvsc_detach),
3463 DEVMETHOD(device_shutdown, netvsc_shutdown),
3468 static driver_t netvsc_driver = {
3471 sizeof(struct hn_softc)
3474 static devclass_t netvsc_devclass;
3476 DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0);
3477 MODULE_VERSION(hn, 1);
3478 MODULE_DEPEND(hn, vmbus, 1, 1, 1);