/*-
 * Copyright (c) 2010-2012 Citrix Inc.
 * Copyright (c) 2009-2012 Microsoft Corp.
 * Copyright (c) 2012 NetApp Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 2004-2006 Kip Macy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet6.h"
#include "opt_inet.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/sx.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/if.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/if_ether.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/ip6.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>

#include <machine/bus.h>
#include <machine/resource.h>
#include <machine/frame.h>
#include <machine/vmparam.h>

#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/mutex.h>
#include <sys/errno.h>
#include <sys/types.h>
#include <machine/atomic.h>

#include <machine/intr_machdep.h>

#include <machine/in_cksum.h>

#include <dev/hyperv/include/hyperv.h>
#include "hv_net_vsc.h"
#include "hv_rndis.h"
#include "hv_rndis_filter.h"


/* Short for Hyper-V network interface */
#define NETVSC_DEVNAME    "hn"
/*
 * It looks like offset 0 of buf is reserved to hold the softc pointer.
 * The sc pointer is evidently not needed, and is not presently populated.
 * The packet offset is where the netvsc_packet starts in the buffer.
 */
#define HV_NV_SC_PTR_OFFSET_IN_BUF         0
#define HV_NV_PACKET_OFFSET_IN_BUF         16
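
/*
 * Implied buffer layout (a sketch inferred from the offsets above):
 *
 *	offset  0: softc pointer slot (reserved; not currently populated)
 *	offset 16: netvsc_packet
 */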

/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT                  512

#define HN_RNDIS_MSG_LEN                \
    (sizeof(rndis_msg) +                \
     RNDIS_VLAN_PPI_SIZE +              \
     RNDIS_TSO_PPI_SIZE +               \
     RNDIS_CSUM_PPI_SIZE)
#define HN_RNDIS_MSG_BOUNDARY           PAGE_SIZE
#define HN_RNDIS_MSG_ALIGN              CACHE_LINE_SIZE

#define HN_TX_DATA_BOUNDARY             PAGE_SIZE
#define HN_TX_DATA_MAXSIZE              IP_MAXPACKET
#define HN_TX_DATA_SEGSIZE              PAGE_SIZE
#define HN_TX_DATA_SEGCNT_MAX           \
    (NETVSC_PACKET_MAXPAGE - HV_RF_NUM_TX_RESERVED_PAGE_BUFS)

#define HN_DIRECT_TX_SIZE_DEF           128

struct hn_txdesc {
        SLIST_ENTRY(hn_txdesc) link;
        struct mbuf     *m;
        struct hn_softc *sc;
        int             refs;
        uint32_t        flags;          /* HN_TXD_FLAG_ */
        netvsc_packet   netvsc_pkt;     /* XXX to be removed */

        bus_dmamap_t    data_dmap;

        bus_addr_t      rndis_msg_paddr;
        rndis_msg       *rndis_msg;
        bus_dmamap_t    rndis_msg_dmap;
};

#define HN_TXD_FLAG_ONLIST      0x1
#define HN_TXD_FLAG_DMAMAP      0x2
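
/*
 * TX descriptor life cycle, as implemented by the hn_txdesc_* routines
 * further below (a sketch; error handling and locking elided):
 *
 *	txd = hn_txdesc_get(sc);		refs 0 -> 1, off free list
 *	hn_txdesc_dmamap_load(sc, txd, &m, segs, &nsegs);
 *	hn_txdesc_hold(txd);			extra ref across the send
 *	hv_nv_on_send(device_ctx, packet);	completion drops one ref
 *	hn_txdesc_put(sc, txd);			last ref unloads the DMA map,
 *						frees the mbuf, relists txd
 */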

/*
 * Only enable UDP checksum offloading when running on 2012R2 or
 * later.  UDP checksum offloading doesn't work on earlier
 * Windows releases.
 */
#define HN_CSUM_ASSIST_WIN8     (CSUM_TCP)
#define HN_CSUM_ASSIST          (CSUM_IP | CSUM_UDP | CSUM_TCP)

/* XXX move to netinet/tcp_lro.h */
#define HN_LRO_HIWAT_MAX                                65535
#define HN_LRO_HIWAT_DEF                                HN_LRO_HIWAT_MAX
/* YYY 2*MTU is a bit rough, but should be good enough. */
#define HN_LRO_HIWAT_MTULIM(ifp)                        (2 * (ifp)->if_mtu)
#define HN_LRO_HIWAT_ISVALID(sc, hiwat)                 \
    ((hiwat) >= HN_LRO_HIWAT_MTULIM((sc)->hn_ifp) ||    \
     (hiwat) <= HN_LRO_HIWAT_MAX)
/*
 * Be aware that this sleepable mutex will exhibit WITNESS errors when
 * certain TCP and ARP code paths are taken.  This appears to be a
 * well-known condition, as all other drivers checked use a sleepable
 * mutex to protect their transmit paths.
 * Also be aware that mutexes do not play well with semaphores, and there
 * is a conflicting semaphore in a certain channel code path.
 */
#define NV_LOCK_INIT(_sc, _name) \
            mtx_init(&(_sc)->hn_lock, _name, MTX_NETWORK_LOCK, MTX_DEF)
#define NV_LOCK(_sc)            mtx_lock(&(_sc)->hn_lock)
#define NV_TRYLOCK(_sc)         mtx_trylock(&(_sc)->hn_lock)
#define NV_LOCK_ASSERT(_sc)     mtx_assert(&(_sc)->hn_lock, MA_OWNED)
#define NV_UNLOCK(_sc)          mtx_unlock(&(_sc)->hn_lock)
#define NV_LOCK_DESTROY(_sc)    mtx_destroy(&(_sc)->hn_lock)
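
/*
 * Minimal usage sketch for the macros above (illustrative only; the
 * driver's real uses are in netvsc_attach() and hn_ioctl() below):
 *
 *	NV_LOCK_INIT(sc, "NetVSCLock");	once, at attach time
 *	NV_LOCK(sc);
 *	... touch softc state shared with other threads ...
 *	NV_UNLOCK(sc);
 *	NV_LOCK_DESTROY(sc);		when the softc is torn down
 */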


/*
 * Globals
 */

int hv_promisc_mode = 0;    /* normal mode by default */

/* Trust TCP segment verification on the host side. */
static int hn_trust_hosttcp = 1;
TUNABLE_INT("dev.hn.trust_hosttcp", &hn_trust_hosttcp);

/* Trust UDP datagram verification on the host side. */
static int hn_trust_hostudp = 1;
TUNABLE_INT("dev.hn.trust_hostudp", &hn_trust_hostudp);

/* Trust IP packet verification on the host side. */
static int hn_trust_hostip = 1;
TUNABLE_INT("dev.hn.trust_hostip", &hn_trust_hostip);

#if __FreeBSD_version >= 1100045
/* Limit TSO burst size */
static int hn_tso_maxlen = 0;
TUNABLE_INT("dev.hn.tso_maxlen", &hn_tso_maxlen);
#endif

/* Limit chimney send size */
static int hn_tx_chimney_size = 0;
TUNABLE_INT("dev.hn.tx_chimney_size", &hn_tx_chimney_size);

/* Limit the size of packet for direct transmission */
static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
TUNABLE_INT("dev.hn.direct_tx_size", &hn_direct_tx_size);

/*
 * Forward declarations
 */
static void hn_stop(hn_softc_t *sc);
static void hn_ifinit_locked(hn_softc_t *sc);
static void hn_ifinit(void *xsc);
static int  hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
static int hn_start_locked(struct ifnet *ifp, int len);
static void hn_start(struct ifnet *ifp);
static void hn_start_txeof(struct ifnet *ifp);
static int hn_ifmedia_upd(struct ifnet *ifp);
static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr);
#ifdef HN_LRO_HIWAT
static int hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS);
#endif
static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_check_iplen(const struct mbuf *, int);
static int hn_create_tx_ring(struct hn_softc *sc);
static void hn_destroy_tx_ring(struct hn_softc *sc);
static void hn_start_taskfunc(void *xsc, int pending);
static void hn_txeof_taskfunc(void *xsc, int pending);

static __inline void
hn_set_lro_hiwat(struct hn_softc *sc, int hiwat)
{
        sc->hn_lro_hiwat = hiwat;
#ifdef HN_LRO_HIWAT
        sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat;
#endif
}

static int
hn_ifmedia_upd(struct ifnet *ifp __unused)
{

        return EOPNOTSUPP;
}

static void
hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
{
        struct hn_softc *sc = ifp->if_softc;

        ifmr->ifm_status = IFM_AVALID;
        ifmr->ifm_active = IFM_ETHER;

        if (!sc->hn_carrier) {
                ifmr->ifm_active |= IFM_NONE;
                return;
        }
        ifmr->ifm_status |= IFM_ACTIVE;
        ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
}

/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
static const hv_guid g_net_vsc_device_type = {
        .data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
                0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
};
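
/*
 * Note: the first three GUID fields are stored little-endian, which is
 * why the byte array above appears reversed relative to the textual form
 * in the comment (F8615163 -> 0x63, 0x51, 0x61, 0xF8, and so on).
 */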

/*
 * Standard probe entry point.
 */
static int
netvsc_probe(device_t dev)
{
        const char *p;

        p = vmbus_get_type(dev);
        if (!memcmp(p, &g_net_vsc_device_type.data, sizeof(hv_guid))) {
                device_set_desc(dev, "Synthetic Network Interface");
                if (bootverbose)
                        printf("Netvsc probe... DONE \n");

                return (BUS_PROBE_DEFAULT);
        }

        return (ENXIO);
}

/*
 * Standard attach entry point.
 *
 * Called when the driver is loaded.  It allocates needed resources,
 * and initializes the "hardware" and software.
 */
static int
netvsc_attach(device_t dev)
{
        struct hv_device *device_ctx = vmbus_get_devctx(dev);
        netvsc_device_info device_info;
        hn_softc_t *sc;
        int unit = device_get_unit(dev);
        struct ifnet *ifp = NULL;
        struct sysctl_oid_list *child;
        struct sysctl_ctx_list *ctx;
        int error;
#if __FreeBSD_version >= 1100045
        int tso_maxlen;
#endif

        sc = device_get_softc(dev);
        if (sc == NULL) {
                return (ENOMEM);
        }

        bzero(sc, sizeof(hn_softc_t));
        sc->hn_unit = unit;
        sc->hn_dev = dev;
        sc->hn_lro_hiwat = HN_LRO_HIWAT_DEF;
        sc->hn_direct_tx_size = hn_direct_tx_size;
        if (hn_trust_hosttcp)
                sc->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP;
        if (hn_trust_hostudp)
                sc->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP;
        if (hn_trust_hostip)
                sc->hn_trust_hcsum |= HN_TRUST_HCSUM_IP;

        sc->hn_tx_taskq = taskqueue_create_fast("hn_tx", M_WAITOK,
            taskqueue_thread_enqueue, &sc->hn_tx_taskq);
        taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx",
            device_get_nameunit(dev));
        TASK_INIT(&sc->hn_start_task, 0, hn_start_taskfunc, sc);
        TASK_INIT(&sc->hn_txeof_task, 0, hn_txeof_taskfunc, sc);

        error = hn_create_tx_ring(sc);
        if (error)
                goto failed;

        NV_LOCK_INIT(sc, "NetVSCLock");

        sc->hn_dev_obj = device_ctx;

        ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
        ifp->if_softc = sc;

        if_initname(ifp, device_get_name(dev), device_get_unit(dev));
        ifp->if_dunit = unit;
        ifp->if_dname = NETVSC_DEVNAME;

        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
        ifp->if_ioctl = hn_ioctl;
        ifp->if_start = hn_start;
        ifp->if_init = hn_ifinit;
        /* needed by hv_rf_on_device_add() code */
        ifp->if_mtu = ETHERMTU;
        IFQ_SET_MAXLEN(&ifp->if_snd, 512);
        ifp->if_snd.ifq_drv_maxlen = 511;
        IFQ_SET_READY(&ifp->if_snd);

        ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
        ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
        ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
        /* XXX ifmedia_set really should do this for us */
        sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;

        /*
         * Tell upper layers that we support full VLAN capability.
         */
        ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header);
        ifp->if_capabilities |=
            IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO |
            IFCAP_LRO;
        ifp->if_capenable |=
            IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO |
            IFCAP_LRO;

        if (hv_vmbus_protocal_version >= HV_VMBUS_VERSION_WIN8_1)
                sc->hn_csum_assist = HN_CSUM_ASSIST;
        else
                sc->hn_csum_assist = HN_CSUM_ASSIST_WIN8;
        ifp->if_hwassist = sc->hn_csum_assist | CSUM_TSO;

        error = hv_rf_on_device_add(device_ctx, &device_info);
        if (error)
                goto failed;

        if (device_info.link_state == 0) {
                sc->hn_carrier = 1;
        }

#if defined(INET) || defined(INET6)
        tcp_lro_init(&sc->hn_lro);
        /* Driver private LRO settings */
        sc->hn_lro.ifp = ifp;
#ifdef HN_LRO_HIWAT
        sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat;
#endif
#endif  /* INET || INET6 */

#if __FreeBSD_version >= 1100045
        tso_maxlen = hn_tso_maxlen;
        if (tso_maxlen <= 0 || tso_maxlen > IP_MAXPACKET)
                tso_maxlen = IP_MAXPACKET;

        ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
        ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
        ifp->if_hw_tsomax = tso_maxlen -
            (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
#endif

        ether_ifattach(ifp, device_info.mac_addr);

#if __FreeBSD_version >= 1100045
        if_printf(ifp, "TSO: %u/%u/%u\n", ifp->if_hw_tsomax,
            ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
#endif

        sc->hn_tx_chimney_max = sc->net_dev->send_section_size;
        sc->hn_tx_chimney_size = sc->hn_tx_chimney_max;
        if (hn_tx_chimney_size > 0 &&
            hn_tx_chimney_size < sc->hn_tx_chimney_max)
                sc->hn_tx_chimney_size = hn_tx_chimney_size;

        ctx = device_get_sysctl_ctx(dev);
        child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));

        SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_queued",
            CTLFLAG_RW, &sc->hn_lro.lro_queued, 0, "LRO queued");
        SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_flushed",
            CTLFLAG_RW, &sc->hn_lro.lro_flushed, 0, "LRO flushed");
        SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "lro_tried",
            CTLFLAG_RW, &sc->hn_lro_tried, "# of LRO tries");
#ifdef HN_LRO_HIWAT
        SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_hiwat",
            CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_hiwat_sysctl,
            "I", "LRO high watermark");
#endif
        SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp",
            CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_TCP,
            hn_trust_hcsum_sysctl, "I",
            "Trust tcp segment verification on host side, "
            "when csum info is missing");
        SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp",
            CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_UDP,
            hn_trust_hcsum_sysctl, "I",
            "Trust udp datagram verification on host side, "
            "when csum info is missing");
        SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip",
            CTLTYPE_INT | CTLFLAG_RW, sc, HN_TRUST_HCSUM_IP,
            hn_trust_hcsum_sysctl, "I",
            "Trust ip packet verification on host side, "
            "when csum info is missing");
        SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_ip",
            CTLFLAG_RW, &sc->hn_csum_ip, "RXCSUM IP");
        SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_tcp",
            CTLFLAG_RW, &sc->hn_csum_tcp, "RXCSUM TCP");
        SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_udp",
            CTLFLAG_RW, &sc->hn_csum_udp, "RXCSUM UDP");
        SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_trusted",
            CTLFLAG_RW, &sc->hn_csum_trusted,
            "# of packets that we trust host's csum verification");
        SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "small_pkts",
            CTLFLAG_RW, &sc->hn_small_pkts, "# of small packets received");
        SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "no_txdescs",
            CTLFLAG_RW, &sc->hn_no_txdescs, "# of times short of TX descs");
        SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "send_failed",
            CTLFLAG_RW, &sc->hn_send_failed, "# of hyper-v sending failure");
        SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "txdma_failed",
            CTLFLAG_RW, &sc->hn_txdma_failed, "# of TX DMA failure");
        SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_collapsed",
            CTLFLAG_RW, &sc->hn_tx_collapsed, "# of TX mbuf collapsed");
        SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_chimney",
            CTLFLAG_RW, &sc->hn_tx_chimney, "# of chimney send");
        SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
            CTLFLAG_RD, &sc->hn_txdesc_cnt, 0, "# of total TX descs");
        SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
            CTLFLAG_RD, &sc->hn_txdesc_avail, 0, "# of available TX descs");
        SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
            CTLFLAG_RD, &sc->hn_tx_chimney_max, 0,
            "Chimney send packet size upper boundary");
        SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
            CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_tx_chimney_size_sysctl,
            "I", "Chimney send packet size limit");
        SYSCTL_ADD_INT(ctx, child, OID_AUTO, "direct_tx_size",
            CTLFLAG_RW, &sc->hn_direct_tx_size, 0,
            "Size of the packet for direct transmission");

        if (unit == 0) {
                struct sysctl_ctx_list *dc_ctx;
                struct sysctl_oid_list *dc_child;
                devclass_t dc;

                /*
                 * Add sysctl nodes for devclass
                 */
                dc = device_get_devclass(dev);
                dc_ctx = devclass_get_sysctl_ctx(dc);
                dc_child = SYSCTL_CHILDREN(devclass_get_sysctl_tree(dc));

                SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "trust_hosttcp",
                    CTLFLAG_RD, &hn_trust_hosttcp, 0,
                    "Trust tcp segment verification on host side, "
                    "when csum info is missing (global setting)");
                SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "trust_hostudp",
                    CTLFLAG_RD, &hn_trust_hostudp, 0,
                    "Trust udp datagram verification on host side, "
                    "when csum info is missing (global setting)");
                SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "trust_hostip",
                    CTLFLAG_RD, &hn_trust_hostip, 0,
                    "Trust ip packet verification on host side, "
                    "when csum info is missing (global setting)");
                SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "tx_chimney_size",
                    CTLFLAG_RD, &hn_tx_chimney_size, 0,
                    "Chimney send packet size limit");
#if __FreeBSD_version >= 1100045
                SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "tso_maxlen",
                    CTLFLAG_RD, &hn_tso_maxlen, 0, "TSO burst limit");
#endif
                SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "direct_tx_size",
                    CTLFLAG_RD, &hn_direct_tx_size, 0,
                    "Size of the packet for direct transmission");
        }

        return (0);
failed:
        hn_destroy_tx_ring(sc);
        if (ifp != NULL)
                if_free(ifp);
        return (error);
}

/*
 * Standard detach entry point
 */
static int
netvsc_detach(device_t dev)
{
        struct hn_softc *sc = device_get_softc(dev);
        struct hv_device *hv_device = vmbus_get_devctx(dev);

        if (bootverbose)
                printf("netvsc_detach\n");

        /*
         * XXXKYS:  Need to clean up all our
         * driver state; this is the driver
         * unloading.
         */

        /*
         * XXXKYS:  Need to stop outgoing traffic and unregister
         * the netdevice.
         */

        hv_rf_on_device_remove(hv_device, HV_RF_NV_DESTROY_CHANNEL);

        taskqueue_drain(sc->hn_tx_taskq, &sc->hn_start_task);
        taskqueue_drain(sc->hn_tx_taskq, &sc->hn_txeof_task);
        taskqueue_free(sc->hn_tx_taskq);

        ifmedia_removeall(&sc->hn_media);
#if defined(INET) || defined(INET6)
        tcp_lro_free(&sc->hn_lro);
#endif
        hn_destroy_tx_ring(sc);

        return (0);
}

/*
 * Standard shutdown entry point
 */
static int
netvsc_shutdown(device_t dev)
{
        return (0);
}

static __inline int
hn_txdesc_dmamap_load(struct hn_softc *sc, struct hn_txdesc *txd,
    struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
{
        struct mbuf *m = *m_head;
        int error;

        error = bus_dmamap_load_mbuf_sg(sc->hn_tx_data_dtag, txd->data_dmap,
            m, segs, nsegs, BUS_DMA_NOWAIT);
        if (error == EFBIG) {
                struct mbuf *m_new;

                m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
                if (m_new == NULL)
                        return ENOBUFS;
                else
                        *m_head = m = m_new;
                sc->hn_tx_collapsed++;

                error = bus_dmamap_load_mbuf_sg(sc->hn_tx_data_dtag,
                    txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
        }
        if (!error) {
                bus_dmamap_sync(sc->hn_tx_data_dtag, txd->data_dmap,
                    BUS_DMASYNC_PREWRITE);
                txd->flags |= HN_TXD_FLAG_DMAMAP;
        }
        return error;
}

static __inline void
hn_txdesc_dmamap_unload(struct hn_softc *sc, struct hn_txdesc *txd)
{

        if (txd->flags & HN_TXD_FLAG_DMAMAP) {
                bus_dmamap_sync(sc->hn_tx_data_dtag,
                    txd->data_dmap, BUS_DMASYNC_POSTWRITE);
                bus_dmamap_unload(sc->hn_tx_data_dtag,
                    txd->data_dmap);
                txd->flags &= ~HN_TXD_FLAG_DMAMAP;
        }
}

static __inline int
hn_txdesc_put(struct hn_softc *sc, struct hn_txdesc *txd)
{

        KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
            ("put an onlist txd %#x", txd->flags));

        KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
        if (atomic_fetchadd_int(&txd->refs, -1) != 1)
                return 0;

        hn_txdesc_dmamap_unload(sc, txd);
        if (txd->m != NULL) {
                m_freem(txd->m);
                txd->m = NULL;
        }

        txd->flags |= HN_TXD_FLAG_ONLIST;

        mtx_lock_spin(&sc->hn_txlist_spin);
        KASSERT(sc->hn_txdesc_avail >= 0 &&
            sc->hn_txdesc_avail < sc->hn_txdesc_cnt,
            ("txdesc_put: invalid txd avail %d", sc->hn_txdesc_avail));
        sc->hn_txdesc_avail++;
        SLIST_INSERT_HEAD(&sc->hn_txlist, txd, link);
        mtx_unlock_spin(&sc->hn_txlist_spin);

        return 1;
}

static __inline struct hn_txdesc *
hn_txdesc_get(struct hn_softc *sc)
{
        struct hn_txdesc *txd;

        mtx_lock_spin(&sc->hn_txlist_spin);
        txd = SLIST_FIRST(&sc->hn_txlist);
        if (txd != NULL) {
                KASSERT(sc->hn_txdesc_avail > 0,
                    ("txdesc_get: invalid txd avail %d", sc->hn_txdesc_avail));
                sc->hn_txdesc_avail--;
                SLIST_REMOVE_HEAD(&sc->hn_txlist, link);
        }
        mtx_unlock_spin(&sc->hn_txlist_spin);

        if (txd != NULL) {
                KASSERT(txd->m == NULL && txd->refs == 0 &&
                    (txd->flags & HN_TXD_FLAG_ONLIST), ("invalid txd"));
                txd->flags &= ~HN_TXD_FLAG_ONLIST;
                txd->refs = 1;
        }
        return txd;
}

static __inline void
hn_txdesc_hold(struct hn_txdesc *txd)
{

        /* 0->1 transition will never work */
        KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
        atomic_add_int(&txd->refs, 1);
}

/*
 * Send completion processing
 *
 * Note:  It looks like offset 0 of buf is reserved to hold the softc
 * pointer.  The sc pointer is not currently needed in this function, and
 * it is not presently populated by the TX function.
 */
void
netvsc_xmit_completion(void *context)
{
        netvsc_packet *packet = context;
        struct hn_txdesc *txd;
        struct hn_softc *sc;

        txd = (struct hn_txdesc *)(uintptr_t)
            packet->compl.send.send_completion_tid;

        sc = txd->sc;
        sc->hn_txeof = 1;
        hn_txdesc_put(sc, txd);
}

void
netvsc_channel_rollup(struct hv_device *device_ctx)
{
        struct hn_softc *sc = device_get_softc(device_ctx->device);

        if (!sc->hn_txeof)
                return;

        sc->hn_txeof = 0;
        hn_start_txeof(sc->hn_ifp);
}

/*
 * Start a transmit of one or more packets
 */
static int
hn_start_locked(struct ifnet *ifp, int len)
{
        hn_softc_t *sc = ifp->if_softc;
        struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
        netvsc_dev *net_dev = sc->net_dev;
        rndis_msg *rndis_mesg;
        rndis_packet *rndis_pkt;
        rndis_per_packet_info *rppi;
        ndis_8021q_info *rppi_vlan_info;
        rndis_tcp_ip_csum_info *csum_info;
        rndis_tcp_tso_info *tso_info;
        uint32_t rndis_msg_size = 0;

        if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
            IFF_DRV_RUNNING)
                return 0;

        while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
                bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
                int error, nsegs, i, send_failed = 0;
                struct hn_txdesc *txd;
                netvsc_packet *packet;
                struct mbuf *m_head;

                IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
                if (m_head == NULL)
                        break;

                if (len > 0 && m_head->m_pkthdr.len > len) {
                        /*
                         * This sending could be time consuming; let callers
                         * dispatch this packet sending (and the sending of
                         * any follow-up packets) to the tx taskqueue.
                         */
                        IF_PREPEND(&ifp->if_snd, m_head);
                        return 1;
                }

                txd = hn_txdesc_get(sc);
                if (txd == NULL) {
                        sc->hn_no_txdescs++;
                        IF_PREPEND(&ifp->if_snd, m_head);
                        atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
                        break;
                }

                packet = &txd->netvsc_pkt;
                packet->is_data_pkt = TRUE;
                /* Initialize it from the mbuf */
                packet->tot_data_buf_len = m_head->m_pkthdr.len;

                /*
                 * extension points to the area reserved for the
                 * rndis_filter_packet, which is placed just after
                 * the netvsc_packet (and rppi struct, if present;
                 * length is updated later).
                 */
                rndis_mesg = txd->rndis_msg;
                /* XXX not necessary */
                memset(rndis_mesg, 0, HN_RNDIS_MSG_LEN);
                rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG;

                rndis_pkt = &rndis_mesg->msg.packet;
                rndis_pkt->data_offset = sizeof(rndis_packet);
                rndis_pkt->data_length = packet->tot_data_buf_len;
                rndis_pkt->per_pkt_info_offset = sizeof(rndis_packet);

                rndis_msg_size = RNDIS_MESSAGE_SIZE(rndis_packet);

                /*
                 * If the Hyper-V infrastructure needs to embed a VLAN tag,
                 * initialize netvsc_packet and rppi struct values as needed.
                 */
                if (m_head->m_flags & M_VLANTAG) {
                        /*
                         * Set up some additional fields so the Hyper-V
                         * infrastructure will stuff the VLAN tag into
                         * the frame.
                         */
                        rndis_msg_size += RNDIS_VLAN_PPI_SIZE;

                        rppi = hv_set_rppi_data(rndis_mesg, RNDIS_VLAN_PPI_SIZE,
                            ieee_8021q_info);

                        /* VLAN info immediately follows rppi struct */
                        rppi_vlan_info = (ndis_8021q_info *)((char*)rppi +
                            rppi->per_packet_info_offset);
                        /* FreeBSD does not support CFI or priority */
                        rppi_vlan_info->u1.s1.vlan_id =
                            m_head->m_pkthdr.ether_vtag & 0xfff;
                }

                if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
                        struct ether_vlan_header *eh;
                        int ether_len;

                        eh = mtod(m_head, struct ether_vlan_header*);
                        if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
                                ether_len = ETHER_HDR_LEN +
                                    ETHER_VLAN_ENCAP_LEN;
                        } else {
                                ether_len = ETHER_HDR_LEN;
                        }

                        rndis_msg_size += RNDIS_TSO_PPI_SIZE;
                        rppi = hv_set_rppi_data(rndis_mesg, RNDIS_TSO_PPI_SIZE,
                            tcp_large_send_info);

                        tso_info = (rndis_tcp_tso_info *)((char *)rppi +
                            rppi->per_packet_info_offset);
                        tso_info->lso_v2_xmit.type =
                            RNDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;

#ifdef INET
                        if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) {
                                struct ip *ip =
                                    (struct ip *)(m_head->m_data + ether_len);
                                unsigned long iph_len = ip->ip_hl << 2;
                                struct tcphdr *th =
                                    (struct tcphdr *)((caddr_t)ip + iph_len);

                                tso_info->lso_v2_xmit.ip_version =
                                    RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV4;
                                ip->ip_len = 0;
                                ip->ip_sum = 0;

                                th->th_sum = in_pseudo(ip->ip_src.s_addr,
                                    ip->ip_dst.s_addr, htons(IPPROTO_TCP));
                        }
#endif
#if defined(INET6) && defined(INET)
                        else
#endif
#ifdef INET6
                        {
                                struct ip6_hdr *ip6 = (struct ip6_hdr *)
                                    (m_head->m_data + ether_len);
                                struct tcphdr *th = (struct tcphdr *)(ip6 + 1);

                                tso_info->lso_v2_xmit.ip_version =
                                    RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV6;
                                ip6->ip6_plen = 0;
                                th->th_sum =
                                    in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
                        }
#endif
                        tso_info->lso_v2_xmit.tcp_header_offset = 0;
                        tso_info->lso_v2_xmit.mss = m_head->m_pkthdr.tso_segsz;
                } else if (m_head->m_pkthdr.csum_flags & sc->hn_csum_assist) {
                        rndis_msg_size += RNDIS_CSUM_PPI_SIZE;
                        rppi = hv_set_rppi_data(rndis_mesg, RNDIS_CSUM_PPI_SIZE,
                            tcpip_chksum_info);
                        csum_info = (rndis_tcp_ip_csum_info *)((char*)rppi +
                            rppi->per_packet_info_offset);

                        csum_info->xmit.is_ipv4 = 1;
                        if (m_head->m_pkthdr.csum_flags & CSUM_IP)
                                csum_info->xmit.ip_header_csum = 1;

                        if (m_head->m_pkthdr.csum_flags & CSUM_TCP) {
                                csum_info->xmit.tcp_csum = 1;
                                csum_info->xmit.tcp_header_offset = 0;
                        } else if (m_head->m_pkthdr.csum_flags & CSUM_UDP) {
                                csum_info->xmit.udp_csum = 1;
                        }
                }

                rndis_mesg->msg_len = packet->tot_data_buf_len + rndis_msg_size;
                packet->tot_data_buf_len = rndis_mesg->msg_len;

                /* send packet with send buffer */
                if (packet->tot_data_buf_len < sc->hn_tx_chimney_size) {
                        uint32_t send_buf_section_idx;

                        send_buf_section_idx =
                            hv_nv_get_next_send_section(net_dev);
                        if (send_buf_section_idx !=
                            NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) {
                                uint8_t *dest = ((uint8_t *)net_dev->send_buf +
                                    (send_buf_section_idx *
                                     net_dev->send_section_size));

                                memcpy(dest, rndis_mesg, rndis_msg_size);
                                dest += rndis_msg_size;

                                m_copydata(m_head, 0, m_head->m_pkthdr.len,
                                    dest);

                                packet->send_buf_section_idx =
                                    send_buf_section_idx;
                                packet->send_buf_section_size =
                                    packet->tot_data_buf_len;
                                packet->page_buf_count = 0;
                                sc->hn_tx_chimney++;
                                goto do_send;
                        }
                }

                error = hn_txdesc_dmamap_load(sc, txd, &m_head, segs, &nsegs);
                if (error) {
                        int freed;

                        /*
                         * This mbuf is not linked w/ the txd yet, so free
                         * it now.
                         */
                        m_freem(m_head);
                        freed = hn_txdesc_put(sc, txd);
                        KASSERT(freed != 0,
                            ("fail to free txd upon txdma error"));

                        sc->hn_txdma_failed++;
                        if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
                        continue;
                }

                packet->page_buf_count = nsegs +
                    HV_RF_NUM_TX_RESERVED_PAGE_BUFS;

                /* send packet with page buffer */
                packet->page_buffers[0].pfn = atop(txd->rndis_msg_paddr);
                packet->page_buffers[0].offset =
                    txd->rndis_msg_paddr & PAGE_MASK;
                packet->page_buffers[0].length = rndis_msg_size;

                /*
                 * Fill the page buffers with mbuf info starting at index
                 * HV_RF_NUM_TX_RESERVED_PAGE_BUFS.
                 */
                for (i = 0; i < nsegs; ++i) {
                        hv_vmbus_page_buffer *pb = &packet->page_buffers[
                            i + HV_RF_NUM_TX_RESERVED_PAGE_BUFS];

                        pb->pfn = atop(segs[i].ds_addr);
                        pb->offset = segs[i].ds_addr & PAGE_MASK;
                        pb->length = segs[i].ds_len;
                }

                packet->send_buf_section_idx =
                    NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX;
                packet->send_buf_section_size = 0;

do_send:
                txd->m = m_head;

                /* Set the completion routine */
                packet->compl.send.on_send_completion = netvsc_xmit_completion;
                packet->compl.send.send_completion_context = packet;
                packet->compl.send.send_completion_tid =
                    (uint64_t)(uintptr_t)txd;

again:
                /*
                 * Make sure that txd is not freed before ETHER_BPF_MTAP.
                 */
                hn_txdesc_hold(txd);
                error = hv_nv_on_send(device_ctx, packet);
                if (!error) {
                        ETHER_BPF_MTAP(ifp, m_head);
                        if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
                }
                hn_txdesc_put(sc, txd);

                if (__predict_false(error)) {
                        int freed;

                        /*
                         * This should "really rarely" happen.
                         *
                         * XXX Too many RX to be acked or too many sideband
                         * commands to run?  Ask netvsc_channel_rollup()
                         * to kick start later.
                         */
                        sc->hn_txeof = 1;
                        if (!send_failed) {
                                sc->hn_send_failed++;
                                send_failed = 1;
                                /*
                                 * Try sending again after setting hn_txeof,
                                 * in case we missed the last
                                 * netvsc_channel_rollup().
                                 */
                                goto again;
                        }
                        if_printf(ifp, "send failed\n");

                        /*
                         * This mbuf will be prepended, don't free it
                         * in hn_txdesc_put(); only unload it from the
                         * DMA map in hn_txdesc_put(), if it was loaded.
                         */
                        txd->m = NULL;
                        freed = hn_txdesc_put(sc, txd);
                        KASSERT(freed != 0,
                            ("fail to free txd upon send error"));

                        sc->hn_send_failed++;
                        IF_PREPEND(&ifp->if_snd, m_head);
                        atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
                        break;
                }
        }
        return 0;
}

/*
 * Link up/down notification
 */
void
netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status)
{
        hn_softc_t *sc = device_get_softc(device_obj->device);

        if (sc == NULL) {
                return;
        }

        if (status == 1) {
                sc->hn_carrier = 1;
        } else {
                sc->hn_carrier = 0;
        }
}

/*
 * Append the specified data to the indicated mbuf chain,
 * extending the mbuf chain if the new data does not fit in
 * existing space.
 *
 * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
 * There should be an equivalent in the kernel mbuf code,
 * but there does not appear to be one yet.
 *
 * Differs from m_append() in that additional mbufs are
 * allocated with cluster size MJUMPAGESIZE, and filled
 * accordingly.
 *
 * Return 1 if able to complete the job; otherwise 0.
 */
1089 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
1090 {
1091         struct mbuf *m, *n;
1092         int remainder, space;
1093
1094         for (m = m0; m->m_next != NULL; m = m->m_next)
1095                 ;
1096         remainder = len;
1097         space = M_TRAILINGSPACE(m);
1098         if (space > 0) {
1099                 /*
1100                  * Copy into available space.
1101                  */
1102                 if (space > remainder)
1103                         space = remainder;
1104                 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
1105                 m->m_len += space;
1106                 cp += space;
1107                 remainder -= space;
1108         }
1109         while (remainder > 0) {
1110                 /*
1111                  * Allocate a new mbuf; could check space
1112                  * and allocate a cluster instead.
1113                  */
1114                 n = m_getjcl(M_DONTWAIT, m->m_type, 0, MJUMPAGESIZE);
1115                 if (n == NULL)
1116                         break;
1117                 n->m_len = min(MJUMPAGESIZE, remainder);
1118                 bcopy(cp, mtod(n, caddr_t), n->m_len);
1119                 cp += n->m_len;
1120                 remainder -= n->m_len;
1121                 m->m_next = n;
1122                 m = n;
1123         }
1124         if (m0->m_flags & M_PKTHDR)
1125                 m0->m_pkthdr.len += len - remainder;
1126
1127         return (remainder == 0);
1128 }
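
/*
 * Usage sketch (illustrative only; pkt_data/pkt_len are placeholders
 * for a received packet, see the real call in netvsc_recv() below):
 *
 *	struct mbuf *m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
 *	if (m != NULL && !hv_m_append(m, pkt_len, pkt_data))
 *		m_freem(m);	ran out of clusters mid-copy
 */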


/*
 * Called when we receive a data packet from the "wire" on the
 * specified device
 *
 * Note:  This is no longer used as a callback
 */
int
netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet,
    rndis_tcp_ip_csum_info *csum_info)
{
        hn_softc_t *sc = (hn_softc_t *)device_get_softc(device_ctx->device);
        struct mbuf *m_new;
        struct ifnet *ifp;
        device_t dev = device_ctx->device;
        int size, do_lro = 0, do_csum = 1;

        if (sc == NULL) {
                return (0); /* TODO: KYS how can this be! */
        }

        ifp = sc->hn_ifp;

        if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
                return (0);
        }

        /*
         * Bail out if packet contains more data than configured MTU.
         */
        if (packet->tot_data_buf_len > (ifp->if_mtu + ETHER_HDR_LEN)) {
                return (0);
        } else if (packet->tot_data_buf_len <= MHLEN) {
                m_new = m_gethdr(M_NOWAIT, MT_DATA);
                if (m_new == NULL)
                        return (0);
                memcpy(mtod(m_new, void *), packet->data,
                    packet->tot_data_buf_len);
                m_new->m_pkthdr.len = m_new->m_len = packet->tot_data_buf_len;
                sc->hn_small_pkts++;
        } else {
                /*
                 * Get an mbuf with a cluster.  For packets 2K or less,
                 * get a standard 2K cluster.  For anything larger, get a
                 * 4K cluster.  Any buffers larger than 4K can cause problems
                 * if looped around to the Hyper-V TX channel, so avoid them.
                 */
                size = MCLBYTES;
                if (packet->tot_data_buf_len > MCLBYTES) {
                        /* 4096 */
                        size = MJUMPAGESIZE;
                }

                m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
                if (m_new == NULL) {
                        device_printf(dev, "alloc mbuf failed.\n");
                        return (0);
                }

                hv_m_append(m_new, packet->tot_data_buf_len, packet->data);
        }
        m_new->m_pkthdr.rcvif = ifp;

        if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
                do_csum = 0;

        /* receive side checksum offload */
        if (csum_info != NULL) {
                /* IP csum offload */
                if (csum_info->receive.ip_csum_succeeded && do_csum) {
                        m_new->m_pkthdr.csum_flags |=
                            (CSUM_IP_CHECKED | CSUM_IP_VALID);
                        sc->hn_csum_ip++;
                }

                /* TCP/UDP csum offload */
                if ((csum_info->receive.tcp_csum_succeeded ||
                     csum_info->receive.udp_csum_succeeded) && do_csum) {
                        m_new->m_pkthdr.csum_flags |=
                            (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
                        m_new->m_pkthdr.csum_data = 0xffff;
                        if (csum_info->receive.tcp_csum_succeeded)
                                sc->hn_csum_tcp++;
                        else
                                sc->hn_csum_udp++;
                }

                if (csum_info->receive.ip_csum_succeeded &&
                    csum_info->receive.tcp_csum_succeeded)
                        do_lro = 1;
        } else {
                const struct ether_header *eh;
                uint16_t etype;
                int hoff;

                hoff = sizeof(*eh);
                if (m_new->m_len < hoff)
                        goto skip;
                eh = mtod(m_new, struct ether_header *);
                etype = ntohs(eh->ether_type);
                if (etype == ETHERTYPE_VLAN) {
                        const struct ether_vlan_header *evl;

                        hoff = sizeof(*evl);
                        if (m_new->m_len < hoff)
                                goto skip;
                        evl = mtod(m_new, struct ether_vlan_header *);
                        etype = ntohs(evl->evl_proto);
                }

                if (etype == ETHERTYPE_IP) {
                        int pr;

                        pr = hn_check_iplen(m_new, hoff);
                        if (pr == IPPROTO_TCP) {
                                if (do_csum &&
                                    (sc->hn_trust_hcsum & HN_TRUST_HCSUM_TCP)) {
                                        sc->hn_csum_trusted++;
                                        m_new->m_pkthdr.csum_flags |=
                                           (CSUM_IP_CHECKED | CSUM_IP_VALID |
                                            CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
                                        m_new->m_pkthdr.csum_data = 0xffff;
                                }
                                /* Rely on SW csum verification though... */
                                do_lro = 1;
                        } else if (pr == IPPROTO_UDP) {
                                if (do_csum &&
                                    (sc->hn_trust_hcsum & HN_TRUST_HCSUM_UDP)) {
                                        sc->hn_csum_trusted++;
                                        m_new->m_pkthdr.csum_flags |=
                                           (CSUM_IP_CHECKED | CSUM_IP_VALID |
                                            CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
                                        m_new->m_pkthdr.csum_data = 0xffff;
                                }
                        } else if (pr != IPPROTO_DONE && do_csum &&
                            (sc->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) {
                                sc->hn_csum_trusted++;
                                m_new->m_pkthdr.csum_flags |=
                                    (CSUM_IP_CHECKED | CSUM_IP_VALID);
                        }
                }
        }
skip:
        if ((packet->vlan_tci != 0) &&
            (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0) {
                m_new->m_pkthdr.ether_vtag = packet->vlan_tci;
                m_new->m_flags |= M_VLANTAG;
        }

        /*
         * Note:  Moved RX completion back to hv_nv_on_receive() so all
         * messages (not just data messages) will trigger a response.
         */

        ifp->if_ipackets++;

        if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
#if defined(INET) || defined(INET6)
                struct lro_ctrl *lro = &sc->hn_lro;

                if (lro->lro_cnt) {
                        sc->hn_lro_tried++;
                        if (tcp_lro_rx(lro, m_new, 0) == 0) {
                                /* DONE! */
                                return 0;
                        }
                }
#endif
        }

        /* We're not holding the lock here, so don't release it */
        (*ifp->if_input)(ifp, m_new);

        return (0);
}

void
netvsc_recv_rollup(struct hv_device *device_ctx)
{
#if defined(INET) || defined(INET6)
        hn_softc_t *sc = device_get_softc(device_ctx->device);
        struct lro_ctrl *lro = &sc->hn_lro;
        struct lro_entry *queued;

        while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
                SLIST_REMOVE_HEAD(&lro->lro_active, next);
                tcp_lro_flush(lro, queued);
        }
#endif
}

/*
 * Rules for using sc->temp_unusable:
 * 1.  sc->temp_unusable can only be read or written while holding NV_LOCK()
 * 2.  code reading sc->temp_unusable under NV_LOCK(), and finding
 *     sc->temp_unusable set, must release NV_LOCK() and exit
 * 3.  to retain exclusive control of the interface,
 *     sc->temp_unusable must be set by code before releasing NV_LOCK()
 * 4.  only code setting sc->temp_unusable can clear sc->temp_unusable
 * 5.  code setting sc->temp_unusable must eventually clear sc->temp_unusable
 */
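
/*
 * The rules above yield the following claim/retry pattern (a sketch;
 * the real code in hn_ioctl() below also bounds the number of retries):
 *
 *	for (;;) {
 *		NV_LOCK(sc);
 *		if (!sc->temp_unusable) {
 *			sc->temp_unusable = TRUE;	rule 3
 *			NV_UNLOCK(sc);
 *			break;
 *		}
 *		NV_UNLOCK(sc);				rule 2
 *		DELAY(5 * 1000);
 *	}
 *	... reconfigure the interface ...
 *	NV_LOCK(sc);
 *	sc->temp_unusable = FALSE;			rules 4 and 5
 *	NV_UNLOCK(sc);
 */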
1333
1334 /*
1335  * Standard ioctl entry point.  Called when the user wants to configure
1336  * the interface.
1337  */
1338 static int
1339 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1340 {
1341         hn_softc_t *sc = ifp->if_softc;
1342         struct ifreq *ifr = (struct ifreq *)data;
1343 #ifdef INET
1344         struct ifaddr *ifa = (struct ifaddr *)data;
1345 #endif
1346         netvsc_device_info device_info;
1347         struct hv_device *hn_dev;
1348         int mask, error = 0;
1349         int retry_cnt = 500;
1350         
1351         switch(cmd) {
1352
1353         case SIOCSIFADDR:
1354 #ifdef INET
1355                 if (ifa->ifa_addr->sa_family == AF_INET) {
1356                         ifp->if_flags |= IFF_UP;
1357                         if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
1358                                 hn_ifinit(sc);
1359                         arp_ifinit(ifp, ifa);
1360                 } else
1361 #endif
1362                 error = ether_ioctl(ifp, cmd, data);
1363                 break;
1364         case SIOCSIFMTU:
1365                 hn_dev = vmbus_get_devctx(sc->hn_dev);
1366
1367                 /* Check MTU value change */
1368                 if (ifp->if_mtu == ifr->ifr_mtu)
1369                         break;
1370
1371                 if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) {
1372                         error = EINVAL;
1373                         break;
1374                 }
1375
1376                 /* Obtain and record requested MTU */
1377                 ifp->if_mtu = ifr->ifr_mtu;
1378                 /*
1379                  * Make sure that LRO high watermark is still valid,
1380                  * after MTU change (the 2*MTU limit).
1381                  */
1382                 if (!HN_LRO_HIWAT_ISVALID(sc, sc->hn_lro_hiwat))
1383                         hn_set_lro_hiwat(sc, HN_LRO_HIWAT_MTULIM(ifp));
1384
1385                 do {
1386                         NV_LOCK(sc);
1387                         if (!sc->temp_unusable) {
1388                                 sc->temp_unusable = TRUE;
1389                                 retry_cnt = -1;
1390                         }
1391                         NV_UNLOCK(sc);
1392                         if (retry_cnt > 0) {
1393                                 retry_cnt--;
1394                                 DELAY(5 * 1000);
1395                         }
1396                 } while (retry_cnt > 0);
1397
1398                 if (retry_cnt == 0) {
1399                         error = EINVAL;
1400                         break;
1401                 }
1402
1403                 /* We must remove and re-add the device to make the new
1404                  * MTU take effect.  This includes tearing down, but not
1405                  * deleting, the channel, then bringing it back up.
1406                  */
1407                 error = hv_rf_on_device_remove(hn_dev, HV_RF_NV_RETAIN_CHANNEL);
1408                 if (error) {
1409                         NV_LOCK(sc);
1410                         sc->temp_unusable = FALSE;
1411                         NV_UNLOCK(sc);
1412                         break;
1413                 }
1414                 error = hv_rf_on_device_add(hn_dev, &device_info);
1415                 if (error) {
1416                         NV_LOCK(sc);
1417                         sc->temp_unusable = FALSE;
1418                         NV_UNLOCK(sc);
1419                         break;
1420                 }
1421
1422                 sc->hn_tx_chimney_max = sc->net_dev->send_section_size;
1423                 if (sc->hn_tx_chimney_size > sc->hn_tx_chimney_max)
1424                         sc->hn_tx_chimney_size = sc->hn_tx_chimney_max;
1425                 hn_ifinit_locked(sc);
1426
1427                 NV_LOCK(sc);
1428                 sc->temp_unusable = FALSE;
1429                 NV_UNLOCK(sc);
1430                 break;
1431         case SIOCSIFFLAGS:
1432                 do {
1433                         NV_LOCK(sc);
1434                         if (!sc->temp_unusable) {
1435                                 sc->temp_unusable = TRUE;
1436                                 retry_cnt = -1;
1437                         }
1438                         NV_UNLOCK(sc);
1439                         if (retry_cnt > 0) {
1440                                 retry_cnt--;
1441                                 DELAY(5 * 1000);
1442                         }
1443                 } while (retry_cnt > 0);
1444
1445                 if (retry_cnt == 0) {
1446                         error = EINVAL;
1447                         break;
1448                 }
1449
1450                 if (ifp->if_flags & IFF_UP) {
1451                         /*
1452                          * If only the state of the PROMISC flag changed,
1453                          * then just use the 'set promisc mode' command
1454                          * instead of reinitializing the entire NIC.  A
1455                          * full re-init tears down and rebuilds the
1456                          * device, which can take a second or two, so it
1457                          * is worth avoiding for a simple flag change.
1458                          */
1459 #ifdef notyet
1460                         /* Fixme:  Promiscuous mode? */
1461                         if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
1462                             ifp->if_flags & IFF_PROMISC &&
1463                             !(sc->hn_if_flags & IFF_PROMISC)) {
1464                                 /* do something here for Hyper-V */
1465                         } else if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
1466                             !(ifp->if_flags & IFF_PROMISC) &&
1467                             sc->hn_if_flags & IFF_PROMISC) {
1468                                 /* do something here for Hyper-V */
1469                         } else
1470 #endif
1471                                 hn_ifinit_locked(sc);
1472                 } else {
1473                         if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1474                                 hn_stop(sc);
1475                         }
1476                 }
1477                 NV_LOCK(sc);
1478                 sc->temp_unusable = FALSE;
1479                 NV_UNLOCK(sc);
1480                 sc->hn_if_flags = ifp->if_flags;
1481                 error = 0;
1482                 break;
1483         case SIOCSIFCAP:
1484                 NV_LOCK(sc);
1485
1486                 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
1487                 if (mask & IFCAP_TXCSUM) {
1488                         ifp->if_capenable ^= IFCAP_TXCSUM;
1489                         if (ifp->if_capenable & IFCAP_TXCSUM)
1490                                 ifp->if_hwassist |= sc->hn_csum_assist;
1491                         else
1492                                 ifp->if_hwassist &= ~sc->hn_csum_assist;
1493                 }
1494
1495                 if (mask & IFCAP_RXCSUM)
1496                         ifp->if_capenable ^= IFCAP_RXCSUM;
1497
1498                 if (mask & IFCAP_LRO)
1499                         ifp->if_capenable ^= IFCAP_LRO;
1500
1501                 if (mask & IFCAP_TSO4) {
1502                         ifp->if_capenable ^= IFCAP_TSO4;
1503                         if (ifp->if_capenable & IFCAP_TSO4)
1504                                 ifp->if_hwassist |= CSUM_IP_TSO;
1505                         else
1506                                 ifp->if_hwassist &= ~CSUM_IP_TSO;
1507                 }
1508
1509                 if (mask & IFCAP_TSO6) {
1510                         ifp->if_capenable ^= IFCAP_TSO6;
1511                         if (ifp->if_capenable & IFCAP_TSO6)
1512                                 ifp->if_hwassist |= CSUM_IP6_TSO;
1513                         else
1514                                 ifp->if_hwassist &= ~CSUM_IP6_TSO;
1515                 }
1516
1517                 NV_UNLOCK(sc);
1518                 error = 0;
1519                 break;
1520         case SIOCADDMULTI:
1521         case SIOCDELMULTI:
1522 #ifdef notyet
1523                 /* Fixme:  Multicast mode? */
1524                 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1525                         NV_LOCK(sc);
1526                         netvsc_setmulti(sc);
1527                         NV_UNLOCK(sc);
1528                         error = 0;
1529                 }
1530 #endif
1531                 error = EINVAL; /* multicast filtering is not supported yet */
1532                 break;
1533         case SIOCSIFMEDIA:
1534         case SIOCGIFMEDIA:
1535                 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
1536                 break;
1537         default:
1538                 error = ether_ioctl(ifp, cmd, data);
1539                 break;
1540         }
1541
1542         return (error);
1543 }
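/*
 * Illustrative sketch, not part of the driver: the SIOCSIFMTU case
 * above is what a userland ioctl(2) on an AF_INET socket ultimately
 * reaches.  The helper name, interface name, and MTU value are
 * hypothetical.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <string.h>
#include <unistd.h>

static int
set_mtu(const char *ifname, int mtu)
{
	struct ifreq ifr;
	int s, error;

	s = socket(AF_INET, SOCK_DGRAM, 0);
	if (s < 0)
		return (-1);
	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
	ifr.ifr_mtu = mtu;
	error = ioctl(s, SIOCSIFMTU, &ifr);	/* lands in hn_ioctl() */
	close(s);
	return (error);
}
#endif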
1544
1545 /*
1546  * Stop the device: mark the interface down and close the RNDIS device.
1547  */
1548 static void
1549 hn_stop(hn_softc_t *sc)
1550 {
1551         struct ifnet *ifp;
1552         int ret;
1553         struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
1554
1555         ifp = sc->hn_ifp;
1556
1557         if (bootverbose)
1558                 printf("Closing Device ...\n");
1559
1560         atomic_clear_int(&ifp->if_drv_flags,
1561             (IFF_DRV_RUNNING | IFF_DRV_OACTIVE));
1562         if_link_state_change(ifp, LINK_STATE_DOWN);
1563         sc->hn_initdone = 0;
1564
1565         ret = hv_rf_on_close(device_ctx);
1566 }
1567
1568 /*
1569  * FreeBSD transmit entry point
1570  */
1571 static void
1572 hn_start(struct ifnet *ifp)
1573 {
1574         hn_softc_t *sc;
1575
1576         sc = ifp->if_softc;
1577         if (NV_TRYLOCK(sc)) {
1578                 int sched;
1579
1580                 sched = hn_start_locked(ifp, sc->hn_direct_tx_size);
1581                 NV_UNLOCK(sc);
1582                 if (!sched)
1583                         return;
1584         }
1585         taskqueue_enqueue_fast(sc->hn_tx_taskq, &sc->hn_start_task);
1586 }
1587
1588 static void
1589 hn_start_txeof(struct ifnet *ifp)
1590 {
1591         hn_softc_t *sc;
1592
1593         sc = ifp->if_softc;
1594         if (NV_TRYLOCK(sc)) {
1595                 int sched;
1596
1597                 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1598                 sched = hn_start_locked(ifp, sc->hn_direct_tx_size);
1599                 NV_UNLOCK(sc);
1600                 if (sched) {
1601                         taskqueue_enqueue_fast(sc->hn_tx_taskq,
1602                             &sc->hn_start_task);
1603                 }
1604         } else {
1605                 /*
1606                  * Release OACTIVE early, in the hope that other
1607                  * senders can catch up.  The task will clear the
1608                  * flag again under the NV_LOCK to avoid possible
1609                  * races.
1610                  */
1611                 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1612                 taskqueue_enqueue_fast(sc->hn_tx_taskq, &sc->hn_txeof_task);
1613         }
1614 }
1615
1616 /*
1617  * Open the device and mark the interface running and the link up.
1618  */
1619 static void
1620 hn_ifinit_locked(hn_softc_t *sc)
1621 {
1622         struct ifnet *ifp;
1623         struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
1624         int ret;
1625
1626         ifp = sc->hn_ifp;
1627
1628         if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1629                 return;
1630         }
1631
1632         hv_promisc_mode = 1;
1633
1634         ret = hv_rf_on_open(device_ctx);
1635         if (ret != 0) {
1636                 return;
1637         } else {
1638                 sc->hn_initdone = 1;
1639         }
1640         atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1641         atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
1642         if_link_state_change(ifp, LINK_STATE_UP);
1643 }
1644
1645 /*
1646  * Standard init entry point, serialized via sc->temp_unusable.
1647  */
1648 static void
1649 hn_ifinit(void *xsc)
1650 {
1651         hn_softc_t *sc = xsc;
1652
1653         NV_LOCK(sc);
1654         if (sc->temp_unusable) {
1655                 NV_UNLOCK(sc);
1656                 return;
1657         }
1658         sc->temp_unusable = TRUE;
1659         NV_UNLOCK(sc);
1660
1661         hn_ifinit_locked(sc);
1662
1663         NV_LOCK(sc);
1664         sc->temp_unusable = FALSE;
1665         NV_UNLOCK(sc);
1666 }
1667
1668 #ifdef LATER
1669 /*
1670  * Transmit watchdog: reset the interface after a transmit timeout.
1671  */
1672 static void
1673 hn_watchdog(struct ifnet *ifp)
1674 {
1675         hn_softc_t *sc;
1676         sc = ifp->if_softc;
1677
1678         printf("hn%d: watchdog timeout -- resetting\n", sc->hn_unit);
1679         hn_ifinit(sc);    /*???*/
1680         ifp->if_oerrors++;
1681 }
1682 #endif
1683
1684 #ifdef HN_LRO_HIWAT
1685 static int
1686 hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS)
1687 {
1688         struct hn_softc *sc = arg1;
1689         int hiwat, error;
1690
1691         hiwat = sc->hn_lro_hiwat;
1692         error = sysctl_handle_int(oidp, &hiwat, 0, req);
1693         if (error || req->newptr == NULL)
1694                 return error;
1695
1696         if (!HN_LRO_HIWAT_ISVALID(sc, hiwat))
1697                 return EINVAL;
1698
1699         if (sc->hn_lro_hiwat != hiwat)
1700                 hn_set_lro_hiwat(sc, hiwat);
1701         return 0;
1702 }
1703 #endif  /* HN_LRO_HIWAT */
1704
1705 static int
1706 hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS)
1707 {
1708         struct hn_softc *sc = arg1;
1709         int hcsum = arg2;
1710         int on, error;
1711
1712         on = 0;
1713         if (sc->hn_trust_hcsum & hcsum)
1714                 on = 1;
1715
1716         error = sysctl_handle_int(oidp, &on, 0, req);
1717         if (error || req->newptr == NULL)
1718                 return error;
1719
1720         NV_LOCK(sc);
1721         if (on)
1722                 sc->hn_trust_hcsum |= hcsum;
1723         else
1724                 sc->hn_trust_hcsum &= ~hcsum;
1725         NV_UNLOCK(sc);
1726         return 0;
1727 }
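/*
 * Illustrative sketch, not part of the driver: how the handler above
 * would be wired up at attach time.  arg2 carries the checksum flag
 * to toggle; the oid name, description, HN_TRUST_HCSUM_TCP constant,
 * and the dev/sc variables are assumptions based on this handler's
 * use of sc->hn_trust_hcsum.
 */
#if 0
	SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev),
	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
	    "trust_hosttcp", CTLTYPE_INT | CTLFLAG_RW,
	    sc, HN_TRUST_HCSUM_TCP, hn_trust_hcsum_sysctl, "I",
	    "Trust TCP checksum verification done by the host");
#endif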
1728
1729 static int
1730 hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS)
1731 {
1732         struct hn_softc *sc = arg1;
1733         int chimney_size, error;
1734
1735         chimney_size = sc->hn_tx_chimney_size;
1736         error = sysctl_handle_int(oidp, &chimney_size, 0, req);
1737         if (error || req->newptr == NULL)
1738                 return error;
1739
1740         if (chimney_size > sc->hn_tx_chimney_max || chimney_size <= 0)
1741                 return EINVAL;
1742
1743         if (sc->hn_tx_chimney_size != chimney_size)
1744                 sc->hn_tx_chimney_size = chimney_size;
1745         return 0;
1746 }
1747
1748 static int
1749 hn_check_iplen(const struct mbuf *m, int hoff)
1750 {
1751         const struct ip *ip;
1752         int len, iphlen, iplen;
1753         const struct tcphdr *th;
1754         int thoff;                              /* TCP data offset */
1755
1756         len = hoff + sizeof(struct ip);
1757
1758         /* The packet must be at least the size of an IP header. */
1759         if (m->m_pkthdr.len < len)
1760                 return IPPROTO_DONE;
1761
1762         /* The fixed IP header must reside completely in the first mbuf. */
1763         if (m->m_len < len)
1764                 return IPPROTO_DONE;
1765
1766         ip = mtodo(m, hoff);
1767
1768         /* Bound check the packet's stated IP header length. */
1769         iphlen = ip->ip_hl << 2;
1770         if (iphlen < sizeof(struct ip))         /* minimum header length */
1771                 return IPPROTO_DONE;
1772
1773         /* The full IP header must reside completely in the one mbuf. */
1774         if (m->m_len < hoff + iphlen)
1775                 return IPPROTO_DONE;
1776
1777         iplen = ntohs(ip->ip_len);
1778
1779         /*
1780          * Check that the amount of data in the buffers is at
1781          * least as much as the IP header would have us expect.
1782          */
1783         if (m->m_pkthdr.len < hoff + iplen)
1784                 return IPPROTO_DONE;
1785
1786         /*
1787          * Ignore IP fragments.
1788          */
1789         if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
1790                 return IPPROTO_DONE;
1791
1792         /*
1793          * The TCP/IP or UDP/IP header must be entirely contained within
1794          * the first mbuf of the packet.
1795          */
1796         switch (ip->ip_p) {
1797         case IPPROTO_TCP:
1798                 if (iplen < iphlen + sizeof(struct tcphdr))
1799                         return IPPROTO_DONE;
1800                 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
1801                         return IPPROTO_DONE;
1802                 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
1803                 thoff = th->th_off << 2;
1804                 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
1805                         return IPPROTO_DONE;
1806                 if (m->m_len < hoff + iphlen + thoff)
1807                         return IPPROTO_DONE;
1808                 break;
1809         case IPPROTO_UDP:
1810                 if (iplen < iphlen + sizeof(struct udphdr))
1811                         return IPPROTO_DONE;
1812                 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
1813                         return IPPROTO_DONE;
1814                 break;
1815         default:
1816                 if (iplen < iphlen)
1817                         return IPPROTO_DONE;
1818                 break;
1819         }
1820         return ip->ip_p;
1821 }
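/*
 * Illustrative sketch, not part of the driver: a typical use of
 * hn_check_iplen() on the receive path, deciding whether a frame is
 * a sane, unfragmented TCP/IPv4 packet (e.g. before LRO, or before
 * trusting host-verified checksums).  The helper name is
 * hypothetical; ETHER_HDR_LEN is the link-level header offset.
 */
#if 0
static int
hn_rx_is_tcp4(const struct mbuf *m)
{
	return (hn_check_iplen(m, ETHER_HDR_LEN) == IPPROTO_TCP);
}
#endif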
1822
1823 static void
1824 hn_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error)
1825 {
1826         bus_addr_t *paddr = arg;
1827
1828         if (error)
1829                 return;
1830
1831         KASSERT(nseg == 1, ("too many segments %d!", nseg));
1832         *paddr = segs->ds_addr;
1833 }
1834
1835 static int
1836 hn_create_tx_ring(struct hn_softc *sc)
1837 {
1838         bus_dma_tag_t parent_dtag;
1839         int error, i;
1840
1841         sc->hn_txdesc_cnt = HN_TX_DESC_CNT;
1842         sc->hn_txdesc = malloc(sizeof(struct hn_txdesc) * sc->hn_txdesc_cnt,
1843             M_NETVSC, M_WAITOK | M_ZERO);
1844         SLIST_INIT(&sc->hn_txlist);
1845         mtx_init(&sc->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
1846
1847         parent_dtag = bus_get_dma_tag(sc->hn_dev);
1848
1849         /* DMA tag for RNDIS messages. */
1850         error = bus_dma_tag_create(parent_dtag, /* parent */
1851             HN_RNDIS_MSG_ALIGN,         /* alignment */
1852             HN_RNDIS_MSG_BOUNDARY,      /* boundary */
1853             BUS_SPACE_MAXADDR,          /* lowaddr */
1854             BUS_SPACE_MAXADDR,          /* highaddr */
1855             NULL, NULL,                 /* filter, filterarg */
1856             HN_RNDIS_MSG_LEN,           /* maxsize */
1857             1,                          /* nsegments */
1858             HN_RNDIS_MSG_LEN,           /* maxsegsize */
1859             0,                          /* flags */
1860             NULL,                       /* lockfunc */
1861             NULL,                       /* lockfuncarg */
1862             &sc->hn_tx_rndis_dtag);
1863         if (error) {
1864                 device_printf(sc->hn_dev, "failed to create rndis dmatag\n");
1865                 return error;
1866         }
1867
1868         /* DMA tag for data. */
1869         error = bus_dma_tag_create(parent_dtag, /* parent */
1870             1,                          /* alignment */
1871             HN_TX_DATA_BOUNDARY,        /* boundary */
1872             BUS_SPACE_MAXADDR,          /* lowaddr */
1873             BUS_SPACE_MAXADDR,          /* highaddr */
1874             NULL, NULL,                 /* filter, filterarg */
1875             HN_TX_DATA_MAXSIZE,         /* maxsize */
1876             HN_TX_DATA_SEGCNT_MAX,      /* nsegments */
1877             HN_TX_DATA_SEGSIZE,         /* maxsegsize */
1878             0,                          /* flags */
1879             NULL,                       /* lockfunc */
1880             NULL,                       /* lockfuncarg */
1881             &sc->hn_tx_data_dtag);
1882         if (error) {
1883                 device_printf(sc->hn_dev, "failed to create data dmatag\n");
1884                 return error;
1885         }
1886
1887         for (i = 0; i < sc->hn_txdesc_cnt; ++i) {
1888                 struct hn_txdesc *txd = &sc->hn_txdesc[i];
1889
1890                 txd->sc = sc;
1891
1892                 /*
1893                  * Allocate and load RNDIS messages.
1894                  */
1895                 error = bus_dmamem_alloc(sc->hn_tx_rndis_dtag,
1896                     (void **)&txd->rndis_msg,
1897                     BUS_DMA_WAITOK | BUS_DMA_COHERENT,
1898                     &txd->rndis_msg_dmap);
1899                 if (error) {
1900                         device_printf(sc->hn_dev,
1901                             "failed to allocate rndis_msg, %d\n", i);
1902                         return error;
1903                 }
1904
1905                 error = bus_dmamap_load(sc->hn_tx_rndis_dtag,
1906                     txd->rndis_msg_dmap,
1907                     txd->rndis_msg, HN_RNDIS_MSG_LEN,
1908                     hn_dma_map_paddr, &txd->rndis_msg_paddr,
1909                     BUS_DMA_NOWAIT);
1910                 if (error) {
1911                         device_printf(sc->hn_dev,
1912                             "failed to load rndis_msg, %d\n", i);
1913                         bus_dmamem_free(sc->hn_tx_rndis_dtag,
1914                             txd->rndis_msg, txd->rndis_msg_dmap);
1915                         return error;
1916                 }
1917
1918                 /* DMA map for TX data. */
1919                 error = bus_dmamap_create(sc->hn_tx_data_dtag, 0,
1920                     &txd->data_dmap);
1921                 if (error) {
1922                         device_printf(sc->hn_dev,
1923                             "failed to allocate tx data dmamap\n");
1924                         bus_dmamap_unload(sc->hn_tx_rndis_dtag,
1925                             txd->rndis_msg_dmap);
1926                         bus_dmamem_free(sc->hn_tx_rndis_dtag,
1927                             txd->rndis_msg, txd->rndis_msg_dmap);
1928                         return error;
1929                 }
1930
1931                 /* All set; put it on the free list. */
1932                 txd->flags |= HN_TXD_FLAG_ONLIST;
1933                 SLIST_INSERT_HEAD(&sc->hn_txlist, txd, link);
1934         }
1935         sc->hn_txdesc_avail = sc->hn_txdesc_cnt;
1936
1937         return 0;
1938 }
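/*
 * Illustrative sketch, not part of the driver: taking a descriptor
 * off the free list built above, under the hn_txlist_spin lock.  The
 * helper name is hypothetical.
 */
#if 0
static struct hn_txdesc *
hn_txdesc_get_example(struct hn_softc *sc)
{
	struct hn_txdesc *txd;

	mtx_lock_spin(&sc->hn_txlist_spin);
	txd = SLIST_FIRST(&sc->hn_txlist);
	if (txd != NULL) {
		KASSERT(sc->hn_txdesc_avail > 0,
		    ("txdesc count mismatch"));
		sc->hn_txdesc_avail--;
		SLIST_REMOVE_HEAD(&sc->hn_txlist, link);
		txd->flags &= ~HN_TXD_FLAG_ONLIST;
	}
	mtx_unlock_spin(&sc->hn_txlist_spin);
	return (txd);
}
#endif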
1939
1940 static void
1941 hn_destroy_tx_ring(struct hn_softc *sc)
1942 {
1943         struct hn_txdesc *txd;
1944
1945         while ((txd = SLIST_FIRST(&sc->hn_txlist)) != NULL) {
1946                 KASSERT(txd->m == NULL, ("still has mbuf installed"));
1947                 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1948                     ("still dma mapped"));
1949                 SLIST_REMOVE_HEAD(&sc->hn_txlist, link);
1950
1951                 bus_dmamap_unload(sc->hn_tx_rndis_dtag,
1952                     txd->rndis_msg_dmap);
1953                 bus_dmamem_free(sc->hn_tx_rndis_dtag,
1954                     txd->rndis_msg, txd->rndis_msg_dmap);
1955
1956                 bus_dmamap_destroy(sc->hn_tx_data_dtag, txd->data_dmap);
1957         }
1958
1959         if (sc->hn_tx_data_dtag != NULL)
1960                 bus_dma_tag_destroy(sc->hn_tx_data_dtag);
1961         if (sc->hn_tx_rndis_dtag != NULL)
1962                 bus_dma_tag_destroy(sc->hn_tx_rndis_dtag);
1963         free(sc->hn_txdesc, M_NETVSC);
1964         mtx_destroy(&sc->hn_txlist_spin);
1965 }
1966
1967 static void
1968 hn_start_taskfunc(void *xsc, int pending __unused)
1969 {
1970         struct hn_softc *sc = xsc;
1971
1972         NV_LOCK(sc);
1973         hn_start_locked(sc->hn_ifp, 0);
1974         NV_UNLOCK(sc);
1975 }
1976
1977 static void
1978 hn_txeof_taskfunc(void *xsc, int pending __unused)
1979 {
1980         struct hn_softc *sc = xsc;
1981         struct ifnet *ifp = sc->hn_ifp;
1982
1983         NV_LOCK(sc);
1984         atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1985         hn_start_locked(ifp, 0);
1986         NV_UNLOCK(sc);
1987 }
1988
1989 static device_method_t netvsc_methods[] = {
1990         /* Device interface */
1991         DEVMETHOD(device_probe,         netvsc_probe),
1992         DEVMETHOD(device_attach,        netvsc_attach),
1993         DEVMETHOD(device_detach,        netvsc_detach),
1994         DEVMETHOD(device_shutdown,      netvsc_shutdown),
1995
1996         { 0, 0 }
1997 };
1998
1999 static driver_t netvsc_driver = {
2000         NETVSC_DEVNAME,
2001         netvsc_methods,
2002         sizeof(hn_softc_t)
2003 };
2004
2005 static devclass_t netvsc_devclass;
2006
2007 DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0);
2008 MODULE_VERSION(hn, 1);
2009 MODULE_DEPEND(hn, vmbus, 1, 1, 1);
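/*
 * Illustrative note, not part of the driver: per the declarations
 * above, the driver attaches to vmbus as "hn" (devices hn0, hn1, ...).
 * On FreeBSD 10 the module is typically built and loaded as
 * hv_netvsc.ko, though the kld file name comes from the module build
 * glue, not from this file.
 */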