1 /*-
2  * Copyright (c) 2010-2012 Citrix Inc.
3  * Copyright (c) 2009-2012 Microsoft Corp.
4  * Copyright (c) 2012 NetApp Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28
29 /*-
30  * Copyright (c) 2004-2006 Kip Macy
31  * All rights reserved.
32  *
33  * Redistribution and use in source and binary forms, with or without
34  * modification, are permitted provided that the following conditions
35  * are met:
36  * 1. Redistributions of source code must retain the above copyright
37  *    notice, this list of conditions and the following disclaimer.
38  * 2. Redistributions in binary form must reproduce the above copyright
39  *    notice, this list of conditions and the following disclaimer in the
40  *    documentation and/or other materials provided with the distribution.
41  *
42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52  * SUCH DAMAGE.
53  */
54
55 #include <sys/cdefs.h>
56 __FBSDID("$FreeBSD$");
57
58 #include "opt_inet6.h"
59 #include "opt_inet.h"
60
61 #include <sys/param.h>
62 #include <sys/systm.h>
63 #include <sys/sockio.h>
64 #include <sys/mbuf.h>
65 #include <sys/malloc.h>
66 #include <sys/module.h>
67 #include <sys/kernel.h>
68 #include <sys/socket.h>
69 #include <sys/queue.h>
70 #include <sys/lock.h>
71 #include <sys/sx.h>
72 #include <sys/sysctl.h>
73
74 #include <net/if.h>
75 #include <net/if_arp.h>
76 #include <net/ethernet.h>
77 #include <net/if_dl.h>
78 #include <net/if_media.h>
79
80 #include <net/bpf.h>
81
82 #include <net/if_types.h>
83 #include <net/if_vlan_var.h>
84 #include <net/if.h>
85
86 #include <netinet/in_systm.h>
87 #include <netinet/in.h>
88 #include <netinet/ip.h>
89 #include <netinet/if_ether.h>
90 #include <netinet/tcp.h>
91 #include <netinet/udp.h>
92 #include <netinet/ip6.h>
93
94 #include <vm/vm.h>
95 #include <vm/vm_param.h>
96 #include <vm/vm_kern.h>
97 #include <vm/pmap.h>
98
99 #include <machine/bus.h>
100 #include <machine/resource.h>
101 #include <machine/frame.h>
102 #include <machine/vmparam.h>
103
104 #include <sys/bus.h>
105 #include <sys/rman.h>
106 #include <sys/mutex.h>
107 #include <sys/errno.h>
108 #include <sys/types.h>
109 #include <machine/atomic.h>
110
111 #include <machine/intr_machdep.h>
112
113 #include <machine/in_cksum.h>
114
115 #include <dev/hyperv/include/hyperv.h>
116 #include "hv_net_vsc.h"
117 #include "hv_rndis.h"
118 #include "hv_rndis_filter.h"
119
120
121 /* Short for Hyper-V network interface */
122 #define NETVSC_DEVNAME    "hn"
123
124 /*
125  * It looks like offset 0 of buf is reserved to hold the softc pointer.
126  * The sc pointer is evidently not needed, and is not presently populated.
127  * The packet offset is where the netvsc_packet starts in the buffer.
128  */
129 #define HV_NV_SC_PTR_OFFSET_IN_BUF         0
130 #define HV_NV_PACKET_OFFSET_IN_BUF         16
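
/*
 * Editorial sketch of the layout implied by the two offsets above (this
 * diagram is not part of the original source):
 *
 *	buf + 0		reserved softc pointer slot, currently unused
 *	buf + 16	netvsc_packet
 */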
131
132 /* YYY should get it from the underlying channel */
133 #define HN_TX_DESC_CNT                  512
134
135 #define HN_RNDIS_MSG_LEN                \
136     (sizeof(rndis_msg) +                \
137      RNDIS_VLAN_PPI_SIZE +              \
138      RNDIS_TSO_PPI_SIZE +               \
139      RNDIS_CSUM_PPI_SIZE)
140 #define HN_RNDIS_MSG_BOUNDARY           PAGE_SIZE
141 #define HN_RNDIS_MSG_ALIGN              CACHE_LINE_SIZE
142
143 #define HN_TX_DATA_BOUNDARY             PAGE_SIZE
144 #define HN_TX_DATA_MAXSIZE              IP_MAXPACKET
145 #define HN_TX_DATA_SEGSIZE              PAGE_SIZE
146 #define HN_TX_DATA_SEGCNT_MAX           \
147     (NETVSC_PACKET_MAXPAGE - HV_RF_NUM_TX_RESERVED_PAGE_BUFS)
148
149 #define HN_DIRECT_TX_SIZE_DEF           128
150
151 struct hn_txdesc {
152         SLIST_ENTRY(hn_txdesc) link;
153         struct mbuf     *m;
154         struct hn_softc *sc;
155         int             refs;
156         uint32_t        flags;          /* HN_TXD_FLAG_ */
157         netvsc_packet   netvsc_pkt;     /* XXX to be removed */
158
159         bus_dmamap_t    data_dmap;
160
161         bus_addr_t      rndis_msg_paddr;
162         rndis_msg       *rndis_msg;
163         bus_dmamap_t    rndis_msg_dmap;
164 };
165
166 #define HN_TXD_FLAG_ONLIST      0x1
167 #define HN_TXD_FLAG_DMAMAP      0x2
168
169 /*
170  * A unified mask of all outbound checksum flags is useful,
171  * and it helps avoid unnecessary checksum calculation in
172  * network forwarding scenarios.
173  */
174 #define HV_CSUM_FOR_OUTBOUND                                            \
175     (CSUM_IP|CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP|CSUM_IP_TSO|          \
176     CSUM_IP_ISCSI|CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP|              \
177     CSUM_IP6_TSO|CSUM_IP6_ISCSI)
178
179 /* XXX move to netinet/tcp_lro.h */
180 #define HN_LRO_HIWAT_MAX                                65535
181 #define HN_LRO_HIWAT_DEF                                HN_LRO_HIWAT_MAX
182 /* YYY 2*MTU is a bit rough, but should be good enough. */
183 #define HN_LRO_HIWAT_MTULIM(ifp)                        (2 * (ifp)->if_mtu)
184 #define HN_LRO_HIWAT_ISVALID(sc, hiwat)                 \
185     ((hiwat) >= HN_LRO_HIWAT_MTULIM((sc)->hn_ifp) &&    \
186      (hiwat) <= HN_LRO_HIWAT_MAX)
187
188 /*
189  * Be aware that this sleepable mutex will exhibit WITNESS errors when
190  * certain TCP and ARP code paths are taken.  This appears to be a
191  * well-known condition, as all other drivers checked use a sleeping
192  * mutex to protect their transmit paths.
193  * Also be aware that mutexes do not play well with semaphores, and there
194  * is a conflicting semaphore in a certain channel code path.
195  */
196 #define NV_LOCK_INIT(_sc, _name) \
197             mtx_init(&(_sc)->hn_lock, _name, MTX_NETWORK_LOCK, MTX_DEF)
198 #define NV_LOCK(_sc)            mtx_lock(&(_sc)->hn_lock)
199 #define NV_TRYLOCK(_sc)         mtx_trylock(&(_sc)->hn_lock)
200 #define NV_LOCK_ASSERT(_sc)     mtx_assert(&(_sc)->hn_lock, MA_OWNED)
201 #define NV_UNLOCK(_sc)          mtx_unlock(&(_sc)->hn_lock)
202 #define NV_LOCK_DESTROY(_sc)    mtx_destroy(&(_sc)->hn_lock)
203
204
205 /*
206  * Globals
207  */
208
209 int hv_promisc_mode = 0;    /* normal mode by default */
210
211 /* Trust TCP segment verification on the host side. */
212 static int hn_trust_hosttcp = 1;
213 TUNABLE_INT("dev.hn.trust_hosttcp", &hn_trust_hosttcp);
214
215 #if __FreeBSD_version >= 1100045
216 /* Limit TSO burst size */
217 static int hn_tso_maxlen = 0;
218 TUNABLE_INT("dev.hn.tso_maxlen", &hn_tso_maxlen);
219 #endif
220
221 /* Limit chimney send size */
222 static int hn_tx_chimney_size = 0;
223 TUNABLE_INT("dev.hn.tx_chimney_size", &hn_tx_chimney_size);
224
225 /* Limit the size of packets for direct transmission */
226 static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF;
227 TUNABLE_INT("dev.hn.direct_tx_size", &hn_direct_tx_size);
228
229 /*
230  * Forward declarations
231  */
232 static void hn_stop(hn_softc_t *sc);
233 static void hn_ifinit_locked(hn_softc_t *sc);
234 static void hn_ifinit(void *xsc);
235 static int  hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
236 static int hn_start_locked(struct ifnet *ifp, int len);
237 static void hn_start(struct ifnet *ifp);
238 static void hn_start_txeof(struct ifnet *ifp);
239 static int hn_ifmedia_upd(struct ifnet *ifp);
240 static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr);
241 #ifdef HN_LRO_HIWAT
242 static int hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS);
243 #endif
244 static int hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS);
245 static int hn_check_iplen(const struct mbuf *, int);
246 static int hn_create_tx_ring(struct hn_softc *sc);
247 static void hn_destroy_tx_ring(struct hn_softc *sc);
248 static void hn_start_taskfunc(void *xsc, int pending);
249 static void hn_txeof_taskfunc(void *xsc, int pending);
250
251 static __inline void
252 hn_set_lro_hiwat(struct hn_softc *sc, int hiwat)
253 {
254         sc->hn_lro_hiwat = hiwat;
255 #ifdef HN_LRO_HIWAT
256         sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat;
257 #endif
258 }
259
260 /*
261  * NetVSC: determine the transport protocol type of the given frame.
262  */
263 static uint32_t get_transport_proto_type(struct mbuf *m_head)
264 {
265         uint32_t ret_val = TRANSPORT_TYPE_NOT_IP;
266         uint16_t ether_type = 0;
267         int ether_len = 0;
268         struct ether_vlan_header *eh;
269 #ifdef INET
270         struct ip *iph;
271 #endif
272 #ifdef INET6
273         struct ip6_hdr *ip6;
274 #endif
275
276         eh = mtod(m_head, struct ether_vlan_header*);
277         if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
278                 ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
279                 ether_type = eh->evl_proto;
280         } else {
281                 ether_len = ETHER_HDR_LEN;
282                 ether_type = eh->evl_encap_proto;
283         }
284
285         switch (ntohs(ether_type)) {
286 #ifdef INET6
287         case ETHERTYPE_IPV6:
288                 ip6 = (struct ip6_hdr *)(m_head->m_data + ether_len);
289
290                 if (IPPROTO_TCP == ip6->ip6_nxt) {
291                         ret_val = TRANSPORT_TYPE_IPV6_TCP;
292                 } else if (IPPROTO_UDP == ip6->ip6_nxt) {
293                         ret_val = TRANSPORT_TYPE_IPV6_UDP;
294                 }
295                 break;
296 #endif
297 #ifdef INET
298         case ETHERTYPE_IP:
299                 iph = (struct ip *)(m_head->m_data + ether_len);
300
301                 if (IPPROTO_TCP == iph->ip_p) {
302                         ret_val = TRANSPORT_TYPE_IPV4_TCP;
303                 } else if (IPPROTO_UDP == iph->ip_p) {
304                         ret_val = TRANSPORT_TYPE_IPV4_UDP;
305                 }
306                 break;
307 #endif
308         default:
309                 ret_val = TRANSPORT_TYPE_NOT_IP;
310                 break;
311         }
312
313         return (ret_val);
314 }
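
/*
 * Editorial note: judging from the checks in hn_start_locked() below,
 * e.g. "trans_proto_type & (TYPE_IPV4 << 16)", the TRANSPORT_TYPE_*
 * values appear to pack the network-layer type into the upper 16 bits
 * and the transport-layer type into the lower 16 bits.
 */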
315
316 static int
317 hn_ifmedia_upd(struct ifnet *ifp __unused)
318 {
319
320         return EOPNOTSUPP;
321 }
322
323 static void
324 hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
325 {
326         struct hn_softc *sc = ifp->if_softc;
327
328         ifmr->ifm_status = IFM_AVALID;
329         ifmr->ifm_active = IFM_ETHER;
330
331         if (!sc->hn_carrier) {
332                 ifmr->ifm_active |= IFM_NONE;
333                 return;
334         }
335         ifmr->ifm_status |= IFM_ACTIVE;
336         ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
337 }
338
339 /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
340 static const hv_guid g_net_vsc_device_type = {
341         .data = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
342                 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
343 };
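
/*
 * Editorial note: the first three GUID fields above are stored in
 * little-endian byte order (0xF8615163 becomes 63 51 61 F8, 0xDF3E
 * becomes 3E DF, 0x46c5 becomes c5 46); the last eight bytes are kept
 * in natural order.
 */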
344
345 /*
346  * Standard probe entry point.
347  *
348  */
349 static int
350 netvsc_probe(device_t dev)
351 {
352         const char *p;
353
354         p = vmbus_get_type(dev);
355         if (!memcmp(p, &g_net_vsc_device_type.data, sizeof(hv_guid))) {
356                 device_set_desc(dev, "Synthetic Network Interface");
357                 if (bootverbose)
358                         printf("Netvsc probe... DONE\n");
359
360                 return (BUS_PROBE_DEFAULT);
361         }
362
363         return (ENXIO);
364 }
365
366 /*
367  * Standard attach entry point.
368  *
369  * Called when the driver is loaded.  It allocates needed resources,
370  * and initializes the "hardware" and software.
371  */
372 static int
373 netvsc_attach(device_t dev)
374 {
375         struct hv_device *device_ctx = vmbus_get_devctx(dev);
376         netvsc_device_info device_info;
377         hn_softc_t *sc;
378         int unit = device_get_unit(dev);
379         struct ifnet *ifp = NULL;
380         struct sysctl_oid_list *child;
381         struct sysctl_ctx_list *ctx;
382         int error;
383 #if __FreeBSD_version >= 1100045
384         int tso_maxlen;
385 #endif
386
387         sc = device_get_softc(dev);
388         if (sc == NULL) {
389                 return (ENOMEM);
390         }
391
392         bzero(sc, sizeof(hn_softc_t));
393         sc->hn_unit = unit;
394         sc->hn_dev = dev;
395         sc->hn_lro_hiwat = HN_LRO_HIWAT_DEF;
396         sc->hn_trust_hosttcp = hn_trust_hosttcp;
397         sc->hn_direct_tx_size = hn_direct_tx_size;
398
399         sc->hn_tx_taskq = taskqueue_create_fast("hn_tx", M_WAITOK,
400             taskqueue_thread_enqueue, &sc->hn_tx_taskq);
401         taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx",
402             device_get_nameunit(dev));
403         TASK_INIT(&sc->hn_start_task, 0, hn_start_taskfunc, sc);
404         TASK_INIT(&sc->hn_txeof_task, 0, hn_txeof_taskfunc, sc);
405
406         error = hn_create_tx_ring(sc);
407         if (error)
408                 goto failed;
409
410         NV_LOCK_INIT(sc, "NetVSCLock");
411
412         sc->hn_dev_obj = device_ctx;
413
414         ifp = sc->hn_ifp = sc->arpcom.ac_ifp = if_alloc(IFT_ETHER);
415         ifp->if_softc = sc;
416
417         if_initname(ifp, device_get_name(dev), device_get_unit(dev));
418         ifp->if_dunit = unit;
419         ifp->if_dname = NETVSC_DEVNAME;
420
421         ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
422         ifp->if_ioctl = hn_ioctl;
423         ifp->if_start = hn_start;
424         ifp->if_init = hn_ifinit;
425         /* needed by hv_rf_on_device_add() code */
426         ifp->if_mtu = ETHERMTU;
427         IFQ_SET_MAXLEN(&ifp->if_snd, 512);
428         ifp->if_snd.ifq_drv_maxlen = 511;
429         IFQ_SET_READY(&ifp->if_snd);
430
431         ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts);
432         ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL);
433         ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO);
434         /* XXX ifmedia_set really should do this for us */
435         sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media;
436
437         /*
438          * Tell upper layers that we support full VLAN capability.
439          */
440         ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header);
441         ifp->if_capabilities |=
442             IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO |
443             IFCAP_LRO;
444         ifp->if_capenable |=
445             IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO |
446             IFCAP_LRO;
447         /*
448          * Only enable UDP checksum offloading when the host is Windows
449          * Server 2012 R2 or later; UDP checksum offloading doesn't work
450          * on earlier Windows releases.
451          */
452         if (hv_vmbus_protocal_version >= HV_VMBUS_VERSION_WIN8_1)
453                 ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
454         else
455                 ifp->if_hwassist = CSUM_TCP | CSUM_TSO;
456
457         error = hv_rf_on_device_add(device_ctx, &device_info);
458         if (error)
459                 goto failed;
460
461         if (device_info.link_state == 0) {
462                 sc->hn_carrier = 1;
463         }
464
465 #if defined(INET) || defined(INET6)
466         tcp_lro_init(&sc->hn_lro);
467         /* Driver private LRO settings */
468         sc->hn_lro.ifp = ifp;
469 #ifdef HN_LRO_HIWAT
470         sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat;
471 #endif
472 #endif  /* INET || INET6 */
473
474 #if __FreeBSD_version >= 1100045
475         tso_maxlen = hn_tso_maxlen;
476         if (tso_maxlen <= 0 || tso_maxlen > IP_MAXPACKET)
477                 tso_maxlen = IP_MAXPACKET;
478
479         ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
480         ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
481         ifp->if_hw_tsomax = tso_maxlen -
482             (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
483 #endif
484
485         ether_ifattach(ifp, device_info.mac_addr);
486
487 #if __FreeBSD_version >= 1100045
488         if_printf(ifp, "TSO: %u/%u/%u\n", ifp->if_hw_tsomax,
489             ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize);
490 #endif
491
492         sc->hn_tx_chimney_max = sc->net_dev->send_section_size;
493         sc->hn_tx_chimney_size = sc->hn_tx_chimney_max;
494         if (hn_tx_chimney_size > 0 &&
495             hn_tx_chimney_size < sc->hn_tx_chimney_max)
496                 sc->hn_tx_chimney_size = hn_tx_chimney_size;
497
498         ctx = device_get_sysctl_ctx(dev);
499         child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
500
501         SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_queued",
502             CTLFLAG_RW, &sc->hn_lro.lro_queued, 0, "LRO queued");
503         SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_flushed",
504             CTLFLAG_RW, &sc->hn_lro.lro_flushed, 0, "LRO flushed");
505         SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "lro_tried",
506             CTLFLAG_RW, &sc->hn_lro_tried, "# of LRO tries");
507 #ifdef HN_LRO_HIWAT
508         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_hiwat",
509             CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_hiwat_sysctl,
510             "I", "LRO high watermark");
511 #endif
512         SYSCTL_ADD_INT(ctx, child, OID_AUTO, "trust_hosttcp",
513             CTLFLAG_RW, &sc->hn_trust_hosttcp, 0,
514             "Trust TCP segment verification on the host side "
515             "when csum info is missing");
516         SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_ip",
517             CTLFLAG_RW, &sc->hn_csum_ip, "RXCSUM IP");
518         SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_tcp",
519             CTLFLAG_RW, &sc->hn_csum_tcp, "RXCSUM TCP");
520         SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_trusted",
521             CTLFLAG_RW, &sc->hn_csum_trusted,
522             "# of TCP segments whose host-side csum verification we trusted");
523         SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "small_pkts",
524             CTLFLAG_RW, &sc->hn_small_pkts, "# of small packets received");
525         SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "no_txdescs",
526             CTLFLAG_RW, &sc->hn_no_txdescs, "# of times short of TX descs");
527         SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "send_failed",
528             CTLFLAG_RW, &sc->hn_send_failed, "# of hyper-v send failures");
529         SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "txdma_failed",
530             CTLFLAG_RW, &sc->hn_txdma_failed, "# of TX DMA failures");
531         SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_collapsed",
532             CTLFLAG_RW, &sc->hn_tx_collapsed, "# of collapsed TX mbufs");
533         SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_chimney",
534             CTLFLAG_RW, &sc->hn_tx_chimney, "# of chimney sends");
535         SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt",
536             CTLFLAG_RD, &sc->hn_txdesc_cnt, 0, "# of total TX descs");
537         SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail",
538             CTLFLAG_RD, &sc->hn_txdesc_avail, 0, "# of available TX descs");
539         SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max",
540             CTLFLAG_RD, &sc->hn_tx_chimney_max, 0,
541             "Chimney send packet size upper boundary");
542         SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
543             CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_tx_chimney_size_sysctl,
544             "I", "Chimney send packet size limit");
545         SYSCTL_ADD_INT(ctx, child, OID_AUTO, "direct_tx_size",
546             CTLFLAG_RW, &sc->hn_direct_tx_size, 0,
547             "Maximum packet size for direct transmission");
548
549         if (unit == 0) {
550                 struct sysctl_ctx_list *dc_ctx;
551                 struct sysctl_oid_list *dc_child;
552                 devclass_t dc;
553
554                 /*
555                  * Add sysctl nodes for devclass
556                  */
557                 dc = device_get_devclass(dev);
558                 dc_ctx = devclass_get_sysctl_ctx(dc);
559                 dc_child = SYSCTL_CHILDREN(devclass_get_sysctl_tree(dc));
560
561                 SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "trust_hosttcp",
562                     CTLFLAG_RD, &hn_trust_hosttcp, 0,
563                     "Trust TCP segment verification on the host side "
564                     "when csum info is missing (global setting)");
565                 SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "tx_chimney_size",
566                     CTLFLAG_RD, &hn_tx_chimney_size, 0,
567                     "Chimney send packet size limit");
568 #if __FreeBSD_version >= 1100045
569                 SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "tso_maxlen",
570                     CTLFLAG_RD, &hn_tso_maxlen, 0, "TSO burst limit");
571 #endif
572                 SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "direct_tx_size",
573                     CTLFLAG_RD, &hn_direct_tx_size, 0,
574                     "Maximum packet size for direct transmission");
575         }
576
577         return (0);
578 failed:
579         hn_destroy_tx_ring(sc);
580         if (ifp != NULL)
581                 if_free(ifp);
582         return (error);
583 }
584
585 /*
586  * Standard detach entry point
587  */
588 static int
589 netvsc_detach(device_t dev)
590 {
591         struct hn_softc *sc = device_get_softc(dev);
592         struct hv_device *hv_device = vmbus_get_devctx(dev); 
593
594         if (bootverbose)
595                 printf("netvsc_detach\n");
596
597         /*
598          * XXXKYS:  Need to clean up all our
599          * driver state; this is the driver
600          * unloading.
601          */
602
603         /*
604          * XXXKYS:  Need to stop outgoing traffic and unregister
605          * the netdevice.
606          */
607
608         hv_rf_on_device_remove(hv_device, HV_RF_NV_DESTROY_CHANNEL);
609
610         taskqueue_drain(sc->hn_tx_taskq, &sc->hn_start_task);
611         taskqueue_drain(sc->hn_tx_taskq, &sc->hn_txeof_task);
612         taskqueue_free(sc->hn_tx_taskq);
613
614         ifmedia_removeall(&sc->hn_media);
615 #if defined(INET) || defined(INET6)
616         tcp_lro_free(&sc->hn_lro);
617 #endif
618         hn_destroy_tx_ring(sc);
619
620         return (0);
621 }
622
623 /*
624  * Standard shutdown entry point
625  */
626 static int
627 netvsc_shutdown(device_t dev)
628 {
629         return (0);
630 }
631
632 static __inline int
633 hn_txdesc_dmamap_load(struct hn_softc *sc, struct hn_txdesc *txd,
634     struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs)
635 {
636         struct mbuf *m = *m_head;
637         int error;
638
639         error = bus_dmamap_load_mbuf_sg(sc->hn_tx_data_dtag, txd->data_dmap,
640             m, segs, nsegs, BUS_DMA_NOWAIT);
641         if (error == EFBIG) {
642                 struct mbuf *m_new;
643
644                 m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX);
645                 if (m_new == NULL)
646                         return ENOBUFS;
647                 else
648                         *m_head = m = m_new;
649                 sc->hn_tx_collapsed++;
650
651                 error = bus_dmamap_load_mbuf_sg(sc->hn_tx_data_dtag,
652                     txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT);
653         }
654         if (!error) {
655                 bus_dmamap_sync(sc->hn_tx_data_dtag, txd->data_dmap,
656                     BUS_DMASYNC_PREWRITE);
657                 txd->flags |= HN_TXD_FLAG_DMAMAP;
658         }
659         return error;
660 }
661
662 static __inline void
663 hn_txdesc_dmamap_unload(struct hn_softc *sc, struct hn_txdesc *txd)
664 {
665
666         if (txd->flags & HN_TXD_FLAG_DMAMAP) {
667                 bus_dmamap_sync(sc->hn_tx_data_dtag,
668                     txd->data_dmap, BUS_DMASYNC_POSTWRITE);
669                 bus_dmamap_unload(sc->hn_tx_data_dtag,
670                     txd->data_dmap);
671                 txd->flags &= ~HN_TXD_FLAG_DMAMAP;
672         }
673 }
674
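
/*
 * Drop a reference to the txd.  On the last reference, unload the DMA
 * map, free the attached mbuf (if any), and return the descriptor to
 * the free list.  Returns 1 iff the descriptor was freed this way,
 * 0 if references remain.
 */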
675 static __inline int
676 hn_txdesc_put(struct hn_softc *sc, struct hn_txdesc *txd)
677 {
678
679         KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0,
680             ("put an onlist txd %#x", txd->flags));
681
682         KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs));
683         if (atomic_fetchadd_int(&txd->refs, -1) != 1)
684                 return 0;
685
686         hn_txdesc_dmamap_unload(sc, txd);
687         if (txd->m != NULL) {
688                 m_freem(txd->m);
689                 txd->m = NULL;
690         }
691
692         txd->flags |= HN_TXD_FLAG_ONLIST;
693
694         mtx_lock_spin(&sc->hn_txlist_spin);
695         KASSERT(sc->hn_txdesc_avail >= 0 &&
696             sc->hn_txdesc_avail < sc->hn_txdesc_cnt,
697             ("txdesc_put: invalid txd avail %d", sc->hn_txdesc_avail));
698         sc->hn_txdesc_avail++;
699         SLIST_INSERT_HEAD(&sc->hn_txlist, txd, link);
700         mtx_unlock_spin(&sc->hn_txlist_spin);
701
702         return 1;
703 }
704
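
/*
 * Pop a descriptor off the free list and give it a single reference;
 * returns NULL if the free list is empty.
 */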
705 static __inline struct hn_txdesc *
706 hn_txdesc_get(struct hn_softc *sc)
707 {
708         struct hn_txdesc *txd;
709
710         mtx_lock_spin(&sc->hn_txlist_spin);
711         txd = SLIST_FIRST(&sc->hn_txlist);
712         if (txd != NULL) {
713                 KASSERT(sc->hn_txdesc_avail > 0,
714                     ("txdesc_get: invalid txd avail %d", sc->hn_txdesc_avail));
715                 sc->hn_txdesc_avail--;
716                 SLIST_REMOVE_HEAD(&sc->hn_txlist, link);
717         }
718         mtx_unlock_spin(&sc->hn_txlist_spin);
719
720         if (txd != NULL) {
721                 KASSERT(txd->m == NULL && txd->refs == 0 &&
722                     (txd->flags & HN_TXD_FLAG_ONLIST), ("invalid txd"));
723                 txd->flags &= ~HN_TXD_FLAG_ONLIST;
724                 txd->refs = 1;
725         }
726         return txd;
727 }
728
729 static __inline void
730 hn_txdesc_hold(struct hn_txdesc *txd)
731 {
732
733         /* 0->1 transition will never work */
734         KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs));
735         atomic_add_int(&txd->refs, 1);
736 }
737
738 /*
739  * Send completion processing
740  *
741  * Note:  It looks like offset 0 of buf is reserved to hold the softc
742  * pointer.  The sc pointer is not currently needed in this function, and
743  * it is not presently populated by the TX function.
744  */
745 void
746 netvsc_xmit_completion(void *context)
747 {
748         netvsc_packet *packet = context;
749         struct hn_txdesc *txd;
750         struct hn_softc *sc;
751
752         txd = (struct hn_txdesc *)(uintptr_t)
753             packet->compl.send.send_completion_tid;
754
755         sc = txd->sc;
756         sc->hn_txeof = 1;
757         hn_txdesc_put(sc, txd);
758 }
759
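
/*
 * Channel rollup: if any transmit completed since the last rollup
 * (hn_txeof is set by netvsc_xmit_completion() above), restart the
 * transmit path via hn_start_txeof().
 */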
760 void
761 netvsc_channel_rollup(struct hv_device *device_ctx)
762 {
763         struct hn_softc *sc = device_get_softc(device_ctx->device);
764
765         if (!sc->hn_txeof)
766                 return;
767
768         sc->hn_txeof = 0;
769         hn_start_txeof(sc->hn_ifp);
770 }
771
772 /*
773  * Start a transmit of one or more packets
774  */
775 static int
776 hn_start_locked(struct ifnet *ifp, int len)
777 {
778         hn_softc_t *sc = ifp->if_softc;
779         struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
780         netvsc_dev *net_dev = sc->net_dev;
781         struct ether_vlan_header *eh;
782         rndis_msg *rndis_mesg;
783         rndis_packet *rndis_pkt;
784         rndis_per_packet_info *rppi;
785         ndis_8021q_info *rppi_vlan_info;
786         rndis_tcp_ip_csum_info *csum_info;
787         rndis_tcp_tso_info *tso_info;   
788         int ether_len;
789         uint32_t rndis_msg_size = 0;
790         uint32_t trans_proto_type;
791
792         if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
793             IFF_DRV_RUNNING)
794                 return 0;
795
796         while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
797                 bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX];
798                 int error, nsegs, i, send_failed = 0;
799                 struct hn_txdesc *txd;
800                 netvsc_packet *packet;
801                 struct mbuf *m_head;
802
803                 IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head);
804                 if (m_head == NULL)
805                         break;
806
807                 if (len > 0 && m_head->m_pkthdr.len > len) {
808                         /*
809                          * Sending this packet could be time consuming; let
810                          * the caller dispatch it (and any follow-up
811                          * packets) to the TX taskqueue.
812                          */
813                         IF_PREPEND(&ifp->if_snd, m_head);
814                         return 1;
815                 }
816
817                 txd = hn_txdesc_get(sc);
818                 if (txd == NULL) {
819                         sc->hn_no_txdescs++;
820                         IF_PREPEND(&ifp->if_snd, m_head);
821                         atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
822                         break;
823                 }
824
825                 packet = &txd->netvsc_pkt;
826                 packet->is_data_pkt = TRUE;
827                 /* Initialize it from the mbuf */
828                 packet->tot_data_buf_len = m_head->m_pkthdr.len;
829
830                 /*
831                  * extension points to the area reserved for the
832                  * rndis_filter_packet, which is placed just after
833                  * the netvsc_packet (and rppi struct, if present;
834                  * length is updated later).
835                  */
836                 rndis_mesg = txd->rndis_msg;
837                 /* XXX not necessary */
838                 memset(rndis_mesg, 0, HN_RNDIS_MSG_LEN);
839                 rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG;
840
841                 rndis_pkt = &rndis_mesg->msg.packet;
842                 rndis_pkt->data_offset = sizeof(rndis_packet);
843                 rndis_pkt->data_length = packet->tot_data_buf_len;
844                 rndis_pkt->per_pkt_info_offset = sizeof(rndis_packet);
845
846                 rndis_msg_size = RNDIS_MESSAGE_SIZE(rndis_packet);
847
848                 /*
849                  * If the Hyper-V infrastructure needs to embed a VLAN tag,
850                  * initialize netvsc_packet and rppi struct values as needed.
851                  */
852                 if (m_head->m_flags & M_VLANTAG) {
853                         /*
854                          * Set up some additional fields so the Hyper-V
855                          * infrastructure will stuff the VLAN tag into the frame.
856                          */
857                         rndis_msg_size += RNDIS_VLAN_PPI_SIZE;
858
859                         rppi = hv_set_rppi_data(rndis_mesg, RNDIS_VLAN_PPI_SIZE,
860                             ieee_8021q_info);
861                 
862                         /* VLAN info immediately follows rppi struct */
863                         rppi_vlan_info = (ndis_8021q_info *)((char*)rppi + 
864                             rppi->per_packet_info_offset);
865                         /* FreeBSD does not support CFI or priority */
866                         rppi_vlan_info->u1.s1.vlan_id =
867                             m_head->m_pkthdr.ether_vtag & 0xfff;
868                 }
869
870                 /* Check outbound csum flags only; ignore inbound flags. */
871                 if (0 == (m_head->m_pkthdr.csum_flags & HV_CSUM_FOR_OUTBOUND)) {
872                         goto pre_send;
873                 }
874
875                 eh = mtod(m_head, struct ether_vlan_header*);
876                 if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
877                         ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
878                 } else {
879                         ether_len = ETHER_HDR_LEN;
880                 }
881
882                 trans_proto_type = get_transport_proto_type(m_head);
883                 if (TRANSPORT_TYPE_NOT_IP == trans_proto_type) {
884                         goto pre_send;
885                 }
886
887                 /*
888                  * A TSO packet does not need send-side checksum
889                  * offload setup.
890                  */
891                 if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
892                         goto do_tso;
893                 }
894
895                 /* setup checksum offload */
896                 rndis_msg_size += RNDIS_CSUM_PPI_SIZE;
897                 rppi = hv_set_rppi_data(rndis_mesg, RNDIS_CSUM_PPI_SIZE,
898                     tcpip_chksum_info);
899                 csum_info = (rndis_tcp_ip_csum_info *)((char*)rppi +
900                     rppi->per_packet_info_offset);
901
902                 if (trans_proto_type & (TYPE_IPV4 << 16)) {
903                         csum_info->xmit.is_ipv4 = 1;
904                 } else {
905                         csum_info->xmit.is_ipv6 = 1;
906                 }
907
908                 if (trans_proto_type & TYPE_TCP) {
909                         csum_info->xmit.tcp_csum = 1;
910                         csum_info->xmit.tcp_header_offset = 0;
911                 } else if (trans_proto_type & TYPE_UDP) {
912                         csum_info->xmit.udp_csum = 1;
913                 }
914
915                 goto pre_send;
916
917 do_tso:
918                 /* setup TCP segmentation offload */
919                 rndis_msg_size += RNDIS_TSO_PPI_SIZE;
920                 rppi = hv_set_rppi_data(rndis_mesg, RNDIS_TSO_PPI_SIZE,
921                     tcp_large_send_info);
922                 
923                 tso_info = (rndis_tcp_tso_info *)((char *)rppi +
924                     rppi->per_packet_info_offset);
925                 tso_info->lso_v2_xmit.type =
926                     RNDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
927                 
928 #ifdef INET
929                 if (trans_proto_type & (TYPE_IPV4 << 16)) {
930                         struct ip *ip =
931                             (struct ip *)(m_head->m_data + ether_len);
932                         unsigned long iph_len = ip->ip_hl << 2;
933                         struct tcphdr *th =
934                             (struct tcphdr *)((caddr_t)ip + iph_len);
935                 
936                         tso_info->lso_v2_xmit.ip_version =
937                             RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV4;
938                         ip->ip_len = 0;
939                         ip->ip_sum = 0;
940                 
941                         th->th_sum = in_pseudo(ip->ip_src.s_addr,
942                             ip->ip_dst.s_addr,
943                             htons(IPPROTO_TCP));
944                 }
945 #endif
946 #if defined(INET6) && defined(INET)
947                 else
948 #endif
949 #ifdef INET6
950                 {
951                         struct ip6_hdr *ip6 =
952                             (struct ip6_hdr *)(m_head->m_data + ether_len);
953                         struct tcphdr *th = (struct tcphdr *)(ip6 + 1);
954
955                         tso_info->lso_v2_xmit.ip_version =
956                             RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV6;
957                         ip6->ip6_plen = 0;
958                         th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
959                 }
960 #endif
961                 tso_info->lso_v2_xmit.tcp_header_offset = 0;
962                 tso_info->lso_v2_xmit.mss = m_head->m_pkthdr.tso_segsz;
963
964 pre_send:
965                 rndis_mesg->msg_len = packet->tot_data_buf_len + rndis_msg_size;
966                 packet->tot_data_buf_len = rndis_mesg->msg_len;
967
968                 /* send packet with send buffer */
969                 if (packet->tot_data_buf_len < sc->hn_tx_chimney_size) {
970                         uint32_t send_buf_section_idx;
971
972                         send_buf_section_idx =
973                             hv_nv_get_next_send_section(net_dev);
974                         if (send_buf_section_idx !=
975                             NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) {
976                                 uint8_t *dest = ((uint8_t *)net_dev->send_buf +
977                                     (send_buf_section_idx *
978                                      net_dev->send_section_size));
979
980                                 memcpy(dest, rndis_mesg, rndis_msg_size);
981                                 dest += rndis_msg_size;
982
983                                 m_copydata(m_head, 0, m_head->m_pkthdr.len,
984                                     dest);
985
986                                 packet->send_buf_section_idx =
987                                     send_buf_section_idx;
988                                 packet->send_buf_section_size =
989                                     packet->tot_data_buf_len;
990                                 packet->page_buf_count = 0;
991                                 sc->hn_tx_chimney++;
992                                 goto do_send;
993                         }
994                 }
995
996                 error = hn_txdesc_dmamap_load(sc, txd, &m_head, segs, &nsegs);
997                 if (error) {
998                         int freed;
999
1000                         /*
1001                          * This mbuf is not linked w/ the txd yet, so free
1002                          * it now.
1003                          */
1004                         m_freem(m_head);
1005                         freed = hn_txdesc_put(sc, txd);
1006                         KASSERT(freed != 0,
1007                             ("fail to free txd upon txdma error"));
1008
1009                         sc->hn_txdma_failed++;
1010                         if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
1011                         continue;
1012                 }
1013
1014                 packet->page_buf_count = nsegs +
1015                     HV_RF_NUM_TX_RESERVED_PAGE_BUFS;
1016
1017                 /* send packet with page buffer */
1018                 packet->page_buffers[0].pfn = atop(txd->rndis_msg_paddr);
1019                 packet->page_buffers[0].offset =
1020                     txd->rndis_msg_paddr & PAGE_MASK;
1021                 packet->page_buffers[0].length = rndis_msg_size;
1022
1023                 /*
1024                  * Fill the page buffers with mbuf info starting at index
1025                  * HV_RF_NUM_TX_RESERVED_PAGE_BUFS.
1026                  */
1027                 for (i = 0; i < nsegs; ++i) {
1028                         hv_vmbus_page_buffer *pb = &packet->page_buffers[
1029                             i + HV_RF_NUM_TX_RESERVED_PAGE_BUFS];
1030
1031                         pb->pfn = atop(segs[i].ds_addr);
1032                         pb->offset = segs[i].ds_addr & PAGE_MASK;
1033                         pb->length = segs[i].ds_len;
1034                 }
1035
1036                 packet->send_buf_section_idx = 
1037                     NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX;
1038                 packet->send_buf_section_size = 0;
1039
1040 do_send:
1041                 txd->m = m_head;
1042
1043                 /* Set the completion routine */
1044                 packet->compl.send.on_send_completion = netvsc_xmit_completion;
1045                 packet->compl.send.send_completion_context = packet;
1046                 packet->compl.send.send_completion_tid =
1047                     (uint64_t)(uintptr_t)txd;
1048
1049 again:
1050                 /*
1051                  * Make sure that txd is not freed before ETHER_BPF_MTAP.
1052                  */
1053                 hn_txdesc_hold(txd);
1054                 error = hv_nv_on_send(device_ctx, packet);
1055                 if (!error) {
1056                         ETHER_BPF_MTAP(ifp, m_head);
1057                         if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
1058                 }
1059                 hn_txdesc_put(sc, txd);
1060
1061                 if (__predict_false(error)) {
1062                         int freed;
1063
1064                         /*
1065                          * This should "really rarely" happen.
1066                          *
1067                          * XXX Too many RX to be acked or too many sideband
1068                          * commands to run?  Ask netvsc_channel_rollup()
1069                          * to kick start later.
1070                          */
1071                         sc->hn_txeof = 1;
1072                         if (!send_failed) {
1073                                 sc->hn_send_failed++;
1074                                 send_failed = 1;
1075                                 /*
1076                                  * Try sending again after setting hn_txeof,
1077                                  * in case we missed the last
1078                                  * netvsc_channel_rollup().
1079                                  */
1080                                 goto again;
1081                         }
1082                         if_printf(ifp, "send failed\n");
1083
1084                         /*
1085                          * This mbuf will be prepended; don't free it in
1086                          * hn_txdesc_put().  Only unload it from the DMA
1087                          * map there, if it was loaded.
1088                          */
1089                         txd->m = NULL;
1090                         freed = hn_txdesc_put(sc, txd);
1091                         KASSERT(freed != 0,
1092                             ("fail to free txd upon send error"));
1093
1094                         sc->hn_send_failed++;
1095                         IF_PREPEND(&ifp->if_snd, m_head);
1096                         atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1097                         break;
1098                 }
1099         }
1100         return 0;
1101 }
1102
1103 /*
1104  * Link up/down notification
1105  */
1106 void
1107 netvsc_linkstatus_callback(struct hv_device *device_obj, uint32_t status)
1108 {
1109         hn_softc_t *sc = device_get_softc(device_obj->device);
1110
1111         if (sc == NULL) {
1112                 return;
1113         }
1114
1115         if (status == 1) {
1116                 sc->hn_carrier = 1;
1117         } else {
1118                 sc->hn_carrier = 0;
1119         }
1120 }
1121
1122 /*
1123  * Append the specified data to the indicated mbuf chain;
1124  * extend the mbuf chain if the new data does not fit in
1125  * existing space.
1126  *
1127  * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c.
1128  * There should be an equivalent in the kernel mbuf code,
1129  * but there does not appear to be one yet.
1130  *
1131  * Differs from m_append() in that additional mbufs are
1132  * allocated with cluster size MJUMPAGESIZE, and filled
1133  * accordingly.
1134  *
1135  * Return 1 if able to complete the job; otherwise 0.
1136  */
1137 static int
1138 hv_m_append(struct mbuf *m0, int len, c_caddr_t cp)
1139 {
1140         struct mbuf *m, *n;
1141         int remainder, space;
1142
1143         for (m = m0; m->m_next != NULL; m = m->m_next)
1144                 ;
1145         remainder = len;
1146         space = M_TRAILINGSPACE(m);
1147         if (space > 0) {
1148                 /*
1149                  * Copy into available space.
1150                  */
1151                 if (space > remainder)
1152                         space = remainder;
1153                 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
1154                 m->m_len += space;
1155                 cp += space;
1156                 remainder -= space;
1157         }
1158         while (remainder > 0) {
1159                 /*
1160                  * Allocate a new mbuf; could check space
1161                  * and allocate a cluster instead.
1162                  */
1163                 n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE);
1164                 if (n == NULL)
1165                         break;
1166                 n->m_len = min(MJUMPAGESIZE, remainder);
1167                 bcopy(cp, mtod(n, caddr_t), n->m_len);
1168                 cp += n->m_len;
1169                 remainder -= n->m_len;
1170                 m->m_next = n;
1171                 m = n;
1172         }
1173         if (m0->m_flags & M_PKTHDR)
1174                 m0->m_pkthdr.len += len - remainder;
1175
1176         return (remainder == 0);
1177 }
1178
1179
1180 /*
1181  * Called when we receive a data packet from the "wire" on the
1182  * specified device
1183  *
1184  * Note:  This is no longer used as a callback
1185  */
1186 int
1187 netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet,
1188     rndis_tcp_ip_csum_info *csum_info)
1189 {
1190         hn_softc_t *sc = (hn_softc_t *)device_get_softc(device_ctx->device);
1191         struct mbuf *m_new;
1192         struct ifnet *ifp;
1193         device_t dev = device_ctx->device;
1194         int size, do_lro = 0;
1195
1196         if (sc == NULL) {
1197                 return (0); /* TODO: KYS how can this be! */
1198         }
1199
1200         /* sc->hn_ifp and sc->arpcom.ac_ifp are the same pointer. */
1201         ifp = sc->hn_ifp;
1202
1203
1204         if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
1205                 return (0);
1206         }
1207
1208         /*
1209          * Bail out if packet contains more data than configured MTU.
1210          */
1211         if (packet->tot_data_buf_len > (ifp->if_mtu + ETHER_HDR_LEN)) {
1212                 return (0);
1213         } else if (packet->tot_data_buf_len <= MHLEN) {
1214                 m_new = m_gethdr(M_NOWAIT, MT_DATA);
1215                 if (m_new == NULL)
1216                         return (0);
1217                 memcpy(mtod(m_new, void *), packet->data,
1218                     packet->tot_data_buf_len);
1219                 m_new->m_pkthdr.len = m_new->m_len = packet->tot_data_buf_len;
1220                 sc->hn_small_pkts++;
1221         } else {
1222                 /*
1223                  * Get an mbuf with a cluster.  For packets 2K or less,
1224                  * get a standard 2K cluster.  For anything larger, get a
1225                  * 4K cluster.  Any buffers larger than 4K can cause problems
1226                  * if looped around to the Hyper-V TX channel, so avoid them.
1227                  */
1228                 size = MCLBYTES;
1229                 if (packet->tot_data_buf_len > MCLBYTES) {
1230                         /* 4096 */
1231                         size = MJUMPAGESIZE;
1232                 }
1233
1234                 m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
1235                 if (m_new == NULL) {
1236                         device_printf(dev, "alloc mbuf failed.\n");
1237                         return (0);
1238                 }
1239
1240                 hv_m_append(m_new, packet->tot_data_buf_len, packet->data);
1241         }
1242         m_new->m_pkthdr.rcvif = ifp;
1243
1244         /* receive side checksum offload */
1245         if (NULL != csum_info) {
1246                 /* IP csum offload */
1247                 if (csum_info->receive.ip_csum_succeeded) {
1248                         m_new->m_pkthdr.csum_flags |=
1249                             (CSUM_IP_CHECKED | CSUM_IP_VALID);
1250                         sc->hn_csum_ip++;
1251                 }
1252
1253                 /* TCP csum offload */
1254                 if (csum_info->receive.tcp_csum_succeeded) {
1255                         m_new->m_pkthdr.csum_flags |=
1256                             (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1257                         m_new->m_pkthdr.csum_data = 0xffff;
1258                         sc->hn_csum_tcp++;
1259                 }
1260
1261                 if (csum_info->receive.ip_csum_succeeded &&
1262                     csum_info->receive.tcp_csum_succeeded)
1263                         do_lro = 1;
1264         } else {
1265                 const struct ether_header *eh;
1266                 uint16_t etype;
1267                 int hoff;
1268
1269                 hoff = sizeof(*eh);
1270                 if (m_new->m_len < hoff)
1271                         goto skip;
1272                 eh = mtod(m_new, struct ether_header *);
1273                 etype = ntohs(eh->ether_type);
1274                 if (etype == ETHERTYPE_VLAN) {
1275                         const struct ether_vlan_header *evl;
1276
1277                         hoff = sizeof(*evl);
1278                         if (m_new->m_len < hoff)
1279                                 goto skip;
1280                         evl = mtod(m_new, struct ether_vlan_header *);
1281                         etype = ntohs(evl->evl_proto);
1282                 }
1283
1284                 if (etype == ETHERTYPE_IP) {
1285                         int pr;
1286
1287                         pr = hn_check_iplen(m_new, hoff);
1288                         if (pr == IPPROTO_TCP) {
1289                                 if (sc->hn_trust_hosttcp) {
1290                                         sc->hn_csum_trusted++;
1291                                         m_new->m_pkthdr.csum_flags |=
1292                                            (CSUM_IP_CHECKED | CSUM_IP_VALID |
1293                                             CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
1294                                         m_new->m_pkthdr.csum_data = 0xffff;
1295                                 }
1296                                 /* Rely on SW csum verification though... */
1297                                 do_lro = 1;
1298                         }
1299                 }
1300         }
1301 skip:
1302         if ((packet->vlan_tci != 0) &&
1303             (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0) {
1304                 m_new->m_pkthdr.ether_vtag = packet->vlan_tci;
1305                 m_new->m_flags |= M_VLANTAG;
1306         }
1307
1308         /*
1309          * Note:  Moved RX completion back to hv_nv_on_receive() so all
1310          * messages (not just data messages) will trigger a response.
1311          */
1312
1313         if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
1314
1315         if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
1316 #if defined(INET) || defined(INET6)
1317                 struct lro_ctrl *lro = &sc->hn_lro;
1318
1319                 if (lro->lro_cnt) {
1320                         sc->hn_lro_tried++;
1321                         if (tcp_lro_rx(lro, m_new, 0) == 0) {
1322                                 /* DONE! */
1323                                 return 0;
1324                         }
1325                 }
1326 #endif
1327         }
1328
1329         /* We're not holding the lock here, so don't release it */
1330         (*ifp->if_input)(ifp, m_new);
1331
1332         return (0);
1333 }
1334
1335 void
1336 netvsc_recv_rollup(struct hv_device *device_ctx)
1337 {
1338 #if defined(INET) || defined(INET6)
1339         hn_softc_t *sc = device_get_softc(device_ctx->device);
1340         struct lro_ctrl *lro = &sc->hn_lro;
1341         struct lro_entry *queued;
1342
1343         while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) {
1344                 SLIST_REMOVE_HEAD(&lro->lro_active, next);
1345                 tcp_lro_flush(lro, queued);
1346         }
1347 #endif
1348 }
1349
1350 /*
1351  * Rules for using sc->temp_unusable:
1352  * 1.  sc->temp_unusable can only be read or written while holding NV_LOCK()
1353  * 2.  code reading sc->temp_unusable under NV_LOCK(), and finding 
1354  *     sc->temp_unusable set, must release NV_LOCK() and exit
1355  * 3.  to retain exclusive control of the interface,
1356  *     sc->temp_unusable must be set by code before releasing NV_LOCK()
1357  * 4.  only code setting sc->temp_unusable can clear sc->temp_unusable
1358  * 5.  code setting sc->temp_unusable must eventually clear sc->temp_unusable
1359  */
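
/*
 * Editorial sketch (not in the original source): the acquire/release
 * pattern the rules above imply, open-coded by hn_ioctl() below.  The
 * helper names hn_acquire_temp() and hn_release_temp() are hypothetical.
 */
static __inline int
hn_acquire_temp(hn_softc_t *sc)
{
	int retry_cnt = 500;

	do {
		NV_LOCK(sc);
		if (!sc->temp_unusable) {
			sc->temp_unusable = TRUE;	/* rule 3 */
			retry_cnt = -1;			/* acquired */
		}
		NV_UNLOCK(sc);				/* rule 1 */
		if (retry_cnt > 0) {
			retry_cnt--;
			DELAY(5 * 1000);		/* rule 2: back off, retry */
		}
	} while (retry_cnt > 0);

	/* retry_cnt == 0 means we timed out without acquiring. */
	return (retry_cnt != 0);
}

static __inline void
hn_release_temp(hn_softc_t *sc)
{
	NV_LOCK(sc);
	sc->temp_unusable = FALSE;			/* rules 4 and 5 */
	NV_UNLOCK(sc);
}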
1360
1361 /*
1362  * Standard ioctl entry point.  Called when the user wants to configure
1363  * the interface.
1364  */
1365 static int
1366 hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1367 {
1368         hn_softc_t *sc = ifp->if_softc;
1369         struct ifreq *ifr = (struct ifreq *)data;
1370 #ifdef INET
1371         struct ifaddr *ifa = (struct ifaddr *)data;
1372 #endif
1373         netvsc_device_info device_info;
1374         struct hv_device *hn_dev;
1375         int mask, error = 0;
1376         int retry_cnt = 500;
1377         
1378         switch(cmd) {
1379
1380         case SIOCSIFADDR:
1381 #ifdef INET
1382                 if (ifa->ifa_addr->sa_family == AF_INET) {
1383                         ifp->if_flags |= IFF_UP;
1384                         if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
1385                                 hn_ifinit(sc);
1386                         arp_ifinit(ifp, ifa);
1387                 } else
1388 #endif
1389                 error = ether_ioctl(ifp, cmd, data);
1390                 break;
1391         case SIOCSIFMTU:
1392                 hn_dev = vmbus_get_devctx(sc->hn_dev);
1393
1394                 /* Check MTU value change */
1395                 if (ifp->if_mtu == ifr->ifr_mtu)
1396                         break;
1397
1398                 if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) {
1399                         error = EINVAL;
1400                         break;
1401                 }
1402
1403                 /* Obtain and record requested MTU */
1404                 ifp->if_mtu = ifr->ifr_mtu;
1405                 /*
1406                  * Make sure that LRO high watermark is still valid,
1407                  * after MTU change (the 2*MTU limit).
1408                  */
1409                 if (!HN_LRO_HIWAT_ISVALID(sc, sc->hn_lro_hiwat))
1410                         hn_set_lro_hiwat(sc, HN_LRO_HIWAT_MTULIM(ifp));
1411
1412                 do {
1413                         NV_LOCK(sc);
1414                         if (!sc->temp_unusable) {
1415                                 sc->temp_unusable = TRUE;
1416                                 retry_cnt = -1;
1417                         }
1418                         NV_UNLOCK(sc);
1419                         if (retry_cnt > 0) {
1420                                 retry_cnt--;
1421                                 DELAY(5 * 1000);
1422                         }
1423                 } while (retry_cnt > 0);
1424
1425                 if (retry_cnt == 0) {
1426                         error = EINVAL;
1427                         break;
1428                 }
1429
1430                 /* We must remove and add back the device to cause the new
1431                  * MTU to take effect.  This includes tearing down, but not
1432                  * deleting the channel, then bringing it back up.
1433                  */
1434                 error = hv_rf_on_device_remove(hn_dev, HV_RF_NV_RETAIN_CHANNEL);
1435                 if (error) {
1436                         NV_LOCK(sc);
1437                         sc->temp_unusable = FALSE;
1438                         NV_UNLOCK(sc);
1439                         break;
1440                 }
1441                 error = hv_rf_on_device_add(hn_dev, &device_info);
1442                 if (error) {
1443                         NV_LOCK(sc);
1444                         sc->temp_unusable = FALSE;
1445                         NV_UNLOCK(sc);
1446                         break;
1447                 }
1448
1449                 sc->hn_tx_chimney_max = sc->net_dev->send_section_size;
1450                 if (sc->hn_tx_chimney_size > sc->hn_tx_chimney_max)
1451                         sc->hn_tx_chimney_size = sc->hn_tx_chimney_max;
1452                 hn_ifinit_locked(sc);
1453
1454                 NV_LOCK(sc);
1455                 sc->temp_unusable = FALSE;
1456                 NV_UNLOCK(sc);
1457                 break;
1458         case SIOCSIFFLAGS:
1459                 do {
1460                         NV_LOCK(sc);
1461                         if (!sc->temp_unusable) {
1462                                 sc->temp_unusable = TRUE;
1463                                 retry_cnt = -1;
1464                         }
1465                         NV_UNLOCK(sc);
1466                         if (retry_cnt > 0) {
1467                                 retry_cnt--;
1468                                 DELAY(5 * 1000);
1469                         }
1470                 } while (retry_cnt > 0);
1471
1472                 if (retry_cnt == 0) {
1473                         error = EINVAL;
1474                         break;
1475                 }
1476
1477                 if (ifp->if_flags & IFF_UP) {
1478                         /*
1479                          * If only the state of the PROMISC flag
1480                          * changed, just issue a 'set promisc mode'
1481                          * request instead of reinitializing the
1482                          * entire NIC, since a full re-init is a
1483                          * considerably more expensive operation
1484                          * that can take a second or two.
1485                          */
1486 #ifdef notyet
1487                         /* Fixme:  Promiscuous mode? */
1488                         if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
1489                             ifp->if_flags & IFF_PROMISC &&
1490                             !(sc->hn_if_flags & IFF_PROMISC)) {
1491                                 /* do something here for Hyper-V */
1492                         } else if (ifp->if_drv_flags & IFF_DRV_RUNNING &&
1493                             !(ifp->if_flags & IFF_PROMISC) &&
1494                             sc->hn_if_flags & IFF_PROMISC) {
1495                                 /* do something here for Hyper-V */
1496                         } else
1497 #endif
1498                                 hn_ifinit_locked(sc);
1499                 } else {
1500                         if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1501                                 hn_stop(sc);
1502                         }
1503                 }
1504                 NV_LOCK(sc);
1505                 sc->temp_unusable = FALSE;
1506                 NV_UNLOCK(sc);
1507                 sc->hn_if_flags = ifp->if_flags;
1508                 error = 0;
1509                 break;
1510         case SIOCSIFCAP:
1511                 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
1512                 if (mask & IFCAP_TXCSUM) {
1513                         if (IFCAP_TXCSUM & ifp->if_capenable) {
1514                                 ifp->if_capenable &= ~IFCAP_TXCSUM;
1515                                 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
1516                         } else {
1517                                 ifp->if_capenable |= IFCAP_TXCSUM;
1518                                 /*
1519                                  * Only enable UDP checksum offloading on
1520                                  * Windows Server 2012R2 or later releases.
1521                                  */
1522                                 if (hv_vmbus_protocal_version >=
1523                                     HV_VMBUS_VERSION_WIN8_1) {
1524                                         ifp->if_hwassist |=
1525                                             (CSUM_TCP | CSUM_UDP);
1526                                 } else {
1527                                         ifp->if_hwassist |= CSUM_TCP;
1528                                 }
1529                         }
1530                 }
1531
1532                 if (mask & IFCAP_RXCSUM) {
1533                         if (IFCAP_RXCSUM & ifp->if_capenable) {
1534                                 ifp->if_capenable &= ~IFCAP_RXCSUM;
1535                         } else {
1536                                 ifp->if_capenable |= IFCAP_RXCSUM;
1537                         }
1538                 }
1539                 if (mask & IFCAP_LRO)
1540                         ifp->if_capenable ^= IFCAP_LRO;
1541
1542                 if (mask & IFCAP_TSO4) {
1543                         ifp->if_capenable ^= IFCAP_TSO4;
1544                         ifp->if_hwassist ^= CSUM_IP_TSO;
1545                 }
1546
1547                 if (mask & IFCAP_TSO6) {
1548                         ifp->if_capenable ^= IFCAP_TSO6;
1549                         ifp->if_hwassist ^= CSUM_IP6_TSO;
1550                 }
1551
1552                 error = 0;
1553                 break;
1554         case SIOCADDMULTI:
1555         case SIOCDELMULTI:
1556 #ifdef notyet
1557                 /* Fixme:  Multicast mode? */
1558                 if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1559                         NV_LOCK(sc);
1560                         netvsc_setmulti(sc);
1561                         NV_UNLOCK(sc);
1562                         error = 0;
1563                 }
1564 #endif
1565                 error = EINVAL;
1566                 break;
1567         case SIOCSIFMEDIA:
1568         case SIOCGIFMEDIA:
1569                 error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
1570                 break;
1571         default:
1572                 error = ether_ioctl(ifp, cmd, data);
1573                 break;
1574         }
1575
1576         return (error);
1577 }
1578
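#if 0
/*
 * Userland-side sketch (not kernel code) of what drives the SIOCSIFMTU
 * case above; roughly what "ifconfig hn0 mtu 9000" boils down to.  The
 * interface name is an assumption and error handling is minimal.
 */
#include <sys/ioctl.h>
#include <sys/sockio.h>
#include <sys/socket.h>
#include <net/if.h>
#include <string.h>
#include <unistd.h>

static int
set_if_mtu(const char *ifname, int mtu)
{
        struct ifreq ifr;
        int error, s;

        s = socket(AF_INET, SOCK_DGRAM, 0);
        if (s < 0)
                return (-1);
        memset(&ifr, 0, sizeof(ifr));
        strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
        ifr.ifr_mtu = mtu;
        error = ioctl(s, SIOCSIFMTU, &ifr);
        close(s);
        return (error);
}
#endif
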
1579 /*
1580  * Stop the interface: mark it down and close the RNDIS device.
1581  */
1582 static void
1583 hn_stop(hn_softc_t *sc)
1584 {
1585         struct ifnet *ifp;
1586         int ret;
1587         struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
1588
1589         ifp = sc->hn_ifp;
1590
1591         if (bootverbose)
1592                 printf("Closing device...\n");
1593
1594         atomic_clear_int(&ifp->if_drv_flags,
1595             (IFF_DRV_RUNNING | IFF_DRV_OACTIVE));
1596         if_link_state_change(ifp, LINK_STATE_DOWN);
1597         sc->hn_initdone = 0;
1598
1599         ret = hv_rf_on_close(device_ctx);
1600 }
1601
1602 /*
1603  * FreeBSD transmit entry point
1604  */
1605 static void
1606 hn_start(struct ifnet *ifp)
1607 {
1608         hn_softc_t *sc;
1609
1610         sc = ifp->if_softc;
1611         if (NV_TRYLOCK(sc)) {
1612                 int sched;
1613
1614                 sched = hn_start_locked(ifp, sc->hn_direct_tx_size);
1615                 NV_UNLOCK(sc);
1616                 if (!sched)
1617                         return;
1618         }
1619         taskqueue_enqueue_fast(sc->hn_tx_taskq, &sc->hn_start_task);
1620 }
1621
1622 static void
1623 hn_start_txeof(struct ifnet *ifp)
1624 {
1625         hn_softc_t *sc;
1626
1627         sc = ifp->if_softc;
1628         if (NV_TRYLOCK(sc)) {
1629                 int sched;
1630
1631                 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1632                 sched = hn_start_locked(ifp, sc->hn_direct_tx_size);
1633                 NV_UNLOCK(sc);
1634                 if (sched) {
1635                         taskqueue_enqueue_fast(sc->hn_tx_taskq,
1636                             &sc->hn_start_task);
1637                 }
1638         } else {
1639                 /*
1640                  * Release OACTIVE early, in the hope that other
1641                  * senders can catch up.  The task will clear the
1642                  * flag again under NV_LOCK to avoid possible
1643                  * races.
1644                  */
1645                 atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1646                 taskqueue_enqueue_fast(sc->hn_tx_taskq, &sc->hn_txeof_task);
1647         }
1648 }
1649
1650 /*
1651  * Bring the interface up; callers serialize via sc->temp_unusable.
1652  */
1653 static void
1654 hn_ifinit_locked(hn_softc_t *sc)
1655 {
1656         struct ifnet *ifp;
1657         struct hv_device *device_ctx = vmbus_get_devctx(sc->hn_dev);
1658         int ret;
1659
1660         ifp = sc->hn_ifp;
1661
1662         if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1663                 return;
1664         }
1665
1666         hv_promisc_mode = 1;
1667
1668         ret = hv_rf_on_open(device_ctx);
1669         if (ret != 0) {
1670                 return;
1671         } else {
1672                 sc->hn_initdone = 1;
1673         }
1674         atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1675         atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
1676         if_link_state_change(ifp, LINK_STATE_UP);
1677 }
1678
1679 /*
1680  * Interface initialization (if_init) entry point.
1681  */
1682 static void
1683 hn_ifinit(void *xsc)
1684 {
1685         hn_softc_t *sc = xsc;
1686
1687         NV_LOCK(sc);
1688         if (sc->temp_unusable) {
1689                 NV_UNLOCK(sc);
1690                 return;
1691         }
1692         sc->temp_unusable = TRUE;
1693         NV_UNLOCK(sc);
1694
1695         hn_ifinit_locked(sc);
1696
1697         NV_LOCK(sc);
1698         sc->temp_unusable = FALSE;
1699         NV_UNLOCK(sc);
1700 }
1701
1702 #ifdef LATER
1703 /*
1704  * Watchdog timeout handler: reinitialize and count the error.
1705  */
1706 static void
1707 hn_watchdog(struct ifnet *ifp)
1708 {
1709         hn_softc_t *sc;
1710         sc = ifp->if_softc;
1711
1712         printf("hn%d: watchdog timeout -- resetting\n", sc->hn_unit);
1713         hn_ifinit(sc);    /*???*/
1714         ifp->if_oerrors++;
1715 }
1716 #endif
1717
1718 #ifdef HN_LRO_HIWAT
1719 static int
1720 hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS)
1721 {
1722         struct hn_softc *sc = arg1;
1723         int hiwat, error;
1724
1725         hiwat = sc->hn_lro_hiwat;
1726         error = sysctl_handle_int(oidp, &hiwat, 0, req);
1727         if (error || req->newptr == NULL)
1728                 return error;
1729
1730         if (!HN_LRO_HIWAT_ISVALID(sc, hiwat))
1731                 return EINVAL;
1732
1733         if (sc->hn_lro_hiwat != hiwat)
1734                 hn_set_lro_hiwat(sc, hiwat);
1735         return 0;
1736 }
1737 #endif  /* HN_LRO_HIWAT */
1738
1739 static int
1740 hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS)
1741 {
1742         struct hn_softc *sc = arg1;
1743         int chimney_size, error;
1744
1745         chimney_size = sc->hn_tx_chimney_size;
1746         error = sysctl_handle_int(oidp, &chimney_size, 0, req);
1747         if (error || req->newptr == NULL)
1748                 return error;
1749
1750         if (chimney_size > sc->hn_tx_chimney_max || chimney_size <= 0)
1751                 return EINVAL;
1752
1753         if (sc->hn_tx_chimney_size != chimney_size)
1754                 sc->hn_tx_chimney_size = chimney_size;
1755         return 0;
1756 }
1757
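#if 0
/*
 * Minimal sketch of how a handler like hn_tx_chimney_size_sysctl above
 * is typically wired up; the driver's real registration happens in its
 * attach path.  The sysctl context and tree come from the standard
 * newbus accessors.
 */
static void
hn_register_chimney_sysctl(struct hn_softc *sc)
{
        struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->hn_dev);
        struct sysctl_oid_list *child =
            SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev));

        SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size",
            CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_tx_chimney_size_sysctl,
            "I", "Chimney send packet size limit");
}
#endif
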
1758 static int
1759 hn_check_iplen(const struct mbuf *m, int hoff)
1760 {
1761         const struct ip *ip;
1762         int len, iphlen, iplen;
1763         const struct tcphdr *th;
1764         int thoff;                              /* TCP data offset */
1765
1766         len = hoff + sizeof(struct ip);
1767
1768         /* The packet must be at least the size of an IP header. */
1769         if (m->m_pkthdr.len < len)
1770                 return IPPROTO_DONE;
1771
1772         /* The fixed IP header must reside completely in the first mbuf. */
1773         if (m->m_len < len)
1774                 return IPPROTO_DONE;
1775
1776         ip = mtodo(m, hoff);
1777
1778         /* Bound check the packet's stated IP header length. */
1779         iphlen = ip->ip_hl << 2;
1780         if (iphlen < sizeof(struct ip))         /* minimum header length */
1781                 return IPPROTO_DONE;
1782
1783         /* The full IP header must reside completely in the first mbuf. */
1784         if (m->m_len < hoff + iphlen)
1785                 return IPPROTO_DONE;
1786
1787         iplen = ntohs(ip->ip_len);
1788
1789         /*
1790          * Check that the amount of data in the buffers is at
1791          * least as much as the IP header would have us expect.
1792          */
1793         if (m->m_pkthdr.len < hoff + iplen)
1794                 return IPPROTO_DONE;
1795
1796         /*
1797          * Ignore IP fragments.
1798          */
1799         if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF))
1800                 return IPPROTO_DONE;
1801
1802         /*
1803          * The TCP/IP or UDP/IP header must be entirely contained within
1804          * the first mbuf of the packet.
1805          */
1806         switch (ip->ip_p) {
1807         case IPPROTO_TCP:
1808                 if (iplen < iphlen + sizeof(struct tcphdr))
1809                         return IPPROTO_DONE;
1810                 if (m->m_len < hoff + iphlen + sizeof(struct tcphdr))
1811                         return IPPROTO_DONE;
1812                 th = (const struct tcphdr *)((const uint8_t *)ip + iphlen);
1813                 thoff = th->th_off << 2;
1814                 if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen)
1815                         return IPPROTO_DONE;
1816                 if (m->m_len < hoff + iphlen + thoff)
1817                         return IPPROTO_DONE;
1818                 break;
1819         case IPPROTO_UDP:
1820                 if (iplen < iphlen + sizeof(struct udphdr))
1821                         return IPPROTO_DONE;
1822                 if (m->m_len < hoff + iphlen + sizeof(struct udphdr))
1823                         return IPPROTO_DONE;
1824                 break;
1825         default:
1826                 if (iplen < iphlen)
1827                         return IPPROTO_DONE;
1828                 break;
1829         }
1830         return ip->ip_p;
1831 }
1832
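#if 0
/*
 * Illustrative sketch of how an RX path can consume hn_check_iplen()
 * above: a frame only qualifies for TCP-side processing such as LRO
 * when its IP/TCP headers are well formed and it is not a fragment.
 * "hoff" is the offset of the IP header within the mbuf (typically the
 * Ethernet header size).
 */
static int
hn_rx_is_lro_candidate(const struct mbuf *m, int hoff)
{
        return (hn_check_iplen(m, hoff) == IPPROTO_TCP);
}
#endif
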
1833 static void
1834 hn_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error)
1835 {
1836         bus_addr_t *paddr = arg;
1837
1838         if (error)
1839                 return;
1840
1841         KASSERT(nseg == 1, ("too many segments %d!", nseg));
1842         *paddr = segs->ds_addr;
1843 }
1844
1845 static int
1846 hn_create_tx_ring(struct hn_softc *sc)
1847 {
1848         bus_dma_tag_t parent_dtag;
1849         int error, i;
1850
1851         sc->hn_txdesc_cnt = HN_TX_DESC_CNT;
1852         sc->hn_txdesc = malloc(sizeof(struct hn_txdesc) * sc->hn_txdesc_cnt,
1853             M_NETVSC, M_WAITOK | M_ZERO);
1854         SLIST_INIT(&sc->hn_txlist);
1855         mtx_init(&sc->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN);
1856
1857         parent_dtag = bus_get_dma_tag(sc->hn_dev);
1858
1859         /* DMA tag for RNDIS messages. */
1860         error = bus_dma_tag_create(parent_dtag, /* parent */
1861             HN_RNDIS_MSG_ALIGN,         /* alignment */
1862             HN_RNDIS_MSG_BOUNDARY,      /* boundary */
1863             BUS_SPACE_MAXADDR,          /* lowaddr */
1864             BUS_SPACE_MAXADDR,          /* highaddr */
1865             NULL, NULL,                 /* filter, filterarg */
1866             HN_RNDIS_MSG_LEN,           /* maxsize */
1867             1,                          /* nsegments */
1868             HN_RNDIS_MSG_LEN,           /* maxsegsize */
1869             0,                          /* flags */
1870             NULL,                       /* lockfunc */
1871             NULL,                       /* lockfuncarg */
1872             &sc->hn_tx_rndis_dtag);
1873         if (error) {
1874                 device_printf(sc->hn_dev, "failed to create rndis dmatag\n");
1875                 return error;
1876         }
1877
1878         /* DMA tag for data. */
1879         error = bus_dma_tag_create(parent_dtag, /* parent */
1880             1,                          /* alignment */
1881             HN_TX_DATA_BOUNDARY,        /* boundary */
1882             BUS_SPACE_MAXADDR,          /* lowaddr */
1883             BUS_SPACE_MAXADDR,          /* highaddr */
1884             NULL, NULL,                 /* filter, filterarg */
1885             HN_TX_DATA_MAXSIZE,         /* maxsize */
1886             HN_TX_DATA_SEGCNT_MAX,      /* nsegments */
1887             HN_TX_DATA_SEGSIZE,         /* maxsegsize */
1888             0,                          /* flags */
1889             NULL,                       /* lockfunc */
1890             NULL,                       /* lockfuncarg */
1891             &sc->hn_tx_data_dtag);
1892         if (error) {
1893                 device_printf(sc->hn_dev, "failed to create data dmatag\n");
1894                 return error;
1895         }
1896
1897         for (i = 0; i < sc->hn_txdesc_cnt; ++i) {
1898                 struct hn_txdesc *txd = &sc->hn_txdesc[i];
1899
1900                 txd->sc = sc;
1901
1902                 /*
1903                  * Allocate and load RNDIS messages.
1904                  */
1905                 error = bus_dmamem_alloc(sc->hn_tx_rndis_dtag,
1906                     (void **)&txd->rndis_msg,
1907                     BUS_DMA_WAITOK | BUS_DMA_COHERENT,
1908                     &txd->rndis_msg_dmap);
1909                 if (error) {
1910                         device_printf(sc->hn_dev,
1911                             "failed to allocate rndis_msg, %d\n", i);
1912                         return error;
1913                 }
1914
1915                 error = bus_dmamap_load(sc->hn_tx_rndis_dtag,
1916                     txd->rndis_msg_dmap,
1917                     txd->rndis_msg, HN_RNDIS_MSG_LEN,
1918                     hn_dma_map_paddr, &txd->rndis_msg_paddr,
1919                     BUS_DMA_NOWAIT);
1920                 if (error) {
1921                         device_printf(sc->hn_dev,
1922                             "failed to load rndis_msg, %d\n", i);
1923                         bus_dmamem_free(sc->hn_tx_rndis_dtag,
1924                             txd->rndis_msg, txd->rndis_msg_dmap);
1925                         return error;
1926                 }
1927
1928                 /* DMA map for TX data. */
1929                 error = bus_dmamap_create(sc->hn_tx_data_dtag, 0,
1930                     &txd->data_dmap);
1931                 if (error) {
1932                         device_printf(sc->hn_dev,
1933                             "failed to allocate tx data dmamap\n");
1934                         bus_dmamap_unload(sc->hn_tx_rndis_dtag,
1935                             txd->rndis_msg_dmap);
1936                         bus_dmamem_free(sc->hn_tx_rndis_dtag,
1937                             txd->rndis_msg, txd->rndis_msg_dmap);
1938                         return error;
1939                 }
1940
1941                 /* All set; put it on the free list. */
1942                 txd->flags |= HN_TXD_FLAG_ONLIST;
1943                 SLIST_INSERT_HEAD(&sc->hn_txlist, txd, link);
1944         }
1945         sc->hn_txdesc_avail = sc->hn_txdesc_cnt;
1946
1947         return 0;
1948 }
1949
1950 static void
1951 hn_destroy_tx_ring(struct hn_softc *sc)
1952 {
1953         struct hn_txdesc *txd;
1954
1955         while ((txd = SLIST_FIRST(&sc->hn_txlist)) != NULL) {
1956                 KASSERT(txd->m == NULL, ("still has mbuf installed"));
1957                 KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0,
1958                     ("still dma mapped"));
1959                 SLIST_REMOVE_HEAD(&sc->hn_txlist, link);
1960
1961                 bus_dmamap_unload(sc->hn_tx_rndis_dtag,
1962                     txd->rndis_msg_dmap);
1963                 bus_dmamem_free(sc->hn_tx_rndis_dtag,
1964                     txd->rndis_msg, txd->rndis_msg_dmap);
1965
1966                 bus_dmamap_destroy(sc->hn_tx_data_dtag, txd->data_dmap);
1967         }
1968
1969         if (sc->hn_tx_data_dtag != NULL)
1970                 bus_dma_tag_destroy(sc->hn_tx_data_dtag);
1971         if (sc->hn_tx_rndis_dtag != NULL)
1972                 bus_dma_tag_destroy(sc->hn_tx_rndis_dtag);
1973         free(sc->hn_txdesc, M_NETVSC);
1974         mtx_destroy(&sc->hn_txlist_spin);
1975 }
1976
1977 static void
1978 hn_start_taskfunc(void *xsc, int pending __unused)
1979 {
1980         struct hn_softc *sc = xsc;
1981
1982         NV_LOCK(sc);
1983         hn_start_locked(sc->hn_ifp, 0);
1984         NV_UNLOCK(sc);
1985 }
1986
1987 static void
1988 hn_txeof_taskfunc(void *xsc, int pending __unused)
1989 {
1990         struct hn_softc *sc = xsc;
1991         struct ifnet *ifp = sc->hn_ifp;
1992
1993         NV_LOCK(sc);
1994         atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
1995         hn_start_locked(ifp, 0);
1996         NV_UNLOCK(sc);
1997 }
1998
1999 static device_method_t netvsc_methods[] = {
2000         /* Device interface */
2001         DEVMETHOD(device_probe,         netvsc_probe),
2002         DEVMETHOD(device_attach,        netvsc_attach),
2003         DEVMETHOD(device_detach,        netvsc_detach),
2004         DEVMETHOD(device_shutdown,      netvsc_shutdown),
2005
2006         { 0, 0 }
2007 };
2008
2009 static driver_t netvsc_driver = {
2010         NETVSC_DEVNAME,
2011         netvsc_methods,
2012         sizeof(hn_softc_t)
2013 };
2014
2015 static devclass_t netvsc_devclass;
2016
2017 DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0);
2018 MODULE_VERSION(hn, 1);
2019 MODULE_DEPEND(hn, vmbus, 1, 1, 1);