2 * Copyright (c) 2006, Cisco Systems, Inc.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Neither the name of Cisco Systems, Inc. nor the names of its contributors
15 * may be used to endorse or promote products derived from this software
16 * without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/sockio.h>
39 #include <sys/malloc.h>
40 #include <sys/kernel.h>
41 #include <sys/socket.h>
42 #include <sys/queue.h>
43 #include <sys/taskqueue.h>
45 #include <sys/module.h>
47 #include <sys/sysctl.h>
50 #include <net/if_arp.h>
51 #include <net/if_types.h>
52 #include <net/ethernet.h>
53 #include <net/if_bridgevar.h>
55 #include <netinet/in_systm.h>
56 #include <netinet/in.h>
57 #include <netinet/in_var.h>
58 #include <netinet/ip.h>
59 #include <netinet/tcp.h>
60 #include <netinet/udp.h>
62 #include <netinet/sctp.h>
63 #include <netinet/sctp_crc32.h>
66 #include <vm/vm_extern.h>
67 #include <vm/vm_kern.h>
69 #include <machine/in_cksum.h>
70 #include <machine/xen-os.h>
71 #include <machine/hypervisor.h>
72 #include <machine/hypervisor-ifs.h>
73 #include <machine/xen_intr.h>
74 #include <machine/evtchn.h>
75 #include <machine/xenbus.h>
76 #include <machine/gnttab.h>
77 #include <machine/xen-public/memory.h>
78 #include <dev/xen/xenbus/xenbus_comms.h>
81 #ifdef XEN_NETBACK_DEBUG
82 #define DPRINTF(fmt, args...) \
83 printf("netback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
85 #define DPRINTF(fmt, args...) ((void)0)
88 #ifdef XEN_NETBACK_DEBUG_LOTS
89 #define DDPRINTF(fmt, args...) \
90 printf("netback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
91 #define DPRINTF_MBUF(_m) print_mbuf(_m, 0)
92 #define DPRINTF_MBUF_LEN(_m, _len) print_mbuf(_m, _len)
94 #define DDPRINTF(fmt, args...) ((void)0)
95 #define DPRINTF_MBUF(_m) ((void)0)
96 #define DPRINTF_MBUF_LEN(_m, _len) ((void)0)
99 #define WPRINTF(fmt, args...) \
100 printf("netback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
102 #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
103 #define BUG_ON PANIC_IF
105 #define IFNAME(_np) (_np)->ifp->if_xname
107 #define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE)
108 #define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE)
112 grant_handle_t handle;
116 typedef struct netback_info {
119 STAILQ_ENTRY(netback_info) next_tx;
120 STAILQ_ENTRY(netback_info) next_rx;
121 int on_tx_sched_list;
122 int on_rx_sched_list;
124 struct xenbus_device *xdev;
125 XenbusState frontend_state;
132 struct ring_ref tx_ring_ref;
133 struct ring_ref rx_ring_ref;
134 netif_tx_back_ring_t tx;
135 netif_rx_back_ring_t rx;
136 evtchn_port_t evtchn;
148 #define MAX_PENDING_REQS 256
149 #define PKT_PROT_LEN 64
152 netif_tx_request_t req;
154 } pending_tx_info[MAX_PENDING_REQS];
155 static uint16_t pending_ring[MAX_PENDING_REQS];
156 typedef unsigned int PEND_RING_IDX;
157 #define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
158 static PEND_RING_IDX pending_prod, pending_cons;
159 #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
161 static unsigned long mmap_vstart;
162 #define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE))
164 /* Freed TX mbufs get batched on this ring before return to pending_ring. */
165 static uint16_t dealloc_ring[MAX_PENDING_REQS];
166 static PEND_RING_IDX dealloc_prod, dealloc_cons;
168 static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+1];
169 static mmu_update_t rx_mmu[NET_RX_RING_SIZE];
170 static gnttab_transfer_t grant_rx_op[NET_RX_RING_SIZE];
172 static grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
173 static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS];
174 static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS];
176 static struct task net_tx_task, net_rx_task;
177 static struct callout rx_task_callout;
179 static STAILQ_HEAD(netback_tx_sched_list, netback_info) tx_sched_list =
180 STAILQ_HEAD_INITIALIZER(tx_sched_list);
181 static STAILQ_HEAD(netback_rx_sched_list, netback_info) rx_sched_list =
182 STAILQ_HEAD_INITIALIZER(rx_sched_list);
183 static struct mtx tx_sched_list_lock;
184 static struct mtx rx_sched_list_lock;
186 static int vif_unit_maker = 0;
189 static void netback_start(struct ifnet *ifp);
190 static int netback_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
191 static int vif_add_dev(struct xenbus_device *xdev);
192 static void disconnect_rings(netif_t *netif);
/*
 * NOTE(review): this listing is a line-numbered extract with gaps — several
 * original lines (function headers, braces, declarations of i/j) are absent.
 */
194 #ifdef XEN_NETBACK_DEBUG_LOTS
195 /* Debug code to display the contents of an mbuf */
/* Walks the mbuf chain and hex-dumps each data byte, 16 per row. */
197 print_mbuf(struct mbuf *m, int max)
200 printf("mbuf %08x len = %d", (unsigned int)m, m->m_pkthdr.len);
201 for (; m; m = m->m_next) {
202 unsigned char *d = m->m_data;
203 for (i=0; i < m->m_len; i++) {
/* 'j' is presumably a running byte offset used as the row label — its
 * declaration/increment is on lines missing from this listing; confirm. */
207 printf("\n%04x:", j);
208 printf(" %02x", d[i]);
/* Small cache of machine frames obtained from the hypervisor for page
 * transfers; refilled in bulk (MAX_MFN_ALLOC at a time) when empty. */
216 #define MAX_MFN_ALLOC 64
217 static unsigned long mfn_list[MAX_MFN_ALLOC];
218 static unsigned int alloc_index = 0;
/* Allocator body (header line missing — presumably alloc_mfn()): refills
 * mfn_list via XENMEM_increase_reservation when the cache is empty, then
 * pops one mfn; returns 0 on failure (mfn initialized to 0 above). */
223 unsigned long mfn = 0;
224 struct xen_memory_reservation reservation = {
225 .extent_start = mfn_list,
226 .nr_extents = MAX_MFN_ALLOC,
230 if ( unlikely(alloc_index == 0) )
231 alloc_index = HYPERVISOR_memory_op(
232 XENMEM_increase_reservation, &reservation);
233 if ( alloc_index != 0 )
234 mfn = mfn_list[--alloc_index];
/*
 * Allocate nr_pages of kernel VA whose backing machine frames are handed
 * back to Xen (XENMEM_decrease_reservation), leaving empty mappings that
 * grant operations can later map foreign pages into.  Works in batches of
 * 16 pages per multicall.  Returns the base VA, as an unsigned long.
 * NOTE(review): lines declaring pages/i/j and the loop closers are missing
 * from this listing.
 */
239 alloc_empty_page_range(unsigned long nr_pages)
243 multicall_entry_t mcl[17];
244 unsigned long mfn_list[16];
245 struct xen_memory_reservation reservation = {
246 .extent_start = mfn_list,
253 pages = malloc(nr_pages*PAGE_SIZE, M_DEVBUF, M_NOWAIT);
257 memset(mcl, 0, sizeof(mcl));
259 while (i < nr_pages) {
260 unsigned long va = (unsigned long)pages + (i++ * PAGE_SIZE);
/* Unmap the VA and record the mfn we are about to return to Xen. */
262 mcl[j].op = __HYPERVISOR_update_va_mapping;
265 mfn_list[j++] = vtomach(va) >> PAGE_SHIFT;
/* Mark the pfn as having no machine backing in the phys-to-machine map. */
267 xen_phys_machine[(vtophys(va) >> PAGE_SHIFT)] = INVALID_P2M_ENTRY;
269 if (j == 16 || i == nr_pages) {
/* Flush the TLB on the last unmap of the batch, then append the
 * decrease_reservation op and submit the whole batch at once. */
270 mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_LOCAL;
272 reservation.nr_extents = j;
274 mcl[j].op = __HYPERVISOR_memory_op;
275 mcl[j].args[0] = XENMEM_decrease_reservation;
276 mcl[j].args[1] = (unsigned long)&reservation;
278 (void)HYPERVISOR_multicall(mcl, j+1);
/* Reset the flags slot so the entry is clean for the next batch. */
280 mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = 0;
285 return (unsigned long)pages;
288 #ifdef XEN_NETBACK_FIXUP_CSUM
/*
 * Compute the TCP/UDP/SCTP checksum in software for a packet whose
 * frontend deferred it (csum_blank), before handing it somewhere that
 * cannot offload it.  Clears the corresponding CSUM_* flag once done.
 * NOTE(review): 'sw_csum' and 'csum' are declared on lines missing from
 * this listing; assumes an Ethernet + IPv4 header layout (eh + 1).
 */
290 fixup_checksum(struct mbuf *m)
292 struct ether_header *eh = mtod(m, struct ether_header *);
293 struct ip *ip = (struct ip *)(eh + 1);
294 int iphlen = ip->ip_hl << 2;
295 int iplen = ntohs(ip->ip_len);
297 if ((m->m_pkthdr.csum_flags & CSUM_TCP)) {
298 struct tcphdr *th = (struct tcphdr *)((caddr_t)ip + iphlen);
/* Seed the checksum field with the pseudo-header sum, then fold in the
 * TCP header + payload, skipping the link and IP headers. */
299 th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
300 htons(IPPROTO_TCP + (iplen - iphlen)));
301 th->th_sum = in_cksum_skip(m, iplen + sizeof(*eh), sizeof(*eh) + iphlen);
302 m->m_pkthdr.csum_flags &= ~CSUM_TCP;
304 } else if (sw_csum & CSUM_SCTP) {
305 sctp_delayed_cksum(m);
306 sw_csum &= ~CSUM_SCTP;
/* UDP case (the 'else' arm for it is on a missing line). */
310 struct udphdr *uh = (struct udphdr *)((caddr_t)ip + iphlen);
311 uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
312 htons(IPPROTO_UDP + (iplen - iphlen)));
/* A computed UDP checksum of 0 must be transmitted as 0xffff (RFC 768);
 * the substitution presumably happens on the missing line after this. */
313 if ((csum = in_cksum_skip(m, iplen + sizeof(*eh), sizeof(*eh) + iphlen)) == 0)
316 m->m_pkthdr.csum_flags &= ~CSUM_UDP;
321 /* Add the interface to the specified bridge */
/*
 * Issues the equivalent of "ifconfig <bridge> addm <ifp>" from inside the
 * kernel: looks up the bridge ifnet by name and sends a BRDGADD driver
 * command via bridge_ioctl_kern().  Returns the ioctl status.
 * NOTE(review): declarations of ifd/ifb and the ifp_bridge NULL check are
 * on lines missing from this listing.
 */
323 add_to_bridge(struct ifnet *ifp, char *bridge)
327 struct ifnet *ifp_bridge = ifunit(bridge);
332 bzero(&ifd, sizeof(ifd));
333 bzero(&ifb, sizeof(ifb));
335 strcpy(ifb.ifbr_ifsname, ifp->if_xname);
336 strcpy(ifd.ifd_name, ifp->if_xname);
337 ifd.ifd_cmd = BRDGADD;
338 ifd.ifd_len = sizeof(ifb);
341 return bridge_ioctl_kern(ifp_bridge, SIOCSDRVSPEC, &ifd);
/*
 * Allocate and initialize a netif for one frontend connection: zeroed
 * netif_t plus an IFT_ETHER ifnet named "vif<N>" (N from an atomic unit
 * counter).  Takes ownership of 'bridge' (freed on failure/teardown).
 * NOTE(review): several lines (return statements, if_alloc NULL test) are
 * missing from this listing.
 */
346 netif_create(int handle, struct xenbus_device *xdev, char *bridge)
351 netif = (netif_t *)malloc(sizeof(*netif), M_DEVBUF, M_NOWAIT | M_ZERO);
356 netif->handle = handle;
357 netif->domid = xdev->otherend_id;
359 netif->bridge = bridge;
362 /* Set up ifnet structure */
363 ifp = netif->ifp = if_alloc(IFT_ETHER);
/* if_alloc failure path: release everything we own and bail. */
366 free(bridge, M_DEVBUF);
367 free(netif, M_DEVBUF);
371 ifp->if_softc = netif;
372 if_initname(ifp, "vif",
373 atomic_fetchadd_int(&vif_unit_maker, 1) /* ifno */ );
374 ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX;
375 ifp->if_output = ether_output;
376 ifp->if_start = netback_start;
377 ifp->if_ioctl = netback_ioctl;
378 ifp->if_mtu = ETHERMTU;
/* Cap the send queue below the shared TX ring size. */
379 ifp->if_snd.ifq_maxlen = NET_TX_RING_SIZE - 1;
381 DPRINTF("Created %s for domid=%d handle=%d\n", IFNAME(netif), netif->domid, netif->handle);
/* Take a reference on the netif. */
387 netif_get(netif_t *netif)
389 atomic_add_int(&netif->ref_cnt, 1);
/* Drop a reference; on the last one, disconnect the rings and free the
 * netif and its bridge name string. */
393 netif_put(netif_t *netif)
395 if (atomic_fetchadd_int(&netif->ref_cnt, -1) == 1) {
396 DPRINTF("%s\n", IFNAME(netif));
397 disconnect_rings(netif);
403 free(netif->bridge, M_DEVBUF);
404 free(netif, M_DEVBUF);
/*
 * ifnet ioctl handler: logs SIOCSIFFLAGS (and other cmds) at debug level,
 * then defers everything to the stock ether_ioctl().
 */
409 netback_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
413 DDPRINTF("%s cmd=SIOCSIFFLAGS flags=%x\n",
414 IFNAME((struct netback_info *)ifp->if_softc), ((struct ifreq *)data)->ifr_flags);
418 DDPRINTF("%s cmd=%lx\n", IFNAME((struct netback_info *)ifp->if_softc), cmd);
420 return ether_ioctl(ifp, cmd, data);
/*
 * Kick the TX task if there is both room in the pending-request table
 * (less than half full) and at least one netif queued with work.
 */
424 maybe_schedule_tx_action(void)
427 if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && !STAILQ_EMPTY(&tx_sched_list))
428 taskqueue_enqueue(taskqueue_swi, &net_tx_task)
431 /* Removes netif from front of list and does not call netif_put() (caller must) */
433 remove_from_tx_schedule_list(void)
437 mtx_lock(&tx_sched_list_lock);
439 if ((netif = STAILQ_FIRST(&tx_sched_list))) {
440 STAILQ_REMOVE(&tx_sched_list, netif, netback_info, next_tx);
/* Clear the stale next pointer so a later re-insert starts clean. */
441 STAILQ_NEXT(netif, next_tx) = NULL;
442 netif->on_tx_sched_list = 0;
445 mtx_unlock(&tx_sched_list_lock);
450 /* Adds netif to end of list and calls netif_get() */
452 add_to_tx_schedule_list_tail(netif_t *netif)
/* Unlocked fast-path check; re-checked under the lock below. */
454 if (netif->on_tx_sched_list)
457 mtx_lock(&tx_sched_list_lock);
458 if (!netif->on_tx_sched_list && (netif->ifp->if_drv_flags & IFF_DRV_RUNNING)) {
460 STAILQ_INSERT_TAIL(&tx_sched_list, netif, next_tx);
461 netif->on_tx_sched_list = 1;
463 mtx_unlock(&tx_sched_list_lock);
467 * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER:
468 * If this driver is pipelining transmit requests then we can be very
469 * aggressive in avoiding new-packet notifications -- frontend only needs to
470 * send a notification if there are no outstanding unreceived responses.
471 * If we may be buffer transmit buffers for any reason then we must be rather
472 * more conservative and treat this as the final check for pending work.
/* Re-queue the netif and kick the TX task if the ring still holds
 * unconsumed requests (check variant selected by the #ifdef above). */
475 netif_schedule_tx_work(netif_t *netif)
479 #ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
480 more_to_do = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx);
482 RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
486 DDPRINTF("Adding %s to tx sched list\n", IFNAME(netif));
487 add_to_tx_schedule_list_tail(netif);
488 maybe_schedule_tx_action();
/* Spin lock protecting dealloc_ring; taken from the mbuf-free callback,
 * which can run in contexts where a sleep lock is not allowed. */
492 static struct mtx dealloc_lock;
493 MTX_SYSINIT(netback_dealloc, &dealloc_lock, "DEALLOC LOCK", MTX_SPIN | MTX_NOWITNESS);
/* Queue a finished pending-ring slot for grant unmapping and kick the TX
 * task, which performs the actual unmap in net_tx_action_dealloc(). */
496 netif_idx_release(uint16_t pending_idx)
498 mtx_lock_spin(&dealloc_lock);
499 dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx;
500 mtx_unlock_spin(&dealloc_lock);
502 taskqueue_enqueue(taskqueue_swi, &net_tx_task);
/* Push one TX response onto the shared ring and notify the frontend via
 * its event-channel irq if the ring macro says a notification is due. */
506 make_tx_response(netif_t *netif,
510 RING_IDX i = netif->tx.rsp_prod_pvt;
511 netif_tx_response_t *resp;
514 resp = RING_GET_RESPONSE(&netif->tx, i);
518 netif->tx.rsp_prod_pvt = ++i;
519 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify);
521 notify_remote_via_irq(netif->irq);
523 #ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
524 if (i == netif->tx.req_cons) {
526 RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
528 add_to_tx_schedule_list_tail(netif);
/* Drain dealloc_ring: batch-unmap the finished grants in one hypercall,
 * then send the TX responses and recycle the pending-ring slots. */
534 net_tx_action_dealloc(void)
536 gnttab_unmap_grant_ref_t *gop;
537 uint16_t pending_idx;
538 PEND_RING_IDX dc, dp;
546 * Free up any grants we have finished using
550 pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)];
551 gop->host_addr = MMAP_VADDR(pending_idx);
552 gop->dev_bus_addr = 0;
553 gop->handle = grant_tx_handle[pending_idx];
556 ret = HYPERVISOR_grant_table_op(
557 GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops);
560 while (dealloc_cons != dp) {
561 pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)];
563 netif = pending_tx_info[pending_idx].netif;
565 make_tx_response(netif, pending_tx_info[pending_idx].req.id,
568 pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
/* mbuf external-storage free callback: args carries the pending index. */
575 netif_page_release(void *buf, void *args)
577 uint16_t pending_idx = (unsigned int)args;
579 DDPRINTF("pending_idx=%u\n", pending_idx);
581 KASSERT(pending_idx < MAX_PENDING_REQS, ("%s: bad index %u", __func__, pending_idx));
583 netif_idx_release(pending_idx);
/*
 * TX task (guest -> host): pull transmit requests from each scheduled
 * frontend's shared ring, validate them, map the granted guest pages in
 * one batched GNTTABOP_map_grant_ref hypercall, wrap the data in mbufs
 * and feed them to the network stack via if_input.
 * NOTE(review): this listing omits many lines (declarations, braces,
 * continue/break statements, loop closers); comments below describe only
 * what the visible lines establish.
 */
587 net_tx_action(void *context, int pending)
591 netif_tx_request_t txreq;
592 uint16_t pending_idx;
594 gnttab_map_grant_ref_t *mop;
596 struct mbuf *txq = NULL, *txq_last = NULL;
/* First retire any grants whose mbufs have been freed. */
598 if (dealloc_cons != dealloc_prod)
599 net_tx_action_dealloc();
/* Phase 1: consume ring requests and build the grant-map batch. */
602 while ((NR_PENDING_REQS < MAX_PENDING_REQS) && !STAILQ_EMPTY(&tx_sched_list)) {
604 /* Get a netif from the list with work to do. */
605 netif = remove_from_tx_schedule_list();
607 DDPRINTF("Processing %s (prod=%u, cons=%u)\n",
608 IFNAME(netif), netif->tx.sring->req_prod, netif->tx.req_cons);
610 RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do);
616 i = netif->tx.req_cons;
617 rmb(); /* Ensure that we see the request before we copy it. */
618 memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq));
620 /* If we want credit-based scheduling, coud add it here - WORK */
622 netif->tx.req_cons++;
/* Re-queue the netif if its ring still has work. */
624 netif_schedule_tx_work(netif);
/* Reject frames smaller than an Ethernet header or larger than a
 * maximal untagged frame. */
626 if (unlikely(txreq.size < ETHER_HDR_LEN) ||
627 unlikely(txreq.size > (ETHER_MAX_LEN-ETHER_CRC_LEN))) {
628 WPRINTF("Bad packet size: %d\n", txreq.size);
629 make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
634 /* No crossing a page as the payload mustn't fragment. */
635 if (unlikely((txreq.offset + txreq.size) >= PAGE_SIZE)) {
636 WPRINTF("txreq.offset: %x, size: %u, end: %u\n",
637 txreq.offset, txreq.size,
638 (txreq.offset & PAGE_MASK) + txreq.size);
639 make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
644 pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
646 MGETHDR(m, M_DONTWAIT, MT_DATA);
648 WPRINTF("Failed to allocate mbuf\n");
649 make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
653 m->m_pkthdr.rcvif = netif->ifp;
/* Packets larger than PKT_PROT_LEN get a second mbuf: the first holds
 * the copied protocol headers, the second will reference the mapped
 * guest page directly (see MEXTADD below). */
655 if ((m->m_pkthdr.len = txreq.size) > PKT_PROT_LEN) {
657 MGET(n, M_DONTWAIT, MT_DATA);
658 if (!(m->m_next = n)) {
660 WPRINTF("Failed to allocate second mbuf\n");
661 make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
665 n->m_len = txreq.size - PKT_PROT_LEN;
666 m->m_len = PKT_PROT_LEN;
668 m->m_len = txreq.size;
/* Add a read-only map of the granted page to the batch. */
670 mop->host_addr = MMAP_VADDR(pending_idx);
671 mop->dom = netif->domid;
672 mop->ref = txreq.gref;
673 mop->flags = GNTMAP_host_map | GNTMAP_readonly;
676 memcpy(&pending_tx_info[pending_idx].req,
677 &txreq, sizeof(txreq));
678 pending_tx_info[pending_idx].netif = netif;
/* Stash the pending index in the mbuf data area until phase 2. */
679 *((uint16_t *)m->m_data) = pending_idx;
682 txq_last->m_nextpkt = m;
/* Stop batching when the map-op array is full. */
689 if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops))
/* Map all granted pages in a single hypercall. */
696 ret = HYPERVISOR_grant_table_op(
697 GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops);
/* Phase 2: walk the queued mbufs, check map status, fill data, input. */
701 while ((m = txq) != NULL) {
707 pending_idx = *((uint16_t *)m->m_data);
708 netif = pending_tx_info[pending_idx].netif;
709 memcpy(&txreq, &pending_tx_info[pending_idx].req, sizeof(txreq));
711 /* Check the remap error code. */
712 if (unlikely(mop->status)) {
713 WPRINTF("#### netback grant fails\n");
714 make_tx_response(netif, txreq.id, NETIF_RSP_ERROR);
/* Recycle the pending slot on failure. */
718 pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
723 /* Can't do this in FreeBSD since vtophys() returns the pfn */
724 /* of the remote domain who loaned us the machine page - DPT */
725 xen_phys_machine[(vtophys(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT)] =
726 mop->dev_bus_addr >> PAGE_SHIFT;
728 grant_tx_handle[pending_idx] = mop->handle;
730 /* Setup data in mbuf (lengths are already set) */
731 data = (caddr_t)(MMAP_VADDR(pending_idx)|txreq.offset);
732 bcopy(data, m->m_data, m->m_len);
/* Large packet: second mbuf points straight into the mapped page;
 * netif_page_release() runs when the stack frees it. */
734 struct mbuf *n = m->m_next;
735 MEXTADD(n, MMAP_VADDR(pending_idx), PAGE_SIZE, netif_page_release,
736 (void *)(unsigned int)pending_idx, M_RDONLY, EXT_NET_DRV);
737 n->m_data = &data[PKT_PROT_LEN];
739 /* Schedule a response immediately. */
740 netif_idx_release(pending_idx);
743 if ((txreq.flags & NETTXF_data_validated)) {
744 /* Tell the stack the checksums are okay */
745 m->m_pkthdr.csum_flags |=
746 (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
747 m->m_pkthdr.csum_data = 0xffff;
750 /* If necessary, inform stack to compute the checksums if it forwards the packet */
751 if ((txreq.flags & NETTXF_csum_blank)) {
752 struct ether_header *eh = mtod(m, struct ether_header *);
753 if (ntohs(eh->ether_type) == ETHERTYPE_IP) {
754 struct ip *ip = (struct ip *)&m->m_data[14];
755 if (ip->ip_p == IPPROTO_TCP)
756 m->m_pkthdr.csum_flags |= CSUM_TCP;
757 else if (ip->ip_p == IPPROTO_UDP)
758 m->m_pkthdr.csum_flags |= CSUM_UDP;
762 netif->ifp->if_ibytes += m->m_pkthdr.len;
763 netif->ifp->if_ipackets++;
765 DDPRINTF("RECV %d bytes from %s (cflags=%x)\n",
766 m->m_pkthdr.len, IFNAME(netif), m->m_pkthdr.csum_flags);
767 DPRINTF_MBUF_LEN(m, 128);
769 (*netif->ifp->if_input)(netif->ifp, m);
775 /* Handle interrupt from a frontend */
/* Event-channel interrupt: queue this netif for TX and maybe run the task. */
777 netback_intr(void *arg)
779 netif_t *netif = arg;
780 DDPRINTF("%s\n", IFNAME(netif));
781 add_to_tx_schedule_list_tail(netif);
782 maybe_schedule_tx_action();
785 /* Removes netif from front of list and does not call netif_put() (caller must) */
787 remove_from_rx_schedule_list(void)
791 mtx_lock(&rx_sched_list_lock);
793 if ((netif = STAILQ_FIRST(&rx_sched_list))) {
794 STAILQ_REMOVE(&rx_sched_list, netif, netback_info, next_rx);
795 STAILQ_NEXT(netif, next_rx) = NULL;
796 netif->on_rx_sched_list = 0;
799 mtx_unlock(&rx_sched_list_lock);
804 /* Adds netif to end of list and calls netif_get() */
806 add_to_rx_schedule_list_tail(netif_t *netif)
/* Unlocked fast-path check; re-checked under the lock below. */
808 if (netif->on_rx_sched_list)
811 mtx_lock(&rx_sched_list_lock);
812 if (!netif->on_rx_sched_list && (netif->ifp->if_drv_flags & IFF_DRV_RUNNING)) {
814 STAILQ_INSERT_TAIL(&rx_sched_list, netif, next_rx);
815 netif->on_rx_sched_list = 1;
817 mtx_unlock(&rx_sched_list_lock);
/* Push one RX response onto the shared ring; 'status' holds the size on
 * success or the (negative) error status otherwise.  Returns the notify
 * flag from the ring macro (see caller at line 1021). */
821 make_rx_response(netif_t *netif, uint16_t id, int8_t st,
822 uint16_t offset, uint16_t size, uint16_t flags)
824 RING_IDX i = netif->rx.rsp_prod_pvt;
825 netif_rx_response_t *resp;
828 resp = RING_GET_RESPONSE(&netif->rx, i);
829 resp->offset = offset;
832 resp->status = (int16_t)size;
834 resp->status = (int16_t)st;
836 DDPRINTF("rx resp(%d): off=%x fl=%x id=%x stat=%d\n",
837 i, resp->offset, resp->flags, resp->id, resp->status);
839 netif->rx.rsp_prod_pvt = ++i;
840 RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, notify);
/*
 * RX path (host -> guest): dequeue packets from the ifnet send queue and
 * transfer their backing machine pages to the frontend with
 * GNTTABOP_transfer, replacing each local page with a freshly allocated
 * mfn.  Returns the number of packets dequeued.
 * NOTE(review): many lines (declarations of mcl/mmu/gop cursors, braces,
 * drop paths) are missing from this listing.
 */
846 netif_rx(netif_t *netif)
848 struct ifnet *ifp = netif->ifp;
850 multicall_entry_t *mcl;
852 gnttab_transfer_t *gop;
853 unsigned long vdata, old_mfn, new_mfn;
854 struct mbuf *rxq = NULL, *rxq_last = NULL;
855 int ret, notify = 0, pkts_dequeued = 0;
857 DDPRINTF("%s\n", IFNAME(netif));
863 while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
865 /* Quit if the target domain has no receive buffers */
866 if (netif->rx.req_cons == netif->rx.sring->req_prod)
869 IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
875 /* Check if we need to copy the data */
/* A page can only be transferred if we exclusively own it: writable
 * external storage, refcount 1, single-mbuf chain.  Otherwise copy
 * into a fresh cluster first. */
876 if (((m->m_flags & (M_RDONLY|M_EXT)) != M_EXT) ||
877 (*m->m_ext.ref_cnt > 1) || m->m_next != NULL) {
880 DDPRINTF("copying mbuf (fl=%x ext=%x rc=%d n=%x)\n",
882 (m->m_flags & M_EXT) ? m->m_ext.ext_type : 0,
883 (m->m_flags & M_EXT) ? *m->m_ext.ref_cnt : 0,
884 (unsigned int)m->m_next);
887 MGETHDR(n, M_DONTWAIT, MT_DATA);
891 MCLGET(n, M_DONTWAIT);
892 if (!(n->m_flags & M_EXT)) {
897 /* Leave space at front and keep current alignment */
898 n->m_data += 16 + ((unsigned int)m->m_data & 0x3);
900 if (m->m_pkthdr.len > M_TRAILINGSPACE(n)) {
901 WPRINTF("pkt to big %d\n", m->m_pkthdr.len);
905 m_copydata(m, 0, m->m_pkthdr.len, n->m_data);
906 n->m_pkthdr.len = n->m_len = m->m_pkthdr.len;
907 n->m_pkthdr.csum_flags = (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA);
912 vdata = (unsigned long)m->m_data;
913 old_mfn = vtomach(vdata) >> PAGE_SHIFT;
/* Need a replacement machine frame for the page we give away. */
915 if ((new_mfn = alloc_mfn()) == 0)
918 #ifdef XEN_NETBACK_FIXUP_CSUM
919 /* Check if we need to compute a checksum. This happens */
920 /* when bridging from one domain to another. */
921 if ((m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) ||
922 (m->m_pkthdr.csum_flags & CSUM_SCTP))
/* Re-point the pfn at the new mfn and rebuild the VA mapping. */
926 xen_phys_machine[(vtophys(vdata) >> PAGE_SHIFT)] = new_mfn;
928 mcl->op = __HYPERVISOR_update_va_mapping;
929 mcl->args[0] = vdata;
930 mcl->args[1] = (new_mfn << PAGE_SHIFT) | PG_V | PG_RW | PG_M | PG_A;
/* Queue the page transfer against the frontend's posted RX request. */
936 gop->domid = netif->domid;
937 gop->ref = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons)->gref;
938 netif->rx.req_cons++;
/* Tell Xen about the machine-to-phys change for the new frame. */
941 mmu->ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
942 mmu->val = vtophys(vdata) >> PAGE_SHIFT;
946 rxq_last->m_nextpkt = m;
951 DDPRINTF("XMIT %d bytes to %s\n", m->m_pkthdr.len, IFNAME(netif));
952 DPRINTF_MBUF_LEN(m, 128);
954 /* Filled the batch queue? */
955 if ((gop - grant_rx_op) == ARRAY_SIZE(grant_rx_op))
960 DDPRINTF("dropping pkt\n");
/* Nothing batched: return early with the dequeue count. */
966 return pkts_dequeued;
/* Submit the multicall batch (va updates + mmu updates) and transfers. */
968 mcl->op = __HYPERVISOR_mmu_update;
969 mcl->args[0] = (unsigned long)rx_mmu;
970 mcl->args[1] = mmu - rx_mmu;
972 mcl->args[3] = DOMID_SELF;
975 mcl[-2].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL;
976 ret = HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl);
979 ret = HYPERVISOR_grant_table_op(GNTTABOP_transfer, grant_rx_op, gop - grant_rx_op);
/* Walk the batched mbufs, check transfer status, post RX responses. */
985 while ((m = rxq) != NULL) {
987 uint16_t id, flags = 0;
992 /* Rederive the machine addresses. */
993 new_mfn = mcl->args[1] >> PAGE_SHIFT;
996 ifp->if_obytes += m->m_pkthdr.len;
999 /* The update_va_mapping() must not fail. */
1000 BUG_ON(mcl->result != 0);
1003 if ((m->m_pkthdr.csum_flags & CSUM_DELAY_DATA))
1004 flags |= NETRXF_csum_blank | NETRXF_data_validated;
1005 else if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID))
1006 flags |= NETRXF_data_validated;
1008 /* Check the reassignment error code. */
1009 status = NETIF_RSP_OKAY;
1010 if (gop->status != 0) {
1011 DPRINTF("Bad status %d from grant transfer to DOM%u\n",
1012 gop->status, netif->domid);
1014 * Page no longer belongs to us unless GNTST_bad_page,
1015 * but that should be a fatal error anyway.
1017 BUG_ON(gop->status == GNTST_bad_page);
1018 status = NETIF_RSP_ERROR;
1020 id = RING_GET_REQUEST(&netif->rx, netif->rx.rsp_prod_pvt)->id;
1021 notify |= make_rx_response(netif, id, status,
1022 (unsigned long)m->m_data & PAGE_MASK,
1023 m->m_pkthdr.len, flags);
1031 notify_remote_via_irq(netif->irq);
1033 return pkts_dequeued;
/* Callout handler: re-kick the RX task after a one-tick backoff. */
1037 rx_task_timer(void *arg)
1040 taskqueue_enqueue(taskqueue_swi, &net_rx_task);
/*
 * RX task: service each netif on the rx schedule list via netif_rx().
 * 'last_zero_work' detects a full lap in which no netif made progress
 * (e.g. frontends out of buffers); in that case back off via the callout
 * instead of spinning.
 */
1044 net_rx_action(void *context, int pending)
1046 netif_t *netif, *last_zero_work = NULL;
1050 while ((netif = remove_from_rx_schedule_list())) {
1051 struct ifnet *ifp = netif->ifp;
1053 if (netif == last_zero_work) {
/* Came back around to the first no-progress netif: requeue the
 * pending work and retry one tick later. */
1054 if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
1055 add_to_rx_schedule_list_tail(netif);
1057 if (!STAILQ_EMPTY(&rx_sched_list))
1058 callout_reset(&rx_task_callout, 1, rx_task_timer, NULL);
1062 if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) {
1063 if (netif_rx(netif))
1064 last_zero_work = NULL;
1065 else if (!last_zero_work)
1066 last_zero_work = netif;
1067 if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
1068 add_to_rx_schedule_list_tail(netif);
/* ifnet if_start hook: schedule this netif for RX service. */
1076 netback_start(struct ifnet *ifp)
1078 netif_t *netif = (netif_t *)ifp->if_softc;
1080 DDPRINTF("%s\n", IFNAME(netif));
1082 add_to_rx_schedule_list_tail(netif);
1083 taskqueue_enqueue(taskqueue_swi, &net_rx_task);
1086 /* Map a grant ref to a ring */
/* Allocates one page of KVA and maps the frontend's granted ring page
 * into it; records handle/bus_addr in 'ring' for later unmapping.  On
 * grant failure the KVA is freed and (per the missing line after 1102)
 * presumably an error is returned. */
1088 map_ring(grant_ref_t ref, domid_t dom, struct ring_ref *ring)
1090 struct gnttab_map_grant_ref op;
1092 ring->va = kmem_alloc_nofault(kernel_map, PAGE_SIZE);
1096 op.host_addr = ring->va;
1097 op.flags = GNTMAP_host_map;
1100 HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
1102 WPRINTF("grant table op err=%d\n", op.status);
1103 kmem_free(kernel_map, ring->va, PAGE_SIZE);
1108 ring->handle = op.handle;
1109 ring->bus_addr = op.dev_bus_addr;
1114 /* Unmap grant ref for a ring */
/* Reverse of map_ring(): unmap the grant and release the KVA page. */
1116 unmap_ring(struct ring_ref *ring)
1118 struct gnttab_unmap_grant_ref op;
1120 op.host_addr = ring->va;
1121 op.dev_bus_addr = ring->bus_addr;
1122 op.handle = ring->handle;
1123 HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
1125 WPRINTF("grant table op err=%d\n", op.status);
1127 kmem_free(kernel_map, ring->va, PAGE_SIZE);
/*
 * Read the frontend's ring refs and event channel from xenstore, map both
 * shared rings, bind the interdomain event channel to netback_intr(), and
 * mark the rings connected.  Error paths unwind whatever was mapped and
 * report via xenbus_dev_fatal().
 */
1132 connect_rings(netif_t *netif)
1134 struct xenbus_device *xdev = netif->xdev;
1135 netif_tx_sring_t *txs;
1136 netif_rx_sring_t *rxs;
1137 unsigned long tx_ring_ref, rx_ring_ref;
1138 evtchn_port_t evtchn;
1139 evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain };
1142 // Grab FE data and map his memory
1143 err = xenbus_gather(NULL, xdev->otherend,
1144 "tx-ring-ref", "%lu", &tx_ring_ref,
1145 "rx-ring-ref", "%lu", &rx_ring_ref,
1146 "event-channel", "%u", &evtchn, NULL);
1148 xenbus_dev_fatal(xdev, err,
1149 "reading %s/ring-ref and event-channel",
1154 err = map_ring(tx_ring_ref, netif->domid, &netif->tx_ring_ref);
1156 xenbus_dev_fatal(xdev, err, "mapping tx ring");
1159 txs = (netif_tx_sring_t *)netif->tx_ring_ref.va;
1160 BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
1162 err = map_ring(rx_ring_ref, netif->domid, &netif->rx_ring_ref);
1164 unmap_ring(&netif->tx_ring_ref);
1165 xenbus_dev_fatal(xdev, err, "mapping rx ring");
1168 rxs = (netif_rx_sring_t *)netif->rx_ring_ref.va;
1169 BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
1171 op.u.bind_interdomain.remote_dom = netif->domid;
1172 op.u.bind_interdomain.remote_port = evtchn;
1173 err = HYPERVISOR_event_channel_op(&op);
1175 unmap_ring(&netif->tx_ring_ref);
1176 unmap_ring(&netif->rx_ring_ref);
1177 xenbus_dev_fatal(xdev, err, "binding event channel");
1180 netif->evtchn = op.u.bind_interdomain.local_port;
1182 /* bind evtchn to irq handler */
1184 bind_evtchn_to_irqhandler(netif->evtchn, "netback",
1185 netback_intr, netif, INTR_TYPE_NET|INTR_MPSAFE, &netif->irq_cookie);
1187 netif->rings_connected = 1;
1189 DPRINTF("%s connected! evtchn=%d irq=%d\n",
1190 IFNAME(netif), netif->evtchn, netif->irq);
/* Tear down the event-channel binding and both ring mappings, if any. */
1196 disconnect_rings(netif_t *netif)
1200 if (netif->rings_connected) {
1201 unbind_from_irqhandler(netif->irq, netif->irq_cookie);
1203 unmap_ring(&netif->tx_ring_ref);
1204 unmap_ring(&netif->rx_ring_ref);
1205 netif->rings_connected = 0;
/* If the frontend has reached Connected, hook up the rings, switch our
 * xenbus state to Connected, and mark the interface up and running. */
1210 connect(netif_t *netif)
1214 netif->frontend_state != XenbusStateConnected) {
1218 if (!connect_rings(netif)) {
1219 xenbus_switch_state(netif->xdev, NULL, XenbusStateConnected);
1221 /* Turn on interface */
1222 netif->ifp->if_drv_flags |= IFF_DRV_RUNNING;
1223 netif->ifp->if_flags |= IFF_UP;
/* xenbus remove callback: detach the newbus vif child, if one exists. */
1228 netback_remove(struct xenbus_device *xdev)
1230 netif_t *netif = xdev->data;
1233 DPRINTF("remove %s\n", xdev->nodename);
1235 if ((ndev = netif->ndev)) {
1238 device_detach(ndev);
1250 * Entry point to this code when a new device is created. Allocate the basic
1251 * structures and the ring buffers for communication with the frontend.
1252 * Switch to Connected state.
1255 netback_probe(struct xenbus_device *xdev, const struct xenbus_device_id *id)
1261 DPRINTF("node=%s\n", xdev->nodename);
1263 /* Grab the handle */
1264 err = xenbus_scanf(NULL, xdev->nodename, "handle", "%li", &handle);
1266 xenbus_dev_fatal(xdev, err, "reading handle");
1270 /* Check for bridge */
/* Optional: a "bridge" xenstore key names a bridge to enslave the vif to
 * at attach time (see vif_attach / add_to_bridge). */
1271 bridge = xenbus_read(NULL, xdev->nodename, "bridge", NULL);
1275 err = xenbus_switch_state(xdev, NULL, XenbusStateInitWait);
1277 xenbus_dev_fatal(xdev, err, "writing switch state");
1281 err = netif_create(handle, xdev, bridge);
1283 xenbus_dev_fatal(xdev, err, "creating netif");
1287 err = vif_add_dev(xdev);
/* vif_add_dev failure: drop the reference netif_create gave us. */
1289 netif_put((netif_t *)xdev->data);
1290 xenbus_dev_fatal(xdev, err, "adding vif device");
1298 * We are reconnecting to the backend, due to a suspend/resume, or a backend
1299 * driver restart. We tear down our netif structure and recreate it, but
1300 * leave the device-layer structures intact so that this is transparent to the
1301 * rest of the kernel.
1303 static int netback_resume(struct xenbus_device *xdev)
1305 DPRINTF("node=%s\n", xdev->nodename);
1311 * Callback received when the frontend's state changes.
1313 static void frontend_changed(struct xenbus_device *xdev,
1314 XenbusState frontend_state)
1316 netif_t *netif = xdev->data;
1318 DPRINTF("state=%d\n", frontend_state);
1320 netif->frontend_state = frontend_state;
1322 switch (frontend_state) {
1323 case XenbusStateInitialising:
1324 case XenbusStateInitialised:
1326 case XenbusStateConnected:
1329 case XenbusStateClosing:
1330 xenbus_switch_state(xdev, NULL, XenbusStateClosing);
1332 case XenbusStateClosed:
1333 xenbus_remove_device(xdev);
1335 case XenbusStateUnknown:
1336 case XenbusStateInitWait:
/* These states are unexpected from a frontend; treat as fatal. */
1337 xenbus_dev_fatal(xdev, EINVAL, "saw state %d at frontend",
1343 /* ** Driver registration ** */
1345 static struct xenbus_device_id netback_ids[] = {
1350 static struct xenbus_driver netback = {
1353 .probe = netback_probe,
1354 .remove = netback_remove,
1355 .resume= netback_resume,
1356 .otherend_changed = frontend_changed,
/*
 * One-time module init (SYSINIT below): reserve the grant-map VA window,
 * seed the pending ring with all indices, set up the TX/RX tasks, the
 * schedule-list locks and the RX backoff callout, then register with
 * xenbus as a backend driver.
 */
1360 netback_init(void *unused)
1362 callout_init(&rx_task_callout, CALLOUT_MPSAFE);
1364 mmap_vstart = alloc_empty_page_range(MAX_PENDING_REQS);
1365 BUG_ON(!mmap_vstart);
1368 for (pending_prod = 0; pending_prod < MAX_PENDING_REQS; pending_prod++)
1369 pending_ring[pending_prod] = pending_prod;
1371 TASK_INIT(&net_tx_task, 0, net_tx_action, NULL);
1372 TASK_INIT(&net_rx_task, 0, net_rx_action, NULL);
1373 mtx_init(&tx_sched_list_lock, "nb_tx_sched_lock", "netback tx sched lock", MTX_DEF);
1374 mtx_init(&rx_sched_list_lock, "nb_rx_sched_lock", "netback rx sched lock", MTX_DEF);
1376 DPRINTF("registering %s\n", netback.name);
1378 xenbus_register_backend(&netback);
1381 SYSINIT(xnbedev, SI_SUB_PSEUDO, SI_ORDER_ANY, netback_init, NULL)
/*
 * Create a newbus "vif" child under nexus0 so the interface shows up in
 * the device tree; stores the netif in the child's ivars and attaches it.
 */
1384 vif_add_dev(struct xenbus_device *xdev)
1386 netif_t *netif = xdev->data;
1387 device_t nexus, ndev;
1393 /* We will add a vif device as a child of nexus0 (for now) */
1394 if (!(dc = devclass_find("nexus")) ||
1395 !(nexus = devclass_get_device(dc, 0))) {
1396 WPRINTF("could not find nexus0!\n");
1402 /* Create a newbus device representing the vif */
1403 ndev = BUS_ADD_CHILD(nexus, 0, "vif", netif->ifp->if_dunit);
1405 WPRINTF("could not create newbus device %s!\n", IFNAME(netif));
1411 device_set_ivars(ndev, netif);
1414 device_probe_and_attach(ndev);
/*
 * Format one of the shared rings' indices into a freshly malloc'd 256-byte
 * string for the sysctl handler below.  Caller frees the buffer.
 */
1431 vif_sysctl_ring_info(netif_t *netif, int cmd)
1433 char *buf = malloc(256, M_DEVBUF, M_WAITOK);
1435 if (!netif->rings_connected)
1436 sprintf(buf, "rings not connected\n");
1437 else if (cmd == VIF_SYSCTL_TXRING) {
1438 netif_tx_back_ring_t *tx = &netif->tx;
1439 sprintf(buf, "nr_ents=%x req_cons=%x"
1440 " req_prod=%x req_event=%x"
1441 " rsp_prod=%x rsp_event=%x",
1442 tx->nr_ents, tx->req_cons,
1443 tx->sring->req_prod, tx->sring->req_event,
1444 tx->sring->rsp_prod, tx->sring->rsp_event);
1446 netif_rx_back_ring_t *rx = &netif->rx;
1447 sprintf(buf, "nr_ents=%x req_cons=%x"
1448 " req_prod=%x req_event=%x"
1449 " rsp_prod=%x rsp_event=%x",
1450 rx->nr_ents, rx->req_cons,
1451 rx->sring->req_prod, rx->sring->req_event,
1452 rx->sring->rsp_prod, rx->sring->rsp_event);
/* Dispatch sysctl reads for the per-vif nodes registered in vif_attach:
 * domid/handle as ints, tx/rx ring state as formatted strings. */
1459 vif_sysctl_handler(SYSCTL_HANDLER_ARGS)
1461 device_t dev = (device_t)arg1;
1462 netif_t *netif = (netif_t *)device_get_ivars(dev);
1468 case VIF_SYSCTL_DOMID:
1469 return sysctl_handle_int(oidp, NULL, netif->domid, req);
1470 case VIF_SYSCTL_HANDLE:
1471 return sysctl_handle_int(oidp, NULL, netif->handle, req);
1472 case VIF_SYSCTL_TXRING:
1473 case VIF_SYSCTL_RXRING:
1474 value = buf = vif_sysctl_ring_info(netif, arg2);
1480 err = SYSCTL_OUT(req, value, strlen(value));
1482 free(buf, M_DEVBUF);
1487 /* Newbus vif device driver probe */
1489 vif_probe(device_t dev)
1491 DDPRINTF("vif%d\n", device_get_unit(dev));
1495 /* Newbus vif device driver attach */
/*
 * Attach the vif: register per-device sysctl nodes, do the Ethernet
 * attach with an all-0xff placeholder MAC, and optionally enslave the
 * interface to the bridge named in xenstore.
 */
1497 vif_attach(device_t dev)
1499 netif_t *netif = (netif_t *)device_get_ivars(dev);
1500 uint8_t mac[ETHER_ADDR_LEN];
1502 DDPRINTF("%s\n", IFNAME(netif));
1504 SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
1505 OID_AUTO, "domid", CTLTYPE_INT|CTLFLAG_RD,
1506 dev, VIF_SYSCTL_DOMID, vif_sysctl_handler, "I",
1507 "domid of frontend");
1508 SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
1509 OID_AUTO, "handle", CTLTYPE_INT|CTLFLAG_RD,
1510 dev, VIF_SYSCTL_HANDLE, vif_sysctl_handler, "I",
1511 "handle of frontend");
1512 #ifdef XEN_NETBACK_DEBUG
1513 SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
1514 OID_AUTO, "txring", CTLFLAG_RD,
1515 dev, VIF_SYSCTL_TXRING, vif_sysctl_handler, "A",
1517 SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
1518 OID_AUTO, "rxring", CTLFLAG_RD,
1519 dev, VIF_SYSCTL_RXRING, vif_sysctl_handler, "A",
/* Backend vifs use a broadcast-style dummy MAC; the real guest MAC lives
 * on the frontend side. */
1523 memset(mac, 0xff, sizeof(mac));
1526 ether_ifattach(netif->ifp, mac);
1527 netif->attached = 1;
1531 if (netif->bridge) {
1532 DPRINTF("Adding %s to bridge %s\n", IFNAME(netif), netif->bridge);
1533 int err = add_to_bridge(netif->ifp, netif->bridge);
/* Bridge enslavement failure is logged but non-fatal. */
1535 WPRINTF("Error adding %s to %s; err=%d\n",
1536 IFNAME(netif), netif->bridge, err);
1540 return bus_generic_attach(dev);
1543 /* Newbus vif device driver detach */
/* Detach: mark the interface down, detach from Ethernet and the bus. */
1545 vif_detach(device_t dev)
1547 netif_t *netif = (netif_t *)device_get_ivars(dev);
1548 struct ifnet *ifp = netif->ifp;
1550 DDPRINTF("%s\n", IFNAME(netif));
1552 /* Tell the stack that the interface is no longer active */
1553 ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
1555 ether_ifdetach(ifp);
1557 bus_generic_detach(dev);
1559 netif->attached = 0;
/* Newbus glue: method table, devclass, and driver registration for the
 * "vif" pseudo-device attached under nexus by vif_add_dev(). */
1566 static device_method_t vif_methods[] = {
1567 /* Device interface */
1568 DEVMETHOD(device_probe, vif_probe),
1569 DEVMETHOD(device_attach, vif_attach),
1570 DEVMETHOD(device_detach, vif_detach),
1571 DEVMETHOD(device_shutdown, bus_generic_shutdown),
1572 DEVMETHOD(device_suspend, bus_generic_suspend),
1573 DEVMETHOD(device_resume, bus_generic_resume),
1577 static devclass_t vif_devclass;
1579 static driver_t vif_driver = {
1585 DRIVER_MODULE(vif, nexus, vif_driver, vif_devclass, 0, 0);
1591 * c-set-style: "BSD"
1594 * indent-tabs-mode: t