2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 * This file implements multiple network backends (tap, netmap, ...),
32 * to be used by network frontends such as virtio-net and e1000.
33 * The API to access the backend (e.g. send/receive packets, negotiate
34 * features) is exported by net_backends.h.
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
40 #include <sys/types.h> /* u_short etc */
41 #ifndef WITHOUT_CAPSICUM
42 #include <sys/capsicum.h>
44 #include <sys/ioctl.h>
49 #include <net/netmap.h>
50 #include <net/netmap_virt.h>
51 #define NETMAP_WITH_LIBS
52 #include <net/netmap_user.h>
54 #ifndef WITHOUT_CAPSICUM
55 #include <capsicum_helpers.h>
68 #include <pthread_np.h>
76 #include "net_backends.h"
78 #include <sys/linker_set.h>
81 * Each network backend registers a set of function pointers that are
82 * used to implement the net backends API.
83 * This might need to be exposed if we implement backends in separate files.
86 const char *prefix; /* prefix matching this backend */
89 * Routines used to initialize and cleanup the resources needed
90 * by a backend. The cleanup function is used internally,
91 * and should not be called by the frontend.
93 int (*init)(struct net_backend *be, const char *devname,
94 net_be_rxeof_t cb, void *param);
95 void (*cleanup)(struct net_backend *be);
98 * Called to serve a guest transmit request. The scatter-gather
99 * vector provided by the caller has 'iovcnt' elements and contains
100 * the packet to send.
102 ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
106 * Get the length of the next packet that can be received from
107 * the backend. If no packets are currently available, this
108 * function returns 0.
110 ssize_t (*peek_recvlen)(struct net_backend *be);
113 * Called to receive a packet from the backend. When the function
114 * returns a positive value 'len', the scatter-gather vector
115 * provided by the caller contains a packet with such length.
116 * The function returns 0 if the backend doesn't have a new packet to
119 ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
123 * Ask the backend to enable or disable receive operation in the
124 * backend. On return from a disable operation, it is guaranteed
125 * that the receive callback won't be called until receive is
126 * enabled again. Note however that it is up to the caller to make
127 * sure that netbe_recv() is not currently being executed by another
130 void (*recv_enable)(struct net_backend *be);
131 void (*recv_disable)(struct net_backend *be);
134 * Ask the backend for the virtio-net features it is able to
135 * support. Possible features are TSO, UFO and checksum offloading
136 * in both rx and tx direction and for both IPv4 and IPv6.
138 uint64_t (*get_cap)(struct net_backend *be);
141 * Tell the backend to enable/disable the specified virtio-net
142 * features (capabilities).
144 int (*set_cap)(struct net_backend *be, uint64_t features,
145 unsigned int vnet_hdr_len);
147 struct pci_vtnet_softc *sc;
151 * Length of the virtio-net header used by the backend and the
152 * frontend, respectively. A zero value means that the header
155 unsigned int be_vnet_hdr_len;
156 unsigned int fe_vnet_hdr_len;
158 /* Size of backend-specific private data. */
161 /* Room for backend-specific data. */
165 SET_DECLARE(net_backend_set, struct net_backend);
167 #define VNET_HDR_LEN sizeof(struct virtio_net_rxhdr)
169 #define WPRINTF(params) PRINTLN params
/* Per-instance state for the tap (and vmnet) backend. */
struct tap_priv {
	struct mevent *mevp;	/* read event registered with the mevent loop */
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. In the future we may get the same information from
	 * the kevent data.
	 */
	char bbuf[1 << 16];
	ssize_t bbuflen;	/* length of the packet held in bbuf, 0 if empty */
};
187 tap_cleanup(struct net_backend *be)
189 struct tap_priv *priv = (struct tap_priv *)be->opaque;
192 mevent_delete(priv->mevp);
201 tap_init(struct net_backend *be, const char *devname,
202 net_be_rxeof_t cb, void *param)
204 struct tap_priv *priv = (struct tap_priv *)be->opaque;
207 #ifndef WITHOUT_CAPSICUM
212 WPRINTF(("TAP backend requires non-NULL callback"));
216 strcpy(tbuf, "/dev/");
217 strlcat(tbuf, devname, sizeof(tbuf));
219 be->fd = open(tbuf, O_RDWR);
221 WPRINTF(("open of tap device %s failed", tbuf));
226 * Set non-blocking and register for read
227 * notifications with the event loop
229 if (ioctl(be->fd, FIONBIO, &opt) < 0) {
230 WPRINTF(("tap device O_NONBLOCK failed"));
234 #ifndef WITHOUT_CAPSICUM
235 cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
236 if (caph_rights_limit(be->fd, &rights) == -1)
237 errx(EX_OSERR, "Unable to apply rights for sandbox");
240 memset(priv->bbuf, 0, sizeof(priv->bbuf));
243 priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
244 if (priv->mevp == NULL) {
245 WPRINTF(("Could not register event"));
257 * Called to send a buffer chain out to the tap device
260 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
262 return (writev(be->fd, iov, iovcnt));
266 tap_peek_recvlen(struct net_backend *be)
268 struct tap_priv *priv = (struct tap_priv *)be->opaque;
271 if (priv->bbuflen > 0) {
273 * We already have a packet in the bounce buffer.
274 * Just return its length.
276 return priv->bbuflen;
280 * Read the next packet (if any) into the bounce buffer, so
281 * that we get to know its length and we can return that
284 ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
285 if (ret < 0 && errno == EWOULDBLOCK) {
296 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
298 struct tap_priv *priv = (struct tap_priv *)be->opaque;
301 if (priv->bbuflen > 0) {
303 * A packet is available in the bounce buffer, so
304 * we read it from there.
306 ret = buf_to_iov(priv->bbuf, priv->bbuflen,
309 /* Mark the bounce buffer as empty. */
315 ret = readv(be->fd, iov, iovcnt);
316 if (ret < 0 && errno == EWOULDBLOCK) {
324 tap_recv_enable(struct net_backend *be)
326 struct tap_priv *priv = (struct tap_priv *)be->opaque;
328 mevent_enable(priv->mevp);
332 tap_recv_disable(struct net_backend *be)
334 struct tap_priv *priv = (struct tap_priv *)be->opaque;
336 mevent_disable(priv->mevp);
/* The tap backend offers no virtio-net offload features. */
static uint64_t
tap_get_cap(struct net_backend *be)
{

	return (0); /* no capabilities for now */
}
/*
 * The tap backend cannot enable any feature or use a virtio-net
 * header: succeed only when asked for nothing.
 */
static int
tap_set_cap(struct net_backend *be, uint64_t features,
	    unsigned vnet_hdr_len)
{

	return ((features || vnet_hdr_len) ? -1 : 0);
}
354 static struct net_backend tap_backend = {
356 .priv_size = sizeof(struct tap_priv),
358 .cleanup = tap_cleanup,
360 .peek_recvlen = tap_peek_recvlen,
362 .recv_enable = tap_recv_enable,
363 .recv_disable = tap_recv_disable,
364 .get_cap = tap_get_cap,
365 .set_cap = tap_set_cap,
368 /* A clone of the tap backend, with a different prefix. */
369 static struct net_backend vmnet_backend = {
371 .priv_size = sizeof(struct tap_priv),
373 .cleanup = tap_cleanup,
375 .peek_recvlen = tap_peek_recvlen,
377 .recv_enable = tap_recv_enable,
378 .recv_disable = tap_recv_disable,
379 .get_cap = tap_get_cap,
380 .set_cap = tap_set_cap,
383 DATA_SET(net_backend_set, tap_backend);
384 DATA_SET(net_backend_set, vmnet_backend);
390 /* The virtio-net features supported by netmap. */
391 #define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
392 VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
393 VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
394 VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
397 char ifname[IFNAMSIZ];
400 struct netmap_ring *rx;
401 struct netmap_ring *tx;
408 nmreq_init(struct nmreq *req, char *ifname)
411 memset(req, 0, sizeof(*req));
412 strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
413 req->nr_version = NETMAP_API;
417 netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
421 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
423 nmreq_init(&req, priv->ifname);
424 req.nr_cmd = NETMAP_BDG_VNET_HDR;
425 req.nr_arg1 = vnet_hdr_len;
426 err = ioctl(be->fd, NIOCREGIF, &req);
428 WPRINTF(("Unable to set vnet header length %d",
433 be->be_vnet_hdr_len = vnet_hdr_len;
439 netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
441 int prev_hdr_len = be->be_vnet_hdr_len;
444 if (vnet_hdr_len == prev_hdr_len) {
448 ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
453 netmap_set_vnet_hdr_len(be, prev_hdr_len);
459 netmap_get_cap(struct net_backend *be)
462 return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
463 NETMAP_FEATURES : 0);
/* Enable the requested features by programming the vnet header length. */
static int
netmap_set_cap(struct net_backend *be, uint64_t features,
	       unsigned vnet_hdr_len)
{

	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
}
475 netmap_init(struct net_backend *be, const char *devname,
476 net_be_rxeof_t cb, void *param)
478 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
480 strlcpy(priv->ifname, devname, sizeof(priv->ifname));
481 priv->ifname[sizeof(priv->ifname) - 1] = '\0';
483 priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
484 if (priv->nmd == NULL) {
485 WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
486 devname, strerror(errno)));
491 priv->memid = priv->nmd->req.nr_arg2;
492 priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
493 priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
495 priv->cb_param = param;
496 be->fd = priv->nmd->fd;
498 priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
499 if (priv->mevp == NULL) {
500 WPRINTF(("Could not register event"));
508 netmap_cleanup(struct net_backend *be)
510 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
513 mevent_delete(priv->mevp);
522 netmap_send(struct net_backend *be, const struct iovec *iov,
525 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
526 struct netmap_ring *ring;
536 if (head == ring->tail) {
537 WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
540 nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
541 nm_buf_size = ring->nr_buf_size;
544 for (j = 0; j < iovcnt; j++) {
545 int iov_frag_size = iov[j].iov_len;
546 void *iov_frag_buf = iov[j].iov_base;
548 totlen += iov_frag_size;
551 * Split each iovec fragment over more netmap slots, if
557 copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
558 memcpy(nm_buf, iov_frag_buf, copylen);
560 iov_frag_buf += copylen;
561 iov_frag_size -= copylen;
563 nm_buf_size -= copylen;
564 nm_buf_len += copylen;
566 if (iov_frag_size == 0) {
570 ring->slot[head].len = nm_buf_len;
571 ring->slot[head].flags = NS_MOREFRAG;
572 head = nm_ring_next(ring, head);
573 if (head == ring->tail) {
575 * We ran out of netmap slots while
576 * splitting the iovec fragments.
578 WPRINTF(("No space, drop %zu bytes",
579 count_iov(iov, iovcnt)));
582 nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
583 nm_buf_size = ring->nr_buf_size;
588 /* Complete the last slot, which must not have NS_MOREFRAG set. */
589 ring->slot[head].len = nm_buf_len;
590 ring->slot[head].flags = 0;
591 head = nm_ring_next(ring, head);
593 /* Now update ring->head and ring->cur. */
594 ring->head = ring->cur = head;
596 ioctl(be->fd, NIOCTXSYNC, NULL);
602 netmap_peek_recvlen(struct net_backend *be)
604 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
605 struct netmap_ring *ring = priv->rx;
606 uint32_t head = ring->head;
609 while (head != ring->tail) {
610 struct netmap_slot *slot = ring->slot + head;
613 if ((slot->flags & NS_MOREFRAG) == 0)
615 head = nm_ring_next(ring, head);
622 netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
624 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
625 struct netmap_slot *slot = NULL;
626 struct netmap_ring *ring;
636 iov_frag_buf = iov->iov_base;
637 iov_frag_size = iov->iov_len;
643 if (head == ring->tail) {
647 slot = ring->slot + head;
648 nm_buf = NETMAP_BUF(ring, slot->buf_idx);
649 nm_buf_len = slot->len;
652 int copylen = nm_buf_len < iov_frag_size ?
653 nm_buf_len : iov_frag_size;
655 memcpy(iov_frag_buf, nm_buf, copylen);
657 nm_buf_len -= copylen;
658 iov_frag_buf += copylen;
659 iov_frag_size -= copylen;
662 if (nm_buf_len == 0) {
669 /* No space to receive. */
670 WPRINTF(("Short iov, drop %zd bytes",
674 iov_frag_buf = iov->iov_base;
675 iov_frag_size = iov->iov_len;
678 head = nm_ring_next(ring, head);
680 } while (slot->flags & NS_MOREFRAG);
682 /* Release slots to netmap. */
683 ring->head = ring->cur = head;
689 netmap_recv_enable(struct net_backend *be)
691 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
693 mevent_enable(priv->mevp);
697 netmap_recv_disable(struct net_backend *be)
699 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
701 mevent_disable(priv->mevp);
704 static struct net_backend netmap_backend = {
706 .priv_size = sizeof(struct netmap_priv),
708 .cleanup = netmap_cleanup,
710 .peek_recvlen = netmap_peek_recvlen,
712 .recv_enable = netmap_recv_enable,
713 .recv_disable = netmap_recv_disable,
714 .get_cap = netmap_get_cap,
715 .set_cap = netmap_set_cap,
718 /* A clone of the netmap backend, with a different prefix. */
719 static struct net_backend vale_backend = {
721 .priv_size = sizeof(struct netmap_priv),
723 .cleanup = netmap_cleanup,
725 .peek_recvlen = netmap_peek_recvlen,
727 .recv_enable = netmap_recv_enable,
728 .recv_disable = netmap_recv_disable,
729 .get_cap = netmap_get_cap,
730 .set_cap = netmap_set_cap,
733 DATA_SET(net_backend_set, netmap_backend);
734 DATA_SET(net_backend_set, vale_backend);
737 * Initialize a backend and attach to the frontend.
738 * This is called during frontend initialization.
739 * @pbe is a pointer to the backend to be initialized
740 * @devname is the backend-name as supplied on the command line,
741 * e.g. -s 2:0,frontend-name,backend-name[,other-args]
742 * @cb is the receive callback supplied by the frontend,
743 * and it is invoked in the event loop when a receive
744 * event is generated in the hypervisor,
745 * @param is a pointer to the frontend, and normally used as
746 * the argument for the callback.
749 netbe_init(struct net_backend **ret, const char *devname, net_be_rxeof_t cb,
752 struct net_backend **pbe, *nbe, *tbe = NULL;
756 * Find the network backend that matches the user-provided
757 * device name. net_backend_set is built using a linker set.
759 SET_FOREACH(pbe, net_backend_set) {
760 if (strncmp(devname, (*pbe)->prefix,
761 strlen((*pbe)->prefix)) == 0) {
763 assert(tbe->init != NULL);
764 assert(tbe->cleanup != NULL);
765 assert(tbe->send != NULL);
766 assert(tbe->recv != NULL);
767 assert(tbe->get_cap != NULL);
768 assert(tbe->set_cap != NULL);
776 nbe = calloc(1, sizeof(*nbe) + tbe->priv_size);
777 *nbe = *tbe; /* copy the template */
780 nbe->be_vnet_hdr_len = 0;
781 nbe->fe_vnet_hdr_len = 0;
783 /* Initialize the backend. */
784 err = nbe->init(nbe, devname, cb, param);
796 netbe_cleanup(struct net_backend *be)
806 netbe_get_cap(struct net_backend *be)
810 return (be->get_cap(be));
814 netbe_set_cap(struct net_backend *be, uint64_t features,
815 unsigned vnet_hdr_len)
821 /* There are only three valid lengths, i.e., 0, 10 and 12. */
822 if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
823 && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
826 be->fe_vnet_hdr_len = vnet_hdr_len;
828 ret = be->set_cap(be, features, vnet_hdr_len);
829 assert(be->be_vnet_hdr_len == 0 ||
830 be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
836 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
839 return (be->send(be, iov, iovcnt));
843 netbe_peek_recvlen(struct net_backend *be)
846 return (be->peek_recvlen(be));
850 * Try to read a packet from the backend, without blocking.
851 * If no packets are available, return 0. In case of success, return
852 * the length of the packet just read. Return -1 in case of errors.
855 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
858 return (be->recv(be, iov, iovcnt));
/*
 * Read a packet from the backend and discard it.
 * Returns the size of the discarded packet or zero if no packet was available.
 * A negative error code is returned in case of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec iov;

	iov.iov_base = dummybuf;
	iov.iov_len = sizeof(dummybuf);

	return netbe_recv(be, &iov, 1);
}
884 netbe_rx_disable(struct net_backend *be)
887 return be->recv_disable(be);
891 netbe_rx_enable(struct net_backend *be)
894 return be->recv_enable(be);
898 netbe_get_vnet_hdr_len(struct net_backend *be)
901 return (be->be_vnet_hdr_len);