2 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
17 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
18 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
19 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
20 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
21 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
22 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
23 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 * This file implements multiple network backends (tap, netmap, ...),
30 * to be used by network frontends such as virtio-net and e1000.
31 * The API to access the backend (e.g. send/receive packets, negotiate
32 * features) is exported by net_backends.h.
#include <sys/types.h>		/* u_short etc */
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/cdefs.h>
#include <sys/uio.h>
#include <sys/ioctl.h>
#include <sys/linker_set.h>

#include <net/if.h>
#include <net/netmap.h>
#include <net/netmap_virt.h>
#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif

#include <assert.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <unistd.h>
#include <pthread.h>
#include <pthread_np.h>

#include "iov.h"
#include "mevent.h"
#include "net_backends.h"
76 * Each network backend registers a set of function pointers that are
77 * used to implement the net backends API.
78 * This might need to be exposed if we implement backends in separate files.
81 const char *prefix; /* prefix matching this backend */
84 * Routines used to initialize and cleanup the resources needed
85 * by a backend. The cleanup function is used internally,
86 * and should not be called by the frontend.
88 int (*init)(struct net_backend *be, const char *devname,
89 net_be_rxeof_t cb, void *param);
90 void (*cleanup)(struct net_backend *be);
93 * Called to serve a guest transmit request. The scatter-gather
94 * vector provided by the caller has 'iovcnt' elements and contains
97 ssize_t (*send)(struct net_backend *be, struct iovec *iov, int iovcnt);
100 * Called to receive a packet from the backend. When the function
101 * returns a positive value 'len', the scatter-gather vector
102 * provided by the caller contains a packet with such length.
103 * The function returns 0 if the backend doesn't have a new packet to
106 ssize_t (*recv)(struct net_backend *be, struct iovec *iov, int iovcnt);
109 * Ask the backend for the virtio-net features it is able to
110 * support. Possible features are TSO, UFO and checksum offloading
111 * in both rx and tx direction and for both IPv4 and IPv6.
113 uint64_t (*get_cap)(struct net_backend *be);
116 * Tell the backend to enable/disable the specified virtio-net
117 * features (capabilities).
119 int (*set_cap)(struct net_backend *be, uint64_t features,
120 unsigned int vnet_hdr_len);
122 struct pci_vtnet_softc *sc;
126 * Length of the virtio-net header used by the backend and the
127 * frontend, respectively. A zero value means that the header
130 unsigned int be_vnet_hdr_len;
131 unsigned int fe_vnet_hdr_len;
133 /* Size of backend-specific private data. */
136 /* Room for backend-specific data. */
140 SET_DECLARE(net_backend_set, struct net_backend);
142 #define VNET_HDR_LEN sizeof(struct virtio_net_rxhdr)
144 #define WPRINTF(params) printf params
155 tap_cleanup(struct net_backend *be)
157 struct tap_priv *priv = (struct tap_priv *)be->opaque;
160 mevent_delete(priv->mevp);
169 tap_init(struct net_backend *be, const char *devname,
170 net_be_rxeof_t cb, void *param)
172 struct tap_priv *priv = (struct tap_priv *)be->opaque;
176 #ifndef WITHOUT_CAPSICUM
181 WPRINTF(("TAP backend requires non-NULL callback\n"));
185 strcpy(tbuf, "/dev/");
186 strlcat(tbuf, devname, sizeof(tbuf));
188 fd = open(tbuf, O_RDWR);
190 WPRINTF(("open of tap device %s failed\n", tbuf));
195 * Set non-blocking and register for read
196 * notifications with the event loop
198 if (ioctl(fd, FIONBIO, &opt) < 0) {
199 WPRINTF(("tap device O_NONBLOCK failed\n"));
203 #ifndef WITHOUT_CAPSICUM
204 cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
205 if (caph_rights_limit(fd, &rights) == -1)
206 errx(EX_OSERR, "Unable to apply rights for sandbox");
209 priv->mevp = mevent_add(fd, EVF_READ, cb, param);
210 if (priv->mevp == NULL) {
211 WPRINTF(("Could not register event\n"));
225 * Called to send a buffer chain out to the tap device
228 tap_send(struct net_backend *be, struct iovec *iov, int iovcnt)
230 return (writev(be->fd, iov, iovcnt));
234 tap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
238 /* Should never be called without a valid tap fd */
239 assert(be->fd != -1);
241 ret = readv(be->fd, iov, iovcnt);
243 if (ret < 0 && errno == EWOULDBLOCK) {
251 tap_get_cap(struct net_backend *be)
254 return (0); /* no capabilities for now */
258 tap_set_cap(struct net_backend *be, uint64_t features,
259 unsigned vnet_hdr_len)
262 return ((features || vnet_hdr_len) ? -1 : 0);
265 static struct net_backend tap_backend = {
267 .priv_size = sizeof(struct tap_priv),
269 .cleanup = tap_cleanup,
272 .get_cap = tap_get_cap,
273 .set_cap = tap_set_cap,
276 /* A clone of the tap backend, with a different prefix. */
277 static struct net_backend vmnet_backend = {
279 .priv_size = sizeof(struct tap_priv),
281 .cleanup = tap_cleanup,
284 .get_cap = tap_get_cap,
285 .set_cap = tap_set_cap,
288 DATA_SET(net_backend_set, tap_backend);
289 DATA_SET(net_backend_set, vmnet_backend);
295 /* The virtio-net features supported by netmap. */
296 #define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
297 VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
298 VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
299 VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
302 char ifname[IFNAMSIZ];
305 struct netmap_ring *rx;
306 struct netmap_ring *tx;
313 nmreq_init(struct nmreq *req, char *ifname)
316 memset(req, 0, sizeof(*req));
317 strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
318 req->nr_version = NETMAP_API;
322 netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
326 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
328 nmreq_init(&req, priv->ifname);
329 req.nr_cmd = NETMAP_BDG_VNET_HDR;
330 req.nr_arg1 = vnet_hdr_len;
331 err = ioctl(be->fd, NIOCREGIF, &req);
333 WPRINTF(("Unable to set vnet header length %d\n",
338 be->be_vnet_hdr_len = vnet_hdr_len;
344 netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
346 int prev_hdr_len = be->be_vnet_hdr_len;
349 if (vnet_hdr_len == prev_hdr_len) {
353 ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
358 netmap_set_vnet_hdr_len(be, prev_hdr_len);
364 netmap_get_cap(struct net_backend *be)
367 return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
368 NETMAP_FEATURES : 0);
372 netmap_set_cap(struct net_backend *be, uint64_t features,
373 unsigned vnet_hdr_len)
376 return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
380 netmap_init(struct net_backend *be, const char *devname,
381 net_be_rxeof_t cb, void *param)
383 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
385 strlcpy(priv->ifname, devname, sizeof(priv->ifname));
386 priv->ifname[sizeof(priv->ifname) - 1] = '\0';
388 priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
389 if (priv->nmd == NULL) {
390 WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)\n",
391 devname, strerror(errno)));
396 priv->memid = priv->nmd->req.nr_arg2;
397 priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
398 priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
400 priv->cb_param = param;
401 be->fd = priv->nmd->fd;
403 priv->mevp = mevent_add(be->fd, EVF_READ, cb, param);
404 if (priv->mevp == NULL) {
405 WPRINTF(("Could not register event\n"));
413 netmap_cleanup(struct net_backend *be)
415 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
418 mevent_delete(priv->mevp);
427 netmap_send(struct net_backend *be, struct iovec *iov,
430 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
431 struct netmap_ring *ring;
441 if (head == ring->tail) {
442 WPRINTF(("No space, drop %zu bytes\n", count_iov(iov, iovcnt)));
445 nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
446 nm_buf_size = ring->nr_buf_size;
449 for (j = 0; j < iovcnt; j++) {
450 int iov_frag_size = iov[j].iov_len;
451 void *iov_frag_buf = iov[j].iov_base;
453 totlen += iov_frag_size;
456 * Split each iovec fragment over more netmap slots, if
462 copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
463 memcpy(nm_buf, iov_frag_buf, copylen);
465 iov_frag_buf += copylen;
466 iov_frag_size -= copylen;
468 nm_buf_size -= copylen;
469 nm_buf_len += copylen;
471 if (iov_frag_size == 0) {
475 ring->slot[head].len = nm_buf_len;
476 ring->slot[head].flags = NS_MOREFRAG;
477 head = nm_ring_next(ring, head);
478 if (head == ring->tail) {
480 * We ran out of netmap slots while
481 * splitting the iovec fragments.
483 WPRINTF(("No space, drop %zu bytes\n",
484 count_iov(iov, iovcnt)));
487 nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
488 nm_buf_size = ring->nr_buf_size;
493 /* Complete the last slot, which must not have NS_MOREFRAG set. */
494 ring->slot[head].len = nm_buf_len;
495 ring->slot[head].flags = 0;
496 head = nm_ring_next(ring, head);
498 /* Now update ring->head and ring->cur. */
499 ring->head = ring->cur = head;
501 ioctl(be->fd, NIOCTXSYNC, NULL);
507 netmap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
509 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
510 struct netmap_slot *slot = NULL;
511 struct netmap_ring *ring;
521 iov_frag_buf = iov->iov_base;
522 iov_frag_size = iov->iov_len;
528 if (head == ring->tail) {
532 slot = ring->slot + head;
533 nm_buf = NETMAP_BUF(ring, slot->buf_idx);
534 nm_buf_len = slot->len;
537 int copylen = nm_buf_len < iov_frag_size ?
538 nm_buf_len : iov_frag_size;
540 memcpy(iov_frag_buf, nm_buf, copylen);
542 nm_buf_len -= copylen;
543 iov_frag_buf += copylen;
544 iov_frag_size -= copylen;
547 if (nm_buf_len == 0) {
554 /* No space to receive. */
555 WPRINTF(("Short iov, drop %zd bytes\n",
559 iov_frag_buf = iov->iov_base;
560 iov_frag_size = iov->iov_len;
563 head = nm_ring_next(ring, head);
565 } while (slot->flags & NS_MOREFRAG);
567 /* Release slots to netmap. */
568 ring->head = ring->cur = head;
573 static struct net_backend netmap_backend = {
575 .priv_size = sizeof(struct netmap_priv),
577 .cleanup = netmap_cleanup,
580 .get_cap = netmap_get_cap,
581 .set_cap = netmap_set_cap,
584 /* A clone of the netmap backend, with a different prefix. */
585 static struct net_backend vale_backend = {
587 .priv_size = sizeof(struct netmap_priv),
589 .cleanup = netmap_cleanup,
592 .get_cap = netmap_get_cap,
593 .set_cap = netmap_set_cap,
596 DATA_SET(net_backend_set, netmap_backend);
597 DATA_SET(net_backend_set, vale_backend);
600 * Initialize a backend and attach to the frontend.
601 * This is called during frontend initialization.
602 * @pbe is a pointer to the backend to be initialized
603 * @devname is the backend-name as supplied on the command line,
604 * e.g. -s 2:0,frontend-name,backend-name[,other-args]
605 * @cb is the receive callback supplied by the frontend,
606 * and it is invoked in the event loop when a receive
607 * event is generated in the hypervisor,
608 * @param is a pointer to the frontend, and normally used as
609 * the argument for the callback.
612 netbe_init(struct net_backend **ret, const char *devname, net_be_rxeof_t cb,
615 struct net_backend **pbe, *nbe, *tbe = NULL;
619 * Find the network backend that matches the user-provided
620 * device name. net_backend_set is built using a linker set.
622 SET_FOREACH(pbe, net_backend_set) {
623 if (strncmp(devname, (*pbe)->prefix,
624 strlen((*pbe)->prefix)) == 0) {
626 assert(tbe->init != NULL);
627 assert(tbe->cleanup != NULL);
628 assert(tbe->send != NULL);
629 assert(tbe->recv != NULL);
630 assert(tbe->get_cap != NULL);
631 assert(tbe->set_cap != NULL);
639 nbe = calloc(1, sizeof(*nbe) + tbe->priv_size);
640 *nbe = *tbe; /* copy the template */
643 nbe->be_vnet_hdr_len = 0;
644 nbe->fe_vnet_hdr_len = 0;
646 /* Initialize the backend. */
647 err = nbe->init(nbe, devname, cb, param);
659 netbe_cleanup(struct net_backend *be)
669 netbe_get_cap(struct net_backend *be)
673 return (be->get_cap(be));
677 netbe_set_cap(struct net_backend *be, uint64_t features,
678 unsigned vnet_hdr_len)
684 /* There are only three valid lengths, i.e., 0, 10 and 12. */
685 if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
686 && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
689 be->fe_vnet_hdr_len = vnet_hdr_len;
691 ret = be->set_cap(be, features, vnet_hdr_len);
692 assert(be->be_vnet_hdr_len == 0 ||
693 be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
698 static __inline struct iovec *
699 iov_trim(struct iovec *iov, int *iovcnt, unsigned int tlen)
703 /* XXX short-cut: assume first segment is >= tlen */
704 assert(iov[0].iov_len >= tlen);
706 iov[0].iov_len -= tlen;
707 if (iov[0].iov_len == 0) {
712 iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
720 netbe_send(struct net_backend *be, struct iovec *iov, int iovcnt)
724 if (be->be_vnet_hdr_len != be->fe_vnet_hdr_len) {
726 * The frontend uses a virtio-net header, but the backend
727 * does not. We ignore it (as it must be all zeroes) and
730 assert(be->be_vnet_hdr_len == 0);
731 iov = iov_trim(iov, &iovcnt, be->fe_vnet_hdr_len);
734 return (be->send(be, iov, iovcnt));
738 * Try to read a packet from the backend, without blocking.
739 * If no packets are available, return 0. In case of success, return
740 * the length of the packet just read. Return -1 in case of errors.
743 netbe_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
745 /* Length of prepended virtio-net header. */
746 unsigned int hlen = be->fe_vnet_hdr_len;
751 if (hlen && hlen != be->be_vnet_hdr_len) {
753 * The frontend uses a virtio-net header, but the backend
754 * does not. We need to prepend a zeroed header.
756 struct virtio_net_rxhdr *vh;
758 assert(be->be_vnet_hdr_len == 0);
761 * Get a pointer to the rx header, and use the
762 * data immediately following it for the packet buffer.
764 vh = iov[0].iov_base;
765 iov = iov_trim(iov, &iovcnt, hlen);
768 * The only valid field in the rx packet header is the
769 * number of buffers if merged rx bufs were negotiated.
772 if (hlen == VNET_HDR_LEN) {
777 ret = be->recv(be, iov, iovcnt);
786 * Read a packet from the backend and discard it.
787 * Returns the size of the discarded packet or zero if no packet was available.
788 * A negative error code is returned in case of read error.
791 netbe_rx_discard(struct net_backend *be)
794 * MP note: the dummybuf is only used to discard frames,
795 * so there is no need for it to be per-vtnet or locked.
796 * We only make it large enough for TSO-sized segment.
798 static uint8_t dummybuf[65536 + 64];
801 iov.iov_base = dummybuf;
802 iov.iov_len = sizeof(dummybuf);
804 return netbe_recv(be, &iov, 1);