2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 * This file implements multiple network backends (tap, netmap, ...),
32 * to be used by network frontends such as virtio-net and e1000.
33 * The API to access the backend (e.g. send/receive packets, negotiate
34 * features) is exported by net_backends.h.
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
40 #include <sys/types.h> /* u_short etc */
41 #ifndef WITHOUT_CAPSICUM
42 #include <sys/capsicum.h>
44 #include <sys/ioctl.h>
49 #include <net/netmap.h>
50 #include <net/netmap_virt.h>
51 #define NETMAP_WITH_LIBS
52 #include <net/netmap_user.h>
54 #ifndef WITHOUT_CAPSICUM
55 #include <capsicum_helpers.h>
68 #include <pthread_np.h>
75 #include "net_backends.h"
77 #include <sys/linker_set.h>
80 * Each network backend registers a set of function pointers that are
81 * used to implement the net backends API.
82 * This might need to be exposed if we implement backends in separate files.
/*
 * Descriptor of one network backend: the device-name prefix it matches,
 * the function pointers that implement the backend API, and the
 * virtio-net header lengths in use on the backend and frontend sides.
 */
85 const char *prefix; /* prefix matching this backend */
88 * Routines used to initialize and cleanup the resources needed
89 * by a backend. The cleanup function is used internally,
90 * and should not be called by the frontend.
92 int (*init)(struct net_backend *be, const char *devname,
93 net_be_rxeof_t cb, void *param);
94 void (*cleanup)(struct net_backend *be);
97 * Called to serve a guest transmit request. The scatter-gather
98 * vector provided by the caller has 'iovcnt' elements and contains
101 ssize_t (*send)(struct net_backend *be, struct iovec *iov, int iovcnt);
104 * Called to receive a packet from the backend. When the function
105 * returns a positive value 'len', the scatter-gather vector
106 * provided by the caller contains a packet with such length.
107 * The function returns 0 if the backend doesn't have a new packet to
110 ssize_t (*recv)(struct net_backend *be, struct iovec *iov, int iovcnt);
113 * Ask the backend for the virtio-net features it is able to
114 * support. Possible features are TSO, UFO and checksum offloading
115 * in both rx and tx direction and for both IPv4 and IPv6.
117 uint64_t (*get_cap)(struct net_backend *be);
120 * Tell the backend to enable/disable the specified virtio-net
121 * features (capabilities).
123 int (*set_cap)(struct net_backend *be, uint64_t features,
124 unsigned int vnet_hdr_len);
/*
 * NOTE(review): presumably the virtio-net frontend state that owns this
 * backend -- confirm against the code that assigns it.
 */
126 struct pci_vtnet_softc *sc;
130 * Length of the virtio-net header used by the backend and the
131 * frontend, respectively. A zero value means that the header
/* be_ is the backend-side length, fe_ the frontend-side length. */
134 unsigned int be_vnet_hdr_len;
135 unsigned int fe_vnet_hdr_len;
137 /* Size of backend-specific private data. */
140 /* Room for backend-specific data. */
/* Linker set holding every registered backend; netbe_init() walks it. */
144 SET_DECLARE(net_backend_set, struct net_backend);
/* Length of the virtio-net header, taken from struct virtio_net_rxhdr. */
146 #define VNET_HDR_LEN sizeof(struct virtio_net_rxhdr)
/*
 * printf wrapper: the caller passes the whole printf argument list inside
 * an extra pair of parentheses, e.g. WPRINTF(("fmt %d\n", x)).
 */
148 #define WPRINTF(params) printf params
/*
 * Release the resources of a tap backend. The private state lives in
 * be->opaque; the event registered with mevent_add() is removed here.
 */
159 tap_cleanup(struct net_backend *be)
161 struct tap_priv *priv = (struct tap_priv *)be->opaque;
164 mevent_delete(priv->mevp);
/*
 * Set up a tap backend: open the device named by 'devname' under /dev,
 * issue the FIONBIO ioctl on it and register 'cb' (with argument 'param')
 * for read events. Each checked failure prints a diagnostic via WPRINTF.
 */
173 tap_init(struct net_backend *be, const char *devname,
174 net_be_rxeof_t cb, void *param)
176 struct tap_priv *priv = (struct tap_priv *)be->opaque;
180 #ifndef WITHOUT_CAPSICUM
185 WPRINTF(("TAP backend requires non-NULL callback\n"));
/* Build the device path: "/dev/" followed by the user-supplied name. */
189 strcpy(tbuf, "/dev/");
190 strlcat(tbuf, devname, sizeof(tbuf));
192 fd = open(tbuf, O_RDWR);
194 WPRINTF(("open of tap device %s failed\n", tbuf));
199 * Set non-blocking and register for read
200 * notifications with the event loop
202 if (ioctl(fd, FIONBIO, &opt) < 0) {
203 WPRINTF(("tap device O_NONBLOCK failed\n"));
207 #ifndef WITHOUT_CAPSICUM
/*
 * Limit the descriptor to event, read and write rights; failing to
 * apply the limit terminates the process through errx().
 */
208 cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
209 if (caph_rights_limit(fd, &rights) == -1)
210 errx(EX_OSERR, "Unable to apply rights for sandbox");
/* Have the event loop invoke cb(param) when the tap fd becomes readable. */
213 priv->mevp = mevent_add(fd, EVF_READ, cb, param);
214 if (priv->mevp == NULL) {
215 WPRINTF(("Could not register event\n"));
229 * Called to send a buffer chain out to the tap device
/*
 * Transmit the scatter-gather vector with a single writev() on the
 * backend fd and hand writev()'s result straight back to the caller.
 */
232 tap_send(struct net_backend *be, struct iovec *iov, int iovcnt)
234 return (writev(be->fd, iov, iovcnt));
/*
 * Read one packet from the tap fd into the caller's scatter-gather
 * vector using readv().
 */
238 tap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
242 /* Should never be called without a valid tap fd */
243 assert(be->fd != -1);
245 ret = readv(be->fd, iov, iovcnt);
/* A would-block error is singled out from other readv() failures. */
247 if (ret < 0 && errno == EWOULDBLOCK) {
/* Report the features supported by the tap backend: always 0 (none). */
255 tap_get_cap(struct net_backend *be)
258 return (0); /* no capabilities for now */
/*
 * The tap backend supports no features and no virtio-net header, so the
 * request succeeds (0) only when both 'features' and 'vnet_hdr_len' are
 * zero; anything else yields -1.
 */
262 tap_set_cap(struct net_backend *be, uint64_t features,
263 unsigned vnet_hdr_len)
266 return ((features || vnet_hdr_len) ? -1 : 0);
/*
 * Backend template for tap devices: private data is a struct tap_priv and
 * the callbacks are the tap_* routines.
 */
269 static struct net_backend tap_backend = {
271 .priv_size = sizeof(struct tap_priv),
273 .cleanup = tap_cleanup,
276 .get_cap = tap_get_cap,
277 .set_cap = tap_set_cap,
280 /* A clone of the tap backend, with a different prefix. */
/* Same private-data size and tap_* callbacks as tap_backend. */
281 static struct net_backend vmnet_backend = {
283 .priv_size = sizeof(struct tap_priv),
285 .cleanup = tap_cleanup,
288 .get_cap = tap_get_cap,
289 .set_cap = tap_set_cap,
/* Add both tap-style templates to the net_backend_set linker set. */
292 DATA_SET(net_backend_set, tap_backend);
293 DATA_SET(net_backend_set, vmnet_backend);
299 /* The virtio-net features supported by netmap. */
/*
 * Bitwise OR of the checksum, TSO4, TSO6 and UFO feature bits, in both
 * their HOST_* (CSUM for checksum) and GUEST_* variants.
 */
300 #define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
301 VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
302 VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
303 VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
/* Interface name; netmap_init() fills it from the device name. */
306 char ifname[IFNAMSIZ];
/* RX/TX rings; netmap_init() points them at ring 0 of the nm_open() port. */
309 struct netmap_ring *rx;
310 struct netmap_ring *tx;
/*
 * Prepare a netmap request: zero the whole structure, copy 'ifname' into
 * nr_name (bounded by the size of that field) and stamp it with the
 * NETMAP_API version.
 */
317 nmreq_init(struct nmreq *req, char *ifname)
320 memset(req, 0, sizeof(*req));
321 strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
322 req->nr_version = NETMAP_API;
/*
 * Ask netmap to use a virtio-net header of 'vnet_hdr_len' bytes on the
 * interface named in the private state. The request is a
 * NETMAP_BDG_VNET_HDR command carried by a NIOCREGIF ioctl; after the
 * ioctl error check the new length is recorded in be->be_vnet_hdr_len.
 */
326 netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
330 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
332 nmreq_init(&req, priv->ifname);
333 req.nr_cmd = NETMAP_BDG_VNET_HDR;
/* The requested header length travels in nr_arg1. */
334 req.nr_arg1 = vnet_hdr_len;
335 err = ioctl(be->fd, NIOCREGIF, &req);
337 WPRINTF(("Unable to set vnet header length %d\n",
342 be->be_vnet_hdr_len = vnet_hdr_len;
/*
 * Probe whether the backend accepts a virtio-net header of
 * 'vnet_hdr_len' bytes by trying to set it, then put the previously
 * configured length back.
 */
348 netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
350 int prev_hdr_len = be->be_vnet_hdr_len;
/*
 * The requested length is already in effect.
 * NOTE(review): this compares an unsigned value with an int.
 */
353 if (vnet_hdr_len == prev_hdr_len) {
357 ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
/* Restore the length that was configured before the probe. */
362 netmap_set_vnet_hdr_len(be, prev_hdr_len);
/*
 * Advertise NETMAP_FEATURES when the backend can be configured with a
 * VNET_HDR_LEN-byte virtio-net header, and no features (0) otherwise.
 */
368 netmap_get_cap(struct net_backend *be)
371 return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
372 NETMAP_FEATURES : 0);
/*
 * Apply the negotiated header length by forwarding 'vnet_hdr_len' to
 * netmap_set_vnet_hdr_len() and returning its result. 'features' is not
 * used by the return statement.
 */
376 netmap_set_cap(struct net_backend *be, uint64_t features,
377 unsigned vnet_hdr_len)
380 return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
/*
 * Set up a netmap backend: remember the interface name, open the port
 * with nm_open(), cache ring 0 in each direction, and register 'cb'
 * (with argument 'param') for read events on the netmap fd.
 */
384 netmap_init(struct net_backend *be, const char *devname,
385 net_be_rxeof_t cb, void *param)
387 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
/*
 * NOTE(review): strlcpy() already NUL-terminates the destination when the
 * size is non-zero, so the explicit terminator below is redundant.
 */
389 strlcpy(priv->ifname, devname, sizeof(priv->ifname));
390 priv->ifname[sizeof(priv->ifname) - 1] = '\0';
392 priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
393 if (priv->nmd == NULL) {
394 WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)\n",
395 devname, strerror(errno)));
/* Cache the nr_arg2 value of the open request and ring 0 of each direction. */
400 priv->memid = priv->nmd->req.nr_arg2;
401 priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
402 priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
404 priv->cb_param = param;
/* The backend fd is the descriptor owned by the nm_open() handle. */
405 be->fd = priv->nmd->fd;
407 priv->mevp = mevent_add(be->fd, EVF_READ, cb, param);
408 if (priv->mevp == NULL) {
409 WPRINTF(("Could not register event\n"));
/*
 * Release the resources of a netmap backend. The private state lives in
 * be->opaque; the event registered with mevent_add() is removed here.
 */
417 netmap_cleanup(struct net_backend *be)
419 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
422 mevent_delete(priv->mevp);
/*
 * Transmit a packet through netmap by copying the scatter-gather vector
 * into one or more TX ring slots. Every slot except the last is flagged
 * NS_MOREFRAG. The slots are then published by advancing ring->head and
 * ring->cur, and a NIOCTXSYNC ioctl is issued. When the ring has no free
 * slot a "No space" diagnostic is printed.
 */
431 netmap_send(struct net_backend *be, struct iovec *iov,
434 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
435 struct netmap_ring *ring;
/* head catching up with tail means there is no free slot. */
445 if (head == ring->tail) {
446 WPRINTF(("No space, drop %zu bytes\n", count_iov(iov, iovcnt)));
449 nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
450 nm_buf_size = ring->nr_buf_size;
453 for (j = 0; j < iovcnt; j++) {
454 int iov_frag_size = iov[j].iov_len;
455 void *iov_frag_buf = iov[j].iov_base;
457 totlen += iov_frag_size;
460 * Split each iovec fragment over more netmap slots, if
/* Copy as much as fits in both the fragment and the current slot. */
466 copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
467 memcpy(nm_buf, iov_frag_buf, copylen);
/*
 * NOTE(review): arithmetic on a 'void *' is a compiler extension, not
 * standard C.
 */
469 iov_frag_buf += copylen;
470 iov_frag_size -= copylen;
472 nm_buf_size -= copylen;
473 nm_buf_len += copylen;
475 if (iov_frag_size == 0) {
/* Close the current slot as a non-final fragment and move to the next. */
479 ring->slot[head].len = nm_buf_len;
480 ring->slot[head].flags = NS_MOREFRAG;
481 head = nm_ring_next(ring, head);
482 if (head == ring->tail) {
484 * We ran out of netmap slots while
485 * splitting the iovec fragments.
487 WPRINTF(("No space, drop %zu bytes\n",
488 count_iov(iov, iovcnt)));
491 nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
492 nm_buf_size = ring->nr_buf_size;
497 /* Complete the last slot, which must not have NS_MOREFRAG set. */
498 ring->slot[head].len = nm_buf_len;
499 ring->slot[head].flags = 0;
500 head = nm_ring_next(ring, head);
502 /* Now update ring->head and ring->cur. */
503 ring->head = ring->cur = head;
/* NOTE(review): the result of this ioctl is not checked. */
505 ioctl(be->fd, NIOCTXSYNC, NULL);
/*
 * Receive one packet from netmap by copying the contents of consecutive
 * ring slots into the caller's scatter-gather vector. Slots are consumed
 * for as long as the current one carries NS_MOREFRAG, then handed back
 * to netmap by advancing ring->head and ring->cur.
 */
511 netmap_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
513 struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
514 struct netmap_slot *slot = NULL;
515 struct netmap_ring *ring;
/* Start filling at the first element of the caller's vector. */
525 iov_frag_buf = iov->iov_base;
526 iov_frag_size = iov->iov_len;
/* head catching up with tail means no further slot is available. */
532 if (head == ring->tail) {
536 slot = ring->slot + head;
537 nm_buf = NETMAP_BUF(ring, slot->buf_idx);
538 nm_buf_len = slot->len;
/* Copy as much as fits in both the slot remainder and the iovec element. */
541 int copylen = nm_buf_len < iov_frag_size ?
542 nm_buf_len : iov_frag_size;
544 memcpy(iov_frag_buf, nm_buf, copylen);
546 nm_buf_len -= copylen;
547 iov_frag_buf += copylen;
548 iov_frag_size -= copylen;
551 if (nm_buf_len == 0) {
558 /* No space to receive. */
559 WPRINTF(("Short iov, drop %zd bytes\n",
563 iov_frag_buf = iov->iov_base;
564 iov_frag_size = iov->iov_len;
567 head = nm_ring_next(ring, head);
569 } while (slot->flags & NS_MOREFRAG);
571 /* Release slots to netmap. */
572 ring->head = ring->cur = head;
/*
 * Backend template for netmap ports: private data is a struct netmap_priv
 * and the callbacks are the netmap_* routines.
 */
577 static struct net_backend netmap_backend = {
579 .priv_size = sizeof(struct netmap_priv),
581 .cleanup = netmap_cleanup,
584 .get_cap = netmap_get_cap,
585 .set_cap = netmap_set_cap,
588 /* A clone of the netmap backend, with a different prefix. */
/* Same private-data size and netmap_* callbacks as netmap_backend. */
589 static struct net_backend vale_backend = {
591 .priv_size = sizeof(struct netmap_priv),
593 .cleanup = netmap_cleanup,
596 .get_cap = netmap_get_cap,
597 .set_cap = netmap_set_cap,
/* Add both netmap-style templates to the net_backend_set linker set. */
600 DATA_SET(net_backend_set, netmap_backend);
601 DATA_SET(net_backend_set, vale_backend);
604 * Initialize a backend and attach to the frontend.
605 * This is called during frontend initialization.
606 * @ret is a pointer to the backend to be initialized
607 * @devname is the backend-name as supplied on the command line,
608 * e.g. -s 2:0,frontend-name,backend-name[,other-args]
609 * @cb is the receive callback supplied by the frontend,
610 * and it is invoked in the event loop when a receive
611 * event is generated in the hypervisor,
612 * @param is a pointer to the frontend, and normally used as
613 * the argument for the callback.
/*
 * Pick the backend template whose prefix matches the start of 'devname',
 * allocate a private copy of it with room for the backend's private data,
 * and run the backend's init routine on that copy.
 */
616 netbe_init(struct net_backend **ret, const char *devname, net_be_rxeof_t cb,
619 struct net_backend **pbe, *nbe, *tbe = NULL;
623 * Find the network backend that matches the user-provided
624 * device name. net_backend_set is built using a linker set.
626 SET_FOREACH(pbe, net_backend_set) {
/* Only the first strlen(prefix) characters of devname are compared. */
627 if (strncmp(devname, (*pbe)->prefix,
628 strlen((*pbe)->prefix)) == 0) {
/* A usable template must provide every callback. */
630 assert(tbe->init != NULL);
631 assert(tbe->cleanup != NULL);
632 assert(tbe->send != NULL);
633 assert(tbe->recv != NULL);
634 assert(tbe->get_cap != NULL);
635 assert(tbe->set_cap != NULL);
/*
 * One zeroed allocation holds the descriptor followed by priv_size bytes
 * of private data.
 * NOTE(review): the calloc() result is dereferenced by the copy below
 * without a NULL check.
 */
643 nbe = calloc(1, sizeof(*nbe) + tbe->priv_size);
644 *nbe = *tbe; /* copy the template */
/* No virtio-net header on either side until capabilities are set. */
647 nbe->be_vnet_hdr_len = 0;
648 nbe->fe_vnet_hdr_len = 0;
650 /* Initialize the backend. */
651 err = nbe->init(nbe, devname, cb, param);
/*
 * Tear down a backend.
 * NOTE(review): presumably the counterpart of netbe_init() that invokes
 * the backend's cleanup hook -- confirm against the function body.
 */
663 netbe_cleanup(struct net_backend *be)
/* Return the feature set reported by the backend's get_cap callback. */
673 netbe_get_cap(struct net_backend *be)
677 return (be->get_cap(be));
/*
 * Validate the frontend's virtio-net header length, record it in
 * be->fe_vnet_hdr_len and pass 'features' and the length on to the
 * backend's set_cap callback.
 */
681 netbe_set_cap(struct net_backend *be, uint64_t features,
682 unsigned vnet_hdr_len)
688 /* There are only three valid lengths, i.e., 0, 10 and 12. */
/* Accepted: 0, VNET_HDR_LEN, and VNET_HDR_LEN minus one uint16_t. */
689 if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
690 && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
693 be->fe_vnet_hdr_len = vnet_hdr_len;
695 ret = be->set_cap(be, features, vnet_hdr_len);
/* The backend uses either no header or the frontend's header length. */
696 assert(be->be_vnet_hdr_len == 0 ||
697 be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
/*
 * Drop the first 'tlen' bytes from the front of a scatter-gather vector.
 * Only the first element is adjusted, and it must hold at least 'tlen'
 * bytes (asserted).
 */
702 static __inline struct iovec *
703 iov_trim(struct iovec *iov, int *iovcnt, unsigned int tlen)
707 /* XXX short-cut: assume first segment is >= tlen */
708 assert(iov[0].iov_len >= tlen);
710 iov[0].iov_len -= tlen;
/* The first element was consumed entirely by the trim. */
711 if (iov[0].iov_len == 0) {
/* Advance the base pointer past the trimmed bytes. */
716 iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
/*
 * Send a packet through the backend. When the backend and frontend header
 * lengths differ, the frontend's virtio-net header is trimmed off the
 * front of the vector before calling the backend's send callback.
 */
724 netbe_send(struct net_backend *be, struct iovec *iov, int iovcnt)
728 if (be->be_vnet_hdr_len != be->fe_vnet_hdr_len) {
730 * The frontend uses a virtio-net header, but the backend
731 * does not. We ignore it (as it must be all zeroes) and
/* A mismatch is only expected when the backend uses no header at all. */
734 assert(be->be_vnet_hdr_len == 0);
735 iov = iov_trim(iov, &iovcnt, be->fe_vnet_hdr_len);
738 return (be->send(be, iov, iovcnt));
742 * Try to read a packet from the backend, without blocking.
743 * If no packets are available, return 0. In case of success, return
744 * the length of the packet just read. Return -1 in case of errors.
/*
 * Receive a packet from the backend into the caller's vector. When the
 * frontend uses a virtio-net header that the backend does not, the header
 * area at the start of the vector is set aside first, and the backend's
 * recv callback is given the space that follows it.
 */
747 netbe_recv(struct net_backend *be, struct iovec *iov, int iovcnt)
749 /* Length of prepended virtio-net header. */
750 unsigned int hlen = be->fe_vnet_hdr_len;
755 if (hlen && hlen != be->be_vnet_hdr_len) {
757 * The frontend uses a virtio-net header, but the backend
758 * does not. We need to prepend a zeroed header.
760 struct virtio_net_rxhdr *vh;
762 assert(be->be_vnet_hdr_len == 0);
765 * Get a pointer to the rx header, and use the
766 * data immediately following it for the packet buffer.
/* Take the header pointer before iov_trim() advances iov[0].iov_base. */
768 vh = iov[0].iov_base;
769 iov = iov_trim(iov, &iovcnt, hlen);
772 * The only valid field in the rx packet header is the
773 * number of buffers if merged rx bufs were negotiated.
776 if (hlen == VNET_HDR_LEN) {
781 ret = be->recv(be, iov, iovcnt);
790 * Read a packet from the backend and discard it.
791 * Returns the size of the discarded packet or zero if no packet was available.
792 * A negative error code is returned in case of read error.
/*
 * Drop one pending packet by receiving it, through netbe_recv(), into a
 * static scratch buffer that is presented as a one-element vector.
 */
795 netbe_rx_discard(struct net_backend *be)
798 * MP note: the dummybuf is only used to discard frames,
799 * so there is no need for it to be per-vtnet or locked.
800 * We only make it large enough for TSO-sized segment.
802 static uint8_t dummybuf[65536 + 64];
805 iov.iov_base = dummybuf;
806 iov.iov_len = sizeof(dummybuf);
808 return netbe_recv(be, &iov, 1);