2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (C) 2013-2016 Universita` di Pisa
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 #if defined(__FreeBSD__)
31 #include <sys/cdefs.h> /* prerequisite */
32 __FBSDID("$FreeBSD$");
34 #include <sys/types.h>
35 #include <sys/errno.h>
36 #include <sys/param.h> /* defines used in kernel.h */
37 #include <sys/kernel.h> /* types used in module initialization */
38 #include <sys/conf.h> /* cdevsw struct, UID, GID */
39 #include <sys/sockio.h>
40 #include <sys/socketvar.h> /* struct socket */
41 #include <sys/malloc.h>
43 #include <sys/rwlock.h>
44 #include <sys/socket.h> /* sockaddrs */
45 #include <sys/selinfo.h>
46 #include <sys/sysctl.h>
48 #include <net/if_var.h>
49 #include <net/bpf.h> /* BIOCIMMEDIATE */
50 #include <machine/bus.h> /* bus_dmamap_* */
51 #include <sys/endian.h>
52 #include <sys/refcount.h>
60 #elif defined(__APPLE__)
62 #warning OSX support is only partial
70 #error Unsupported platform
72 #endif /* unsupported */
78 #include <net/netmap.h>
79 #include <dev/netmap/netmap_kern.h>
80 #include <dev/netmap/netmap_mem2.h>
81 #include <dev/netmap/netmap_bdg.h>
86 * system parameters (most of them in netmap_kern.h)
87 * NM_BDG_NAME prefix for switch port names, default "vale"
88 * NM_BDG_MAXPORTS number of ports
89 * NM_BRIDGES max number of switches in the system.
90 * XXX should become a sysctl or tunable
92 * Switch ports are named valeX:Y where X is the switch name and Y
93 * is the port. If Y matches a physical interface name, the port is
94 * connected to a physical device.
96 * Unlike physical interfaces, switch ports use their own memory region
97 * for rings and buffers.
98 * The virtual interfaces use per-queue lock instead of core lock.
99 * In the tx loop, we aggregate traffic in batches to make all operations
100 * faster. The batch size is bridge_batch.
102 #define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */
103 #define NM_BDG_MAXSLOTS 4096 /* XXX same as above */
104 #define NM_BRIDGE_RINGSIZE 1024 /* in the device */
105 #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */
106 /* actual size of the tables */
107 #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NETMAP_MAX_FRAGS)
108 /* NM_FT_NULL terminates a list of slots in the ft */
109 #define NM_FT_NULL NM_BDG_BATCH_MAX
113 * bridge_batch is set via sysctl to the max batch size to be
114 * used in the bridge. The actual value may be larger as the
115 * last packet in the block may overflow the size.
117 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
119 SYSCTL_DECL(_dev_netmap);
120 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0,
121 "Max batch size to be used in the bridge");
124 static int netmap_vp_create(struct nmreq_header *hdr, struct ifnet *,
125 struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
126 static int netmap_vp_bdg_attach(const char *, struct netmap_adapter *,
128 static int netmap_vale_bwrap_attach(const char *, struct netmap_adapter *);
131 * For each output interface, nm_bdg_q is used to construct a list.
132 * bq_len is the number of output buffers (we can have coalescing
138 uint32_t bq_len; /* number of buffers */
141 /* Holds the default callbacks */
142 struct netmap_bdg_ops vale_bdg_ops = {
143 .lookup = netmap_bdg_learning,
146 .vp_create = netmap_vp_create,
147 .bwrap_attach = netmap_vale_bwrap_attach,
152 * this is a slightly optimized copy routine which rounds
153 * to multiple of 64 bytes and is often faster than dealing
154 * with other odd sizes. We assume there is enough room
155 * in the source and destination buffers.
157 * XXX only for multiples of 64 bytes, non overlapped.
160 pkt_copy(void *_src, void *_dst, int l)
162 uint64_t *src = _src;
163 uint64_t *dst = _dst;
164 if (unlikely(l >= 1024)) {
168 for (; likely(l > 0); l-=64) {
182 * Free the forwarding tables for rings attached to switch ports.
185 nm_free_bdgfwd(struct netmap_adapter *na)
188 struct netmap_kring **kring;
191 nrings = na->num_tx_rings;
192 kring = na->tx_rings;
193 for (i = 0; i < nrings; i++) {
194 if (kring[i]->nkr_ft) {
195 nm_os_free(kring[i]->nkr_ft);
196 kring[i]->nkr_ft = NULL; /* protect from freeing twice */
203 * Allocate the forwarding tables for the rings attached to the bridge ports.
206 nm_alloc_bdgfwd(struct netmap_adapter *na)
208 int nrings, l, i, num_dstq;
209 struct netmap_kring **kring;
212 /* all port:rings + broadcast */
213 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
214 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
215 l += sizeof(struct nm_bdg_q) * num_dstq;
216 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
218 nrings = netmap_real_rings(na, NR_TX);
219 kring = na->tx_rings;
220 for (i = 0; i < nrings; i++) {
221 struct nm_bdg_fwd *ft;
222 struct nm_bdg_q *dstq;
225 ft = nm_os_malloc(l);
230 dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
231 for (j = 0; j < num_dstq; j++) {
232 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
235 kring[i]->nkr_ft = ft;
240 /* Allows external modules to create bridges in exclusive mode,
241 * returns an authentication token that the external module will need
242 * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(),
243 * and nm_bdg_update_private_data() operations.
244 * Successfully executed if ret != NULL and *return_status == 0.
247 netmap_vale_create(const char *bdg_name, int *return_status)
249 struct nm_bridge *b = NULL;
253 b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
255 *return_status = EEXIST;
256 goto unlock_bdg_create;
259 b = nm_find_bridge(bdg_name, 1 /* create */, &vale_bdg_ops);
261 *return_status = ENOMEM;
262 goto unlock_bdg_create;
265 b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE;
266 ret = nm_bdg_get_auth_token(b);
274 /* Allows external modules to destroy a bridge created through
275 * netmap_vale_create(); the bridge must be empty.
278 netmap_vale_destroy(const char *bdg_name, void *auth_token)
280 struct nm_bridge *b = NULL;
284 b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
287 goto unlock_bdg_free;
290 if (!nm_bdg_valid_auth_token(b, auth_token)) {
292 goto unlock_bdg_free;
294 if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) {
296 goto unlock_bdg_free;
299 b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE);
300 ret = netmap_bdg_free(b);
302 b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE;
312 /* nm_dtor callback for ephemeral VALE ports */
314 netmap_vp_dtor(struct netmap_adapter *na)
316 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
317 struct nm_bridge *b = vpna->na_bdg;
319 ND("%s has %d references", na->name, na->na_refcount);
322 netmap_bdg_detach_common(b, vpna->bdg_port, -1);
325 if (na->ifp != NULL && !nm_iszombie(na)) {
326 NM_DETACH_NA(na->ifp);
327 if (vpna->autodelete) {
328 ND("releasing %s", na->ifp->if_xname);
330 nm_os_vi_detach(na->ifp);
337 /* Called by external kernel modules (e.g., Openvswitch)
338 * to modify the private data previously given to regops().
339 * 'name' may be just bridge's name (including ':' if it
340 * is not just NM_BDG_NAME).
341 * Called without NMG_LOCK.
344 nm_bdg_update_private_data(const char *name, bdg_update_private_data_fn_t callback,
345 void *callback_data, void *auth_token)
347 void *private_data = NULL;
352 b = nm_find_bridge(name, 0 /* don't create */, NULL);
355 goto unlock_update_priv;
357 if (!nm_bdg_valid_auth_token(b, auth_token)) {
359 goto unlock_update_priv;
362 private_data = callback(b->private_data, callback_data, &error);
363 b->private_data = private_data;
372 /* nm_krings_create callback for VALE ports.
373 * Calls the standard netmap_krings_create, then adds leases on rx
374 * rings and bdgfwd on tx rings.
377 netmap_vp_krings_create(struct netmap_adapter *na)
382 u_int nrx = netmap_real_rings(na, NR_RX);
385 * Leases are attached to RX rings on vale ports
387 tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
389 error = netmap_krings_create(na, tailroom);
393 leases = na->tailroom;
395 for (i = 0; i < nrx; i++) { /* Receive rings */
396 na->rx_rings[i]->nkr_leases = leases;
397 leases += na->num_rx_desc;
400 error = nm_alloc_bdgfwd(na);
402 netmap_krings_delete(na);
410 /* nm_krings_delete callback for VALE ports. */
412 netmap_vp_krings_delete(struct netmap_adapter *na)
415 netmap_krings_delete(na);
420 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
421 struct netmap_vp_adapter *na, u_int ring_nr);
425 * main dispatch routine for the bridge.
426 * Grab packets from a kring, move them into the ft structure
427 * associated to the tx (input) port. Max one instance per port,
428 * filtered on input (ioctl, poll or XXX).
429 * Returns the next position in the ring.
432 nm_bdg_preflush(struct netmap_kring *kring, u_int end)
434 struct netmap_vp_adapter *na =
435 (struct netmap_vp_adapter*)kring->na;
436 struct netmap_ring *ring = kring->ring;
437 struct nm_bdg_fwd *ft;
438 u_int ring_nr = kring->ring_id;
439 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
440 u_int ft_i = 0; /* start from 0 */
441 u_int frags = 1; /* how many frags ? */
442 struct nm_bridge *b = na->na_bdg;
444 /* To protect against modifications to the bridge we acquire a
445 * shared lock, waiting if we can sleep (if the source port is
446 * attached to a user process) or with a trylock otherwise (NICs).
448 ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
449 if (na->up.na_flags & NAF_BDG_MAYSLEEP)
451 else if (!BDG_RTRYLOCK(b))
453 ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
456 for (; likely(j != end); j = nm_next(j, lim)) {
457 struct netmap_slot *slot = &ring->slot[j];
460 ft[ft_i].ft_len = slot->len;
461 ft[ft_i].ft_flags = slot->flags;
462 ft[ft_i].ft_offset = 0;
464 ND("flags is 0x%x", slot->flags);
465 /* we do not use the buf changed flag, but we still need to reset it */
466 slot->flags &= ~NS_BUF_CHANGED;
468 /* this slot goes into a list so initialize the link field */
469 ft[ft_i].ft_next = NM_FT_NULL;
470 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
471 (void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
472 if (unlikely(buf == NULL)) {
473 RD(5, "NULL %s buffer pointer from %s slot %d len %d",
474 (slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
475 kring->name, j, ft[ft_i].ft_len);
476 buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
478 ft[ft_i].ft_flags = 0;
480 __builtin_prefetch(buf);
482 if (slot->flags & NS_MOREFRAG) {
486 if (unlikely(netmap_verbose && frags > 1))
487 RD(5, "%d frags at %d", frags, ft_i - frags);
488 ft[ft_i - frags].ft_frags = frags;
490 if (unlikely((int)ft_i >= bridge_batch))
491 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
494 /* Here ft_i > 0, ft[ft_i-1].ft_flags has NS_MOREFRAG, and we
495 * have to fix frags count. */
497 ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
498 ft[ft_i - frags].ft_frags = frags;
499 D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
502 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
508 /* ----- FreeBSD if_bridge hash function ------- */
511 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
512 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
514 * http://www.burtleburtle.net/bob/hash/spooky.html
516 #define mix(a, b, c) \
518 a -= b; a -= c; a ^= (c >> 13); \
519 b -= c; b -= a; b ^= (a << 8); \
520 c -= a; c -= b; c ^= (b >> 13); \
521 a -= b; a -= c; a ^= (c >> 12); \
522 b -= c; b -= a; b ^= (a << 16); \
523 c -= a; c -= b; c ^= (b >> 5); \
524 a -= b; a -= c; a ^= (c >> 3); \
525 b -= c; b -= a; b ^= (a << 10); \
526 c -= a; c -= b; c ^= (b >> 15); \
527 } while (/*CONSTCOND*/0)
530 static __inline uint32_t
531 nm_bridge_rthash(const uint8_t *addr)
533 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
543 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1)
544 return (c & BRIDGE_RTHASH_MASK);
551 * Lookup function for a learning bridge.
552 * Update the hash table with the source address,
553 * and then returns the destination port index, and the
554 * ring in *dst_ring (at the moment, always use ring 0)
557 netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
558 struct netmap_vp_adapter *na, void *private_data)
560 uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset;
561 u_int buf_len = ft->ft_len - ft->ft_offset;
562 struct nm_hash_ent *ht = private_data;
564 u_int dst, mysrc = na->bdg_port;
569 return NM_BDG_NOPORT;
572 if (ft->ft_flags & NS_INDIRECT) {
573 if (copyin(buf, indbuf, sizeof(indbuf))) {
574 return NM_BDG_NOPORT;
579 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
580 smac = le64toh(*(uint64_t *)(buf + 4));
584 * The hash is somewhat expensive, there might be some
585 * worthwhile optimizations here.
587 if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
589 sh = nm_bridge_rthash(s); /* hash of source */
590 /* update source port forwarding entry */
591 na->last_smac = ht[sh].mac = smac; /* XXX expire ? */
592 ht[sh].ports = mysrc;
594 D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
595 s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
597 dst = NM_BDG_BROADCAST;
598 if ((buf[0] & 1) == 0) { /* unicast */
599 dh = nm_bridge_rthash(buf); /* hash of dst */
600 if (ht[dh].mac == dmac) { /* found dst */
609 * Available space in the ring. Only used in VALE code
610 * and only with is_rx = 1
612 static inline uint32_t
613 nm_kr_space(struct netmap_kring *k, int is_rx)
618 int busy = k->nkr_hwlease - k->nr_hwcur;
620 busy += k->nkr_num_slots;
621 space = k->nkr_num_slots - 1 - busy;
623 /* XXX never used in this branch */
624 space = k->nr_hwtail - k->nkr_hwlease;
626 space += k->nkr_num_slots;
630 if (k->nkr_hwlease >= k->nkr_num_slots ||
631 k->nr_hwcur >= k->nkr_num_slots ||
632 k->nr_tail >= k->nkr_num_slots ||
634 busy >= k->nkr_num_slots) {
635 D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
636 k->nkr_lease_idx, k->nkr_num_slots);
645 /* make a lease on the kring for N positions. return the
647 * XXX only used in VALE code and with is_rx = 1
649 static inline uint32_t
650 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
652 uint32_t lim = k->nkr_num_slots - 1;
653 uint32_t lease_idx = k->nkr_lease_idx;
655 k->nkr_leases[lease_idx] = NR_NOSLOT;
656 k->nkr_lease_idx = nm_next(lease_idx, lim);
658 if (n > nm_kr_space(k, is_rx)) {
659 D("invalid request for %d slots", n);
662 /* XXX verify that there are n slots */
664 if (k->nkr_hwlease > lim)
665 k->nkr_hwlease -= lim + 1;
667 if (k->nkr_hwlease >= k->nkr_num_slots ||
668 k->nr_hwcur >= k->nkr_num_slots ||
669 k->nr_hwtail >= k->nkr_num_slots ||
670 k->nkr_lease_idx >= k->nkr_num_slots) {
671 D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
673 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
674 k->nkr_lease_idx, k->nkr_num_slots);
681 * This flush routine supports only unicast and broadcast but a large
682 * number of ports, and lets us replace the learn and dispatch functions.
685 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
688 struct nm_bdg_q *dst_ents, *brddst;
689 uint16_t num_dsts = 0, *dsts;
690 struct nm_bridge *b = na->na_bdg;
691 u_int i, me = na->bdg_port;
694 * The work area (pointed by ft) is followed by an array of
695 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS
696 * queues per port plus one for the broadcast traffic.
697 * Then we have an array of destination indexes.
699 dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
700 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
702 /* first pass: find a destination for each packet in the batch */
703 for (i = 0; likely(i < n); i += ft[i].ft_frags) {
704 uint8_t dst_ring = ring_nr; /* default, same ring as origin */
705 uint16_t dst_port, d_i;
707 struct nm_bdg_fwd *start_ft = NULL;
709 ND("slot %d frags %d", i, ft[i].ft_frags);
711 if (na->up.virt_hdr_len < ft[i].ft_len) {
712 ft[i].ft_offset = na->up.virt_hdr_len;
714 } else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) {
715 ft[i].ft_offset = ft[i].ft_len;
718 /* Drop the packet if the virtio-net header is not in the first
719 * fragment nor at the very beginning of the second.
723 dst_port = b->bdg_ops->lookup(start_ft, &dst_ring, na, b->private_data);
724 if (netmap_verbose > 255)
725 RD(5, "slot %d port %d -> %d", i, me, dst_port);
726 if (dst_port >= NM_BDG_NOPORT)
727 continue; /* this packet is identified to be dropped */
728 else if (dst_port == NM_BDG_BROADCAST)
729 dst_ring = 0; /* broadcasts always go to ring 0 */
730 else if (unlikely(dst_port == me ||
731 !b->bdg_ports[dst_port]))
734 /* get a position in the scratch pad */
735 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
738 /* append the first fragment to the list */
739 if (d->bq_head == NM_FT_NULL) { /* new destination */
740 d->bq_head = d->bq_tail = i;
741 /* remember this position to be scanned later */
742 if (dst_port != NM_BDG_BROADCAST)
743 dsts[num_dsts++] = d_i;
745 ft[d->bq_tail].ft_next = i;
748 d->bq_len += ft[i].ft_frags;
752 * Broadcast traffic goes to ring 0 on all destinations.
753 * So we need to add these rings to the list of ports to scan.
754 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
755 * expensive. We should keep a compact list of active destinations
756 * so we could shorten this loop.
758 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
759 if (brddst->bq_head != NM_FT_NULL) {
761 for (j = 0; likely(j < b->bdg_active_ports); j++) {
763 i = b->bdg_port_index[j];
764 if (unlikely(i == me))
766 d_i = i * NM_BDG_MAXRINGS;
767 if (dst_ents[d_i].bq_head == NM_FT_NULL)
768 dsts[num_dsts++] = d_i;
772 ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
773 /* second pass: scan destinations */
774 for (i = 0; i < num_dsts; i++) {
775 struct netmap_vp_adapter *dst_na;
776 struct netmap_kring *kring;
777 struct netmap_ring *ring;
778 u_int dst_nr, lim, j, d_i, next, brd_next;
779 u_int needed, howmany;
780 int retry = netmap_txsync_retry;
782 uint32_t my_start = 0, lease_idx = 0;
784 int virt_hdr_mismatch = 0;
787 ND("second pass %d port %d", i, d_i);
789 // XXX fix the division
790 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
791 /* protect from the lookup function returning an inactive
794 if (unlikely(dst_na == NULL))
796 if (dst_na->up.na_flags & NAF_SW_ONLY)
799 * The interface may be in !netmap mode in two cases:
800 * - when na is attached but not activated yet;
801 * - when na is being deactivated but is still attached.
803 if (unlikely(!nm_netmap_on(&dst_na->up))) {
804 ND("not in netmap mode!");
808 /* there is at least one either unicast or broadcast packet */
809 brd_next = brddst->bq_head;
811 /* we need to reserve this many slots. If fewer are
812 * available, some packets will be dropped.
813 * Packets may have multiple fragments, so
814 * there is a chance that we may not use all of the slots
815 * we have claimed, so we will need to handle the leftover
816 * ones when we regain the lock.
818 needed = d->bq_len + brddst->bq_len;
820 if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
821 if (netmap_verbose) {
822 RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
823 dst_na->up.virt_hdr_len);
825 /* There is a virtio-net header/offloadings mismatch between
826 * source and destination. The slower mismatch datapath will
827 * be used to cope with all the mismatches.
829 virt_hdr_mismatch = 1;
830 if (dst_na->mfs < na->mfs) {
831 /* We may need to do segmentation offloadings, and so
832 * we may need a number of destination slots greater
833 * than the number of input slots ('needed').
834 * We look for the smallest integer 'x' which satisfies:
835 * needed * na->mfs + x * H <= x * na->mfs
836 * where 'H' is the length of the longest header that may
837 * be replicated in the segmentation process (e.g. for
838 * TCPv4 we must account for ethernet header, IP header
841 KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0"));
842 needed = (needed * na->mfs) /
843 (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
844 ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
848 ND(5, "pass 2 dst %d is %x %s",
849 i, d_i, is_vp ? "virtual" : "nic/host");
850 dst_nr = d_i & (NM_BDG_MAXRINGS-1);
851 nrings = dst_na->up.num_rx_rings;
852 if (dst_nr >= nrings)
853 dst_nr = dst_nr % nrings;
854 kring = dst_na->up.rx_rings[dst_nr];
856 /* the destination ring may not have been opened for RX */
857 if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON))
859 lim = kring->nkr_num_slots - 1;
863 if (dst_na->retry && retry) {
864 /* try to get some free slot from the previous run */
865 kring->nm_notify(kring, 0);
866 /* actually useful only for bwraps, since there
867 * the notify will trigger a txsync on the hwna. VALE ports
868 * have dst_na->retry == 0
871 /* reserve the buffers in the queue and an entry
872 * to report completion, and drop lock.
873 * XXX this might become a helper function.
875 mtx_lock(&kring->q_lock);
876 if (kring->nkr_stopped) {
877 mtx_unlock(&kring->q_lock);
880 my_start = j = kring->nkr_hwlease;
881 howmany = nm_kr_space(kring, 1);
882 if (needed < howmany)
884 lease_idx = nm_kr_lease(kring, howmany, 1);
885 mtx_unlock(&kring->q_lock);
887 /* only retry if we need more than available slots */
888 if (retry && needed <= howmany)
891 /* copy to the destination queue */
892 while (howmany > 0) {
893 struct netmap_slot *slot;
894 struct nm_bdg_fwd *ft_p, *ft_end;
897 /* find the queue from which we pick next packet.
898 * NM_FT_NULL is always higher than valid indexes
899 * so we never dereference it if the other list
900 * has packets (and if both are empty we never
903 if (next < brd_next) {
905 next = ft_p->ft_next;
906 } else { /* insert broadcast */
907 ft_p = ft + brd_next;
908 brd_next = ft_p->ft_next;
910 cnt = ft_p->ft_frags; // cnt > 0
911 if (unlikely(cnt > howmany))
912 break; /* no more space */
913 if (netmap_verbose && cnt > 1)
914 RD(5, "rx %d frags to %d", cnt, j);
916 if (unlikely(virt_hdr_mismatch)) {
917 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
921 char *dst, *src = ft_p->ft_buf;
922 size_t copy_len = ft_p->ft_len, dst_len = copy_len;
924 slot = &ring->slot[j];
925 dst = NMB(&dst_na->up, slot);
927 ND("send [%d] %d(%d) bytes at %s:%d",
928 i, (int)copy_len, (int)dst_len,
929 NM_IFPNAME(dst_ifp), j);
930 /* round to a multiple of 64 */
931 copy_len = (copy_len + 63) & ~63;
933 if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
934 copy_len > NETMAP_BUF_SIZE(&na->up))) {
935 RD(5, "invalid len %d, down to 64", (int)copy_len);
936 copy_len = dst_len = 64; // XXX
938 if (ft_p->ft_flags & NS_INDIRECT) {
939 if (copyin(src, dst, copy_len)) {
940 // invalid user pointer, pretend len is 0
944 //memcpy(dst, src, copy_len);
945 pkt_copy(src, dst, (int)copy_len);
948 slot->flags = (cnt << 8)| NS_MOREFRAG;
952 } while (ft_p != ft_end);
953 slot->flags = (cnt << 8); /* clear flag on last entry */
956 if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
960 /* current position */
961 uint32_t *p = kring->nkr_leases; /* shorthand */
963 int still_locked = 1;
965 mtx_lock(&kring->q_lock);
966 if (unlikely(howmany > 0)) {
967 /* not used all bufs. If i am the last one
968 * i can recover the slots, otherwise must
969 * fill them with 0 to mark empty packets.
971 ND("leftover %d bufs", howmany);
972 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
973 /* yes i am the last one */
974 ND("roll back nkr_hwlease to %d", j);
975 kring->nkr_hwlease = j;
977 while (howmany-- > 0) {
978 ring->slot[j].len = 0;
979 ring->slot[j].flags = 0;
984 p[lease_idx] = j; /* report I am done */
986 update_pos = kring->nr_hwtail;
988 if (my_start == update_pos) {
989 /* all slots before my_start have been reported,
990 * so scan subsequent leases to see if other ranges
991 * have been completed, and do a selwakeup or txsync.
993 while (lease_idx != kring->nkr_lease_idx &&
994 p[lease_idx] != NR_NOSLOT) {
996 p[lease_idx] = NR_NOSLOT;
997 lease_idx = nm_next(lease_idx, lim);
999 /* j is the new 'write' position. j != my_start
1000 * means there are new buffers to report
1002 if (likely(j != my_start)) {
1003 kring->nr_hwtail = j;
1005 mtx_unlock(&kring->q_lock);
1006 kring->nm_notify(kring, 0);
1007 /* this is netmap_notify for VALE ports and
1008 * netmap_bwrap_notify for bwrap. The latter will
1009 * trigger a txsync on the underlying hwna
1011 if (dst_na->retry && retry--) {
1012 /* XXX this is going to call nm_notify again.
1013 * Only useful for bwrap in virtual machines
1020 mtx_unlock(&kring->q_lock);
1023 d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
1026 brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
1031 /* nm_txsync callback for VALE ports */
1033 netmap_vp_txsync(struct netmap_kring *kring, int flags)
1035 struct netmap_vp_adapter *na =
1036 (struct netmap_vp_adapter *)kring->na;
1038 u_int const lim = kring->nkr_num_slots - 1;
1039 u_int const head = kring->rhead;
1041 if (bridge_batch <= 0) { /* testing only */
1042 done = head; // used all
1049 if (bridge_batch > NM_BDG_BATCH)
1050 bridge_batch = NM_BDG_BATCH;
1052 done = nm_bdg_preflush(kring, head);
1055 D("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
1057 * packets between 'done' and 'cur' are left unsent.
1059 kring->nr_hwcur = done;
1060 kring->nr_hwtail = nm_prev(done, lim);
1062 D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
1067 /* create a netmap_vp_adapter that describes a VALE port.
1068 * Only persistent VALE ports have a non-null ifp.
1071 netmap_vp_create(struct nmreq_header *hdr, struct ifnet *ifp,
1072 struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret)
1074 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1075 struct netmap_vp_adapter *vpna;
1076 struct netmap_adapter *na;
1079 u_int extrabufs = 0;
1081 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1085 vpna = nm_os_malloc(sizeof(*vpna));
1092 strncpy(na->name, hdr->nr_name, sizeof(na->name));
1094 /* bound checking */
1095 na->num_tx_rings = req->nr_tx_rings;
1096 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1097 req->nr_tx_rings = na->num_tx_rings; /* write back */
1098 na->num_rx_rings = req->nr_rx_rings;
1099 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1100 req->nr_rx_rings = na->num_rx_rings; /* write back */
1101 nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE,
1102 1, NM_BDG_MAXSLOTS, NULL);
1103 na->num_tx_desc = req->nr_tx_slots;
1104 nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE,
1105 1, NM_BDG_MAXSLOTS, NULL);
1106 /* validate number of pipes. We want at least 1,
1107 * but probably can do with some more.
1108 * So let's use 2 as default (when 0 is supplied)
1110 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
1111 /* validate extra bufs */
1112 nm_bound_var(&extrabufs, 0, 0,
1113 128*NM_BDG_MAXSLOTS, NULL);
1114 req->nr_extra_bufs = extrabufs; /* write back */
1115 na->num_rx_desc = req->nr_rx_slots;
1116 /* Set the mfs to a default value, as it is needed on the VALE
1117 * mismatch datapath. XXX We should set it according to the MTU
1118 * known to the kernel. */
1119 vpna->mfs = NM_BDG_MFS_DEFAULT;
1120 vpna->last_smac = ~0llu;
1121 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero??
1122 vpna->mfs = netmap_buf_size; */
1124 D("max frame size %u", vpna->mfs);
1126 na->na_flags |= NAF_BDG_MAYSLEEP;
1127 /* persistent VALE ports look like hw devices
1128 * with a native netmap adapter
1131 na->na_flags |= NAF_NATIVE;
1132 na->nm_txsync = netmap_vp_txsync;
1133 na->nm_rxsync = netmap_vp_rxsync;
1134 na->nm_register = netmap_vp_reg;
1135 na->nm_krings_create = netmap_vp_krings_create;
1136 na->nm_krings_delete = netmap_vp_krings_delete;
1137 na->nm_dtor = netmap_vp_dtor;
1138 ND("nr_mem_id %d", req->nr_mem_id);
1140 netmap_mem_get(nmd):
1141 netmap_mem_private_new(
1142 na->num_tx_rings, na->num_tx_desc,
1143 na->num_rx_rings, na->num_rx_desc,
1144 req->nr_extra_bufs, npipes, &error);
1145 if (na->nm_mem == NULL)
1147 na->nm_bdg_attach = netmap_vp_bdg_attach;
1148 /* other nmd fields are set in the common routine */
1149 error = netmap_attach_common(na);
1156 if (na->nm_mem != NULL)
1157 netmap_mem_put(na->nm_mem);
1162 /* nm_bdg_attach callback for VALE ports
1163 * The na_vp port is this same netmap_adapter. There is no host port.
1166 netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na,
1167 struct nm_bridge *b)
1169 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
1171 if (b->bdg_ops != &vale_bdg_ops) {
1172 return NM_NEED_BWRAP;
1175 return NM_NEED_BWRAP;
1178 strncpy(na->name, name, sizeof(na->name));
1179 na->na_hostvp = NULL;
1184 netmap_vale_bwrap_krings_create(struct netmap_adapter *na)
1188 /* impersonate a netmap_vp_adapter */
1189 error = netmap_vp_krings_create(na);
1192 error = netmap_bwrap_krings_create_common(na);
1194 netmap_vp_krings_delete(na);
1200 netmap_vale_bwrap_krings_delete(struct netmap_adapter *na)
1202 netmap_bwrap_krings_delete_common(na);
1203 netmap_vp_krings_delete(na);
1207 netmap_vale_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
1209 struct netmap_bwrap_adapter *bna;
1210 struct netmap_adapter *na = NULL;
1211 struct netmap_adapter *hostna = NULL;
1214 bna = nm_os_malloc(sizeof(*bna));
1219 strncpy(na->name, nr_name, sizeof(na->name));
1220 na->nm_register = netmap_bwrap_reg;
1221 na->nm_txsync = netmap_vp_txsync;
1222 // na->nm_rxsync = netmap_bwrap_rxsync;
1223 na->nm_krings_create = netmap_vale_bwrap_krings_create;
1224 na->nm_krings_delete = netmap_vale_bwrap_krings_delete;
1225 na->nm_notify = netmap_bwrap_notify;
1226 bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
1227 /* Set the mfs, needed on the VALE mismatch datapath. */
1228 bna->up.mfs = NM_BDG_MFS_DEFAULT;
1230 if (hwna->na_flags & NAF_HOST_RINGS) {
1231 hostna = &bna->host.up;
1232 hostna->nm_notify = netmap_bwrap_notify;
1233 bna->host.mfs = NM_BDG_MFS_DEFAULT;
1236 error = netmap_bwrap_attach_common(na, hwna);
1244 netmap_get_vale_na(struct nmreq_header *hdr, struct netmap_adapter **na,
1245 struct netmap_mem_d *nmd, int create)
1247 return netmap_get_bdg_na(hdr, na, nmd, create, &vale_bdg_ops);
1251 /* creates a persistent VALE port */
1253 nm_vi_create(struct nmreq_header *hdr)
1255 struct nmreq_vale_newif *req =
1256 (struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body;
1258 /* Build a nmreq_register out of the nmreq_vale_newif,
1259 * so that we can call netmap_vi_create(). */
1260 struct nmreq_register regreq;
1261 bzero(&regreq, sizeof(regreq));
1262 regreq.nr_tx_slots = req->nr_tx_slots;
1263 regreq.nr_rx_slots = req->nr_rx_slots;
1264 regreq.nr_tx_rings = req->nr_tx_rings;
1265 regreq.nr_rx_rings = req->nr_rx_rings;
1266 regreq.nr_mem_id = req->nr_mem_id;
1267 hdr->nr_reqtype = NETMAP_REQ_REGISTER;
1268 hdr->nr_body = (uintptr_t)&regreq;
1269 error = netmap_vi_create(hdr, 0 /* no autodelete */);
1270 hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF;
1271 hdr->nr_body = (uintptr_t)req;
1272 /* Write back to the original struct. */
1273 req->nr_tx_slots = regreq.nr_tx_slots;
1274 req->nr_rx_slots = regreq.nr_rx_slots;
1275 req->nr_tx_rings = regreq.nr_tx_rings;
1276 req->nr_rx_rings = regreq.nr_rx_rings;
1277 req->nr_mem_id = regreq.nr_mem_id;
1281 /* remove a persistent VALE port from the system */
1283 nm_vi_destroy(const char *name)
1286 struct netmap_vp_adapter *vpna;
1289 ifp = ifunit_ref(name);
1293 /* make sure this is actually a VALE port */
1294 if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
1299 vpna = (struct netmap_vp_adapter *)NA(ifp);
1301 /* we can only destroy ports that were created via NETMAP_BDG_NEWIF */
1302 if (vpna->autodelete) {
1307 /* also make sure that nobody is using the interface */
1308 if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
1309 vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
1316 D("destroying a persistent vale interface %s", ifp->if_xname);
1317 /* Linux requires all the references are released
1322 nm_os_vi_detach(ifp);
1332 nm_update_info(struct nmreq_register *req, struct netmap_adapter *na)
1334 req->nr_rx_rings = na->num_rx_rings;
1335 req->nr_tx_rings = na->num_tx_rings;
1336 req->nr_rx_slots = na->num_rx_desc;
1337 req->nr_tx_slots = na->num_tx_desc;
1338 return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL,
1344 * Create a virtual interface registered to the system.
1345 * The interface will be attached to a bridge later.
1348 netmap_vi_create(struct nmreq_header *hdr, int autodelete)
1350 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1352 struct netmap_vp_adapter *vpna;
1353 struct netmap_mem_d *nmd = NULL;
1356 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1360 /* don't include VALE prefix */
1361 if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
1363 if (strlen(hdr->nr_name) >= IFNAMSIZ) {
1366 ifp = ifunit_ref(hdr->nr_name);
1367 if (ifp) { /* already exists, cannot create a new one */
1370 if (NM_NA_VALID(ifp)) {
1371 int update_err = nm_update_info(req, NA(ifp));
1379 error = nm_os_vi_persist(hdr->nr_name, &ifp);
1384 if (req->nr_mem_id) {
1385 nmd = netmap_mem_find(req->nr_mem_id);
1391 /* netmap_vp_create creates a struct netmap_vp_adapter */
1392 error = netmap_vp_create(hdr, ifp, nmd, &vpna);
1394 D("error %d", error);
1397 /* persist-specific routines */
1398 vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
1400 netmap_adapter_get(&vpna->up);
1402 vpna->autodelete = 1;
1404 NM_ATTACH_NA(ifp, &vpna->up);
1405 /* return the updated info */
1406 error = nm_update_info(req, &vpna->up);
1410 ND("returning nr_mem_id %d", req->nr_mem_id);
1412 netmap_mem_put(nmd);
1414 ND("created %s", ifp->if_xname);
1421 netmap_mem_put(nmd);
1423 nm_os_vi_detach(ifp);
1428 #endif /* WITH_VALE */