2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (C) 2013-2016 Universita` di Pisa
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * This module implements the VALE switch for netmap
35 NMG_LOCK() serializes all modifications to switches and ports.
36 A switch cannot be deleted until all ports are gone.
38 For each switch, an SX lock (RWlock on linux) protects
39 deletion of ports. When configuring or deleting a new port, the
40 lock is acquired in exclusive mode (after holding NMG_LOCK).
41 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
42 The lock is held throughout the entire forwarding cycle,
43 during which the thread may incur a page fault.
44 Hence it is important that sleepable shared locks are used.
46 On the rx ring, the per-port lock is grabbed initially to reserve
47 a number of slots in the ring, then the lock is released,
48 packets are copied from source to destination, and then
49 the lock is acquired again and the receive ring is updated.
50 (A similar thing is done on the tx ring for NIC and host stack
51 ports attached to the switch)
56 * OS-specific code that is used only within this file.
57 * Other OS-specific code that must be accessed by drivers
58 * is present in netmap_kern.h
61 #if defined(__FreeBSD__)
62 #include <sys/cdefs.h> /* prerequisite */
63 __FBSDID("$FreeBSD$");
65 #include <sys/types.h>
66 #include <sys/errno.h>
67 #include <sys/param.h> /* defines used in kernel.h */
68 #include <sys/kernel.h> /* types used in module initialization */
69 #include <sys/conf.h> /* cdevsw struct, UID, GID */
70 #include <sys/sockio.h>
71 #include <sys/socketvar.h> /* struct socket */
72 #include <sys/malloc.h>
74 #include <sys/rwlock.h>
75 #include <sys/socket.h> /* sockaddrs */
76 #include <sys/selinfo.h>
77 #include <sys/sysctl.h>
79 #include <net/if_var.h>
80 #include <net/bpf.h> /* BIOCIMMEDIATE */
81 #include <machine/bus.h> /* bus_dmamap_* */
82 #include <sys/endian.h>
83 #include <sys/refcount.h>
86 #define BDG_RWLOCK_T struct rwlock // struct rwlock
88 #define BDG_RWINIT(b) \
89 rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
90 #define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock)
91 #define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock)
92 #define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock)
93 #define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock)
94 #define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock)
95 #define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock)
100 #include "bsd_glue.h"
102 #elif defined(__APPLE__)
104 #warning OSX support is only partial
105 #include "osx_glue.h"
107 #elif defined(_WIN32)
108 #include "win_glue.h"
112 #error Unsupported platform
114 #endif /* unsupported */
120 #include <net/netmap.h>
121 #include <dev/netmap/netmap_kern.h>
122 #include <dev/netmap/netmap_mem2.h>
127 * system parameters (most of them in netmap_kern.h)
128 * NM_BDG_NAME prefix for switch port names, default "vale"
129 * NM_BDG_MAXPORTS number of ports
130 * NM_BRIDGES max number of switches in the system.
131 * XXX should become a sysctl or tunable
133 * Switch ports are named valeX:Y where X is the switch name and Y
134 * is the port. If Y matches a physical interface name, the port is
135 * connected to a physical device.
137 * Unlike physical interfaces, switch ports use their own memory region
138 * for rings and buffers.
139 * The virtual interfaces use per-queue lock instead of core lock.
140 * In the tx loop, we aggregate traffic in batches to make all operations
141 * faster. The batch size is bridge_batch.
143 #define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */
144 #define NM_BDG_MAXSLOTS 4096 /* XXX same as above */
145 #define NM_BRIDGE_RINGSIZE 1024 /* in the device */
146 #define NM_BDG_HASH 1024 /* forwarding table entries */
147 #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */
148 #define NM_MULTISEG 64 /* max size of a chain of bufs */
149 /* actual size of the tables */
150 #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG)
151 /* NM_FT_NULL terminates a list of slots in the ft */
152 #define NM_FT_NULL NM_BDG_BATCH_MAX
153 /* Default size for the Maximum Frame Size. */
154 #define NM_BDG_MFS_DEFAULT 1514
158 * bridge_batch is set via sysctl to the max batch size to be
159 * used in the bridge. The actual value may be larger as the
160 * last packet in the block may overflow the size.
162 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
164 SYSCTL_DECL(_dev_netmap);
165 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0,
166 "Max batch size to be used in the bridge");
169 static int netmap_vp_create(struct nmreq_header *hdr, struct ifnet *,
170 struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
171 static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
172 static int netmap_bwrap_reg(struct netmap_adapter *, int onoff);
175 * For each output interface, nm_bdg_q is used to construct a list.
176 * bq_len is the number of output buffers (we can have coalescing
182 uint32_t bq_len; /* number of buffers */
185 /* XXX revise this */
187 uint64_t mac; /* the top 2 bytes are the epoch */
191 /* Holds the default callbacks */
192 static struct netmap_bdg_ops default_bdg_ops = {netmap_bdg_learning, NULL, NULL};
195 * nm_bridge is a descriptor for a VALE switch.
196 * Interfaces for a bridge are all in bdg_ports[].
197 * The array has fixed size, an empty entry does not terminate
198 * the search, but lookups only occur on attach/detach so we
199 * don't mind if they are slow.
201 * The bridge is non blocking on the transmit ports: excess
202 * packets are dropped if there is no room on the output port.
204 * bdg_lock protects accesses to the bdg_ports array.
205 * This is a rw lock (or equivalent).
207 #define NM_BDG_IFNAMSIZ IFNAMSIZ
209 /* XXX what is the proper alignment/layout ? */
210 BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */
212 uint32_t bdg_active_ports;
213 char bdg_basename[NM_BDG_IFNAMSIZ];
215 /* Indexes of active ports (up to active_ports)
216 * and all other remaining ports.
218 uint32_t bdg_port_index[NM_BDG_MAXPORTS];
219 /* used by netmap_bdg_detach_common() */
220 uint32_t tmp_bdg_port_index[NM_BDG_MAXPORTS];
222 struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];
225 * Programmable lookup functions to figure out the destination port.
226 * It returns either an index of the destination port,
227 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
228 * forward this packet. ring_nr is the source ring index, and the
229 * function may overwrite this value to forward this packet to a
230 * different ring index.
231 * The function is set by netmap_bdg_regops().
233 struct netmap_bdg_ops *bdg_ops;
236 * Contains the data structure used by the bdg_ops.lookup function.
237 * By default points to *ht which is allocated on attach and used by the default lookup
238 * otherwise will point to the data structure received by netmap_bdg_regops().
241 struct nm_hash_ent *ht;
243 /* Currently used to specify if the bridge is still in use while empty and
244 * if it has been put in exclusive mode by an external module, see netmap_bdg_regops()
245 * and netmap_bdg_create().
247 #define NM_BDG_ACTIVE 1
248 #define NM_BDG_EXCLUSIVE 2
254 #endif /* CONFIG_NET_NS */
/* Return the basename of the bridge a VALE port is attached to.
 * NOTE(review): this listing is missing interior lines (numbering jumps
 * 258->260->263); a NULL check on b was likely lost -- confirm upstream. */
258 netmap_bdg_name(struct netmap_vp_adapter *vp)
260 struct nm_bridge *b = vp->na_bdg;
263 return b->bdg_basename;
267 #ifndef CONFIG_NET_NS
269 * XXX in principle nm_bridges could be created dynamically
270 * Right now we have a static array and deletions are protected
271 * by an exclusive lock.
273 static struct nm_bridge *nm_bridges;
274 #endif /* !CONFIG_NET_NS */
278 * this is a slightly optimized copy routine which rounds
279 * to multiple of 64 bytes and is often faster than dealing
280 * with other odd sizes. We assume there is enough room
281 * in the source and destination buffers.
283 * XXX only for multiples of 64 bytes, non overlapped.
/* Copy l bytes in 64-byte chunks (see the block comment above: callers
 * must guarantee padded, non-overlapping buffers).
 * NOTE(review): the large-copy branch body (after the l >= 1024 test)
 * and the unrolled loop body are missing from this listing. */
286 pkt_copy(void *_src, void *_dst, int l)
288 uint64_t *src = _src;
289 uint64_t *dst = _dst;
290 if (unlikely(l >= 1024)) {
294 for (; likely(l > 0); l-=64) {
/* True if c may appear in a VALE bridge/port identifier: letters,
 * digits, plus one more alternative on the missing original line 313
 * (presumably '_') -- confirm against the upstream source. */
308 nm_is_id_char(const char c)
310 return (c >= 'a' && c <= 'z') ||
311 (c >= 'A' && c <= 'Z') ||
312 (c >= '0' && c <= '9') ||
316 /* Validate the name of a VALE bridge port and return the
317  * position of the ":" character. */
/* NOTE(review): fragment -- the declarations of i/colon_pos, the error
 * returns, and the final success return are missing from this listing. */
319 nm_vale_name_validate(const char *name)
324 if (!name || strlen(name) < strlen(NM_BDG_NAME)) {
328 for (i = 0; i < NM_BDG_IFNAMSIZ && name[i]; i++) {
329 if (name[i] == ':') {
332 } else if (!nm_is_id_char(name[i])) {
337 if (strlen(name) - colon_pos > IFNAMSIZ) {
338 /* interface name too long */
346 * locate a bridge among the existing ones.
347 * MUST BE CALLED WITH NMG_LOCK()
349 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
350 * We assume that this is called with a name of at least NM_NAME chars.
/* NOTE(review): fragment -- several lines are missing (e.g. the check on
 * the nm_vale_name_validate() result, the NULL checks, and the return).
 * Visible logic: scan the per-namespace bridge array for a name match,
 * remembering the first free slot; if not found and create is set,
 * initialize the free slot (hash table, basename, port indexes,
 * default bdg_ops). */
352 static struct nm_bridge *
353 nm_find_bridge(const char *name, int create)
356 struct nm_bridge *b = NULL, *bridges;
361 netmap_bns_getbridges(&bridges, &num_bridges);
363 namelen = nm_vale_name_validate(name);
365 D("invalid bridge name %s", name ? name : NULL);
369 /* lookup the name, remember empty slot if there is one */
370 for (i = 0; i < num_bridges; i++) {
371 struct nm_bridge *x = bridges + i;
373 if ((x->bdg_flags & NM_BDG_ACTIVE) + x->bdg_active_ports == 0) {
374 if (create && b == NULL)
375 b = x; /* record empty slot */
376 } else if (x->bdg_namelen != namelen) {
378 } else if (strncmp(name, x->bdg_basename, namelen) == 0) {
379 ND("found '%.*s' at %d", namelen, name, i);
384 if (i == num_bridges && b) { /* name not found, can create entry */
385 /* initialize the bridge */
386 ND("create new bridge %s with ports %d", b->bdg_basename,
387 b->bdg_active_ports);
388 b->ht = nm_os_malloc(sizeof(struct nm_hash_ent) * NM_BDG_HASH);
390 D("failed to allocate hash table");
393 strncpy(b->bdg_basename, name, namelen)<br/>;
394 b->bdg_namelen = namelen;
395 b->bdg_active_ports = 0;
396 for (i = 0; i < NM_BDG_MAXPORTS; i++)
397 b->bdg_port_index[i] = i;
398 /* set the default function */
399 b->bdg_ops = &default_bdg_ops;
400 b->private_data = b->ht;
409 * Free the forwarding tables for rings attached to switch ports.
/* NOTE(review): fragment -- loop-variable declarations and closing
 * braces are missing. Frees each TX kring's forwarding table and NULLs
 * the pointer to guard against double free. */
412 nm_free_bdgfwd(struct netmap_adapter *na)
415 struct netmap_kring **kring;
418 nrings = na->num_tx_rings;
419 kring = na->tx_rings;
420 for (i = 0; i < nrings; i++) {
421 if (kring[i]->nkr_ft) {
422 nm_os_free(kring[i]->nkr_ft);
423 kring[i]->nkr_ft = NULL; /* protect from freeing twice */
430 * Allocate the forwarding tables for the rings attached to the bridge ports.
/* NOTE(review): fragment -- the allocation-failure cleanup path and the
 * destination-queue index setup lines are missing. One allocation per TX
 * ring holds: the nm_bdg_fwd batch array, the per-destination queues
 * (all ports*rings + 1 broadcast slot), and a uint16_t scratch array. */
433 nm_alloc_bdgfwd(struct netmap_adapter *na)
435 int nrings, l, i, num_dstq;
436 struct netmap_kring **kring;
439 /* all port:rings + broadcast */
440 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
441 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
442 l += sizeof(struct nm_bdg_q) * num_dstq;
443 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
445 nrings = netmap_real_rings(na, NR_TX);
446 kring = na->tx_rings;
447 for (i = 0; i < nrings; i++) {
448 struct nm_bdg_fwd *ft;
449 struct nm_bdg_q *dstq;
452 ft = nm_os_malloc(l);
457 dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
458 for (j = 0; j < num_dstq; j++) {
459 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
462 kring[i]->nkr_ft = ft;
/* Mark a bridge slot as free once it has no active ports and is not
 * flagged NM_BDG_ACTIVE. NOTE(review): the EBUSY return, hash-table
 * release, and success return are missing from this listing. */
468 netmap_bdg_free(struct nm_bridge *b)
470 if ((b->bdg_flags & NM_BDG_ACTIVE) + b->bdg_active_ports != 0) {
474 ND("marking bridge %s as free", b->bdg_basename);
483 /* remove from bridge b the ports in slots hw and sw
484  * (sw can be -1 if not needed)
/* NOTE(review): fragment. Algorithm per the embedded pseudo-code:
 * work on tmp_bdg_port_index outside the write lock, compacting the
 * active-port prefix by swapping detached entries to the tail, then
 * copy the array back and publish the new count (presumably under
 * BDG_WLOCK on the missing lines -- confirm upstream). */
487 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
489 int s_hw = hw, s_sw = sw;
490 int i, lim =b->bdg_active_ports;
491 uint32_t *tmp = b->tmp_bdg_port_index;
495 make a copy of bdg_port_index;
496 lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
497 in the array of bdg_port_index, replacing them with
498 entries from the bottom of the array;
499 decrement bdg_active_ports;
500 acquire BDG_WLOCK() and copy back the array.
504 D("detach %d and %d (lim %d)", hw, sw, lim);
505 /* make a copy of the list of active ports, update it,
506 * and then copy back within BDG_WLOCK().
508 memcpy(b->tmp_bdg_port_index, b->bdg_port_index, sizeof(b->tmp_bdg_port_index));
509 for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
510 if (hw >= 0 && tmp[i] == hw) {
511 ND("detach hw %d at %d", hw, i);
512 lim--; /* point to last active port */
513 tmp[i] = tmp[lim]; /* swap with i */
514 tmp[lim] = hw; /* now this is inactive */
516 } else if (sw >= 0 && tmp[i] == sw) {
517 ND("detach sw %d at %d", sw, i);
526 if (hw >= 0 || sw >= 0) {
527 D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
531 if (b->bdg_ops->dtor)
532 b->bdg_ops->dtor(b->bdg_ports[s_hw]);
533 b->bdg_ports[s_hw] = NULL;
535 b->bdg_ports[s_sw] = NULL;
537 memcpy(b->bdg_port_index, b->tmp_bdg_port_index, sizeof(b->tmp_bdg_port_index));
538 b->bdg_active_ports = lim;
541 ND("now %d active ports", lim);
/* Return the bridge's authentication token. NOTE(review): only the
 * prototype survives in this listing; nm_bdg_valid_auth_token() compares
 * the token against b->ht, so that is presumably the value returned. */
546 nm_bdg_get_auth_token(struct nm_bridge *b)
551 /* bridge not in exclusive mode ==> always valid
552  * bridge in exclusive mode (created through netmap_bdg_create()) ==> check authentication token
/* Token check: exclusive bridges only accept their own b->ht pointer. */
555 nm_bdg_valid_auth_token(struct nm_bridge *b, void *auth_token)
557 return !(b->bdg_flags & NM_BDG_EXCLUSIVE) || b->ht == auth_token;
560 /* Allows external modules to create bridges in exclusive mode,
561  * returns an authentication token that the external module will need
562  * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(),
563  * and nm_bdg_update_private_data() operations.
564  * Successfully executed if ret != NULL and *return_status == 0.
/* NOTE(review): fragment -- NMG_LOCK acquisition/release, the NULL
 * checks after each nm_find_bridge() call, and the final return are
 * missing. Fails with EEXIST if the bridge already exists, ENOMEM if
 * creation fails. */
567 netmap_bdg_create(const char *bdg_name, int *return_status)
569 struct nm_bridge *b = NULL;
573 b = nm_find_bridge(bdg_name, 0 /* don't create */);
575 *return_status = EEXIST;
576 goto unlock_bdg_create;
579 b = nm_find_bridge(bdg_name, 1 /* create */);
581 *return_status = ENOMEM;
582 goto unlock_bdg_create;
585 b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE;
586 ret = nm_bdg_get_auth_token(b);
594 /* Allows external modules to destroy a bridge created through
595  * netmap_bdg_create(), the bridge must be empty.
/* NOTE(review): fragment -- locking and the error-code assignments
 * before each goto are missing. On netmap_bdg_free() failure the
 * exclusive/active flags are restored (line 622). */
598 netmap_bdg_destroy(const char *bdg_name, void *auth_token)
600 struct nm_bridge *b = NULL;
604 b = nm_find_bridge(bdg_name, 0 /* don't create */);
607 goto unlock_bdg_free;
610 if (!nm_bdg_valid_auth_token(b, auth_token)) {
612 goto unlock_bdg_free;
614 if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) {
616 goto unlock_bdg_free;
619 b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE);
620 ret = netmap_bdg_free(b);
622 b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE;
632 /* nm_bdg_ctl callback for VALE ports */
/* Attach is a no-op for VALE ports; detach disables the rings, removes
 * the port from the bridge, re-enables the rings, and drops the
 * reference taken at attach time. */
634 netmap_vp_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
636 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
637 struct nm_bridge *b = vpna->na_bdg;
639 if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
640 return 0; /* nothing to do */
643 netmap_set_all_rings(na, 0 /* disable */);
644 netmap_bdg_detach_common(b, vpna->bdg_port, -1);
646 netmap_set_all_rings(na, 1 /* enable */);
648 /* drop the reference that was taken only for the attach */
649 netmap_adapter_put(na);
653 /* nm_dtor callback for ephemeral VALE ports */
/* NOTE(review): fragment -- the b != NULL guard before the detach is
 * missing from this listing. Removes the port from its bridge and, for
 * autodelete ports, tears down the persistent interface as well. */
655 netmap_vp_dtor(struct netmap_adapter *na)
657 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
658 struct nm_bridge *b = vpna->na_bdg;
660 ND("%s has %d references", na->name, na->na_refcount);
663 netmap_bdg_detach_common(b, vpna->bdg_port, -1);
666 if (na->ifp != NULL && !nm_iszombie(na)) {
668 if (vpna->autodelete) {
669 ND("releasing %s", na->ifp->if_xname);
671 nm_os_vi_detach(na->ifp);
677 /* creates a persistent VALE port */
/* Translate an nmreq_vale_newif into a temporary nmreq_register so the
 * generic netmap_vi_create() path can be reused, then copy the possibly
 * updated values back into the caller's request.
 * FIX(review): the listing contained the mojibake "(R)req" where the
 * text "&regreq" belongs (an "&reg" HTML-entity corruption) -- restored
 * on original lines 687 and 694 below. */
679 nm_vi_create(struct nmreq_header *hdr)
681 struct nmreq_vale_newif *req =
682 (struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body;
684 /* Build a nmreq_register out of the nmreq_vale_newif,
685 * so that we can call netmap_get_bdg_na(). */
686 struct nmreq_register regreq;
687 bzero(&regreq, sizeof(regreq));
688 regreq.nr_tx_slots = req->nr_tx_slots;
689 regreq.nr_rx_slots = req->nr_rx_slots;
690 regreq.nr_tx_rings = req->nr_tx_rings;
691 regreq.nr_rx_rings = req->nr_rx_rings;
692 regreq.nr_mem_id = req->nr_mem_id;
693 hdr->nr_reqtype = NETMAP_REQ_REGISTER;
694 hdr->nr_body = (uintptr_t)&regreq;
695 error = netmap_vi_create(hdr, 0 /* no autodelete */);
696 hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF;
697 hdr->nr_body = (uintptr_t)req;
698 /* Write back to the original struct. */
699 req->nr_tx_slots = regreq.nr_tx_slots;
700 req->nr_rx_slots = regreq.nr_rx_slots;
701 req->nr_tx_rings = regreq.nr_tx_rings;
702 req->nr_rx_rings = regreq.nr_rx_rings;
703 req->nr_mem_id = regreq.nr_mem_id;
707 /* remove a persistent VALE port from the system */
/* NOTE(review): fragment -- the NULL check on ifunit_ref(), the error
 * returns, the reference drop, and the final return are missing.
 * Validates that the interface really is a VALE port, was created
 * persistently (not autodelete), and is unused before detaching it. */
709 nm_vi_destroy(const char *name)
712 struct netmap_vp_adapter *vpna;
715 ifp = ifunit_ref(name);
719 /* make sure this is actually a VALE port */
720 if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
725 vpna = (struct netmap_vp_adapter *)NA(ifp);
727 /* we can only destroy ports that were created via NETMAP_BDG_NEWIF */
728 if (vpna->autodelete) {
733 /* also make sure that nobody is using the interface */
734 if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
735 vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
742 D("destroying a persistent vale interface %s", ifp->if_xname);
743 /* Linux requires all the references are released
748 nm_os_vi_detach(ifp);
/* Copy the adapter's actual ring/slot configuration back into the user
 * request and fill in the memory-region size via netmap_mem_get_info().
 * NOTE(review): the trailing arguments of the netmap_mem_get_info()
 * call (original line 765, presumably &req->nr_mem_id) are missing. */
758 nm_update_info(struct nmreq_register *req, struct netmap_adapter *na)
760 req->nr_rx_rings = na->num_rx_rings;
761 req->nr_tx_rings = na->num_tx_rings;
762 req->nr_rx_slots = na->num_rx_desc;
763 req->nr_tx_slots = na->num_tx_desc;
764 return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL,
769 * Create a virtual interface registered to the system.
770 * The interface will be attached to a bridge later.
/* NOTE(review): fragment -- error labels, several returns, and the
 * closing of the error path are missing. Flow: reject non-REGISTER
 * requests and VALE-prefixed or over-long names; fail if the interface
 * already exists (updating the request info when it is netmap-capable);
 * otherwise create the OS-level interface, resolve the memory
 * allocator, build the vp adapter, attach it, and report the final
 * configuration back to the caller. */
773 netmap_vi_create(struct nmreq_header *hdr, int autodelete)
775 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
777 struct netmap_vp_adapter *vpna;
778 struct netmap_mem_d *nmd = NULL;
781 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
785 /* don't include VALE prefix */
786 if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
788 if (strlen(hdr->nr_name) >= IFNAMSIZ) {
791 ifp = ifunit_ref(hdr->nr_name);
792 if (ifp) { /* already exist, cannot create new one */
795 if (NM_NA_VALID(ifp)) {
796 int update_err = nm_update_info(req, NA(ifp));
804 error = nm_os_vi_persist(hdr->nr_name, &ifp);
809 if (req->nr_mem_id) {
810 nmd = netmap_mem_find(req->nr_mem_id);
816 /* netmap_vp_create creates a struct netmap_vp_adapter */
817 error = netmap_vp_create(hdr, ifp, nmd, &vpna);
819 D("error %d", error);
822 /* persist-specific routines */
823 vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
825 netmap_adapter_get(&vpna->up);
827 vpna->autodelete = 1;
829 NM_ATTACH_NA(ifp, &vpna->up);
830 /* return the updated info */
831 error = nm_update_info(req, &vpna->up);
835 ND("returning nr_mem_id %d", req->nr_mem_id);
839 ND("created %s", ifp->if_xname);
848 nm_os_vi_detach(ifp);
853 /* Try to get a reference to a netmap adapter attached to a VALE switch.
854  * If the adapter is found (or is created), this function returns 0, a
855  * non NULL pointer is returned into *na, and the caller holds a
856  * reference to the adapter.
857  * If an adapter is not found, then no reference is grabbed and the
858  * function returns an error code, or 0 if there is just a VALE prefix
859  * mismatch. Therefore the caller holds a reference when
860  * (*na != NULL && return == 0).
/* NOTE(review): large fragment -- many error paths, gotos, and cleanup
 * lines are missing from this listing; the comments below describe only
 * the visible flow. Steps: (1) bail out on non-VALE names; (2) find or
 * create the bridge; (3) linear-scan the bridge's active ports for an
 * exact name match; (4) otherwise reserve two candidate port slots and
 * either create an ephemeral VALE port or wrap an existing NIC
 * (bwrap + optional host-stack port), then bind the new port(s). */
863 netmap_get_bdg_na(struct nmreq_header *hdr, struct netmap_adapter **na,
864 struct netmap_mem_d *nmd, int create)
866 char *nr_name = hdr->nr_name;
868 struct ifnet *ifp = NULL;
870 struct netmap_vp_adapter *vpna, *hostna = NULL;
873 uint32_t cand = NM_BDG_NOPORT, cand2 = NM_BDG_NOPORT;
876 *na = NULL; /* default return value */
878 /* first try to see if this is a bridge port. */
880 if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) {
881 return 0; /* no error, but no VALE prefix */
884 b = nm_find_bridge(nr_name, create);
886 ND("no bridges available for '%s'", nr_name);
887 return (create ? ENOMEM : ENXIO);
889 if (strlen(nr_name) < b->bdg_namelen) /* impossible */
892 /* Now we are sure that name starts with the bridge's name,
893 * lookup the port in the bridge. We need to scan the entire
894 * list. It is not important to hold a WLOCK on the bridge
895 * during the search because NMG_LOCK already guarantees
896 * that there are no other possible writers.
899 /* lookup in the local list of ports */
900 for (j = 0; j < b->bdg_active_ports; j++) {
901 i = b->bdg_port_index[j];
902 vpna = b->bdg_ports[i];
903 ND("checking %s", vpna->up.name);
904 if (!strcmp(vpna->up.name, nr_name)) {
905 netmap_adapter_get(&vpna->up);
906 ND("found existing if %s refs %d", nr_name)
911 /* not found, should we create it? */
914 /* yes we should, see if we have space to attach entries */
915 needed = 2; /* in some cases we only need 1 */
916 if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
917 D("bridge full %d, cannot create new port", b->bdg_active_ports);
920 /* record the next two ports available, but do not allocate yet */
921 cand = b->bdg_port_index[b->bdg_active_ports];
922 cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
923 ND("+++ bridge %s port %s used %d avail %d %d",
924 b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);
927 * try see if there is a matching NIC with this name
928 * (after the bridge's name)
930 ifname = nr_name + b->bdg_namelen + 1;
931 ifp = ifunit_ref(ifname);
933 /* Create an ephemeral virtual port.
934 * This block contains all the ephemeral-specific logic.
937 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
942 /* bdg_netmap_attach creates a struct netmap_adapter */
943 error = netmap_vp_create(hdr, NULL, nmd, &vpna);
945 D("error %d", error);
948 /* shortcut - we can skip get_hw_na(),
949 * ownership check and nm_bdg_attach()
953 struct netmap_adapter *hw;
955 /* the vale:nic syntax is only valid for some commands */
956 switch (hdr->nr_reqtype) {
957 case NETMAP_REQ_VALE_ATTACH:
958 case NETMAP_REQ_VALE_DETACH:
959 case NETMAP_REQ_VALE_POLLING_ENABLE:
960 case NETMAP_REQ_VALE_POLLING_DISABLE:
967 error = netmap_get_hw_na(ifp, nmd, &hw);
968 if (error || hw == NULL)
971 /* host adapter might not be created */
972 error = hw->nm_bdg_attach(nr_name, hw);
976 hostna = hw->na_hostvp;
977 if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
978 /* Check if we need to skip the host rings. */
979 struct nmreq_vale_attach *areq =
980 (struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
981 if (areq->reg.nr_mode != NR_REG_NIC_SW) {
988 vpna->bdg_port = cand;
989 ND("NIC %p to bridge port %d", vpna, cand);
990 /* bind the port to the bridge (virtual ports are not active) */
991 b->bdg_ports[cand] = vpna;
993 b->bdg_active_ports++;
994 if (hostna != NULL) {
995 /* also bind the host stack to the bridge */
996 b->bdg_ports[cand2] = hostna;
997 hostna->bdg_port = cand2;
999 b->bdg_active_ports++;
1000 ND("host %p to bridge port %d", hostna, cand2);
1002 ND("if %s refs %d", ifname, vpna->up.na_refcount);
1005 netmap_adapter_get(*na);
1014 /* Process NETMAP_REQ_VALE_ATTACH.
/* NOTE(review): fragment -- locking, error labels, and several returns
 * are missing. Flow: authenticate against an exclusive bridge, resolve
 * the requested memory allocator, reject already-attached or otherwise
 * owned ports, create/look up the adapter, let its nm_bdg_ctl put the
 * hwna in netmap mode, and report the assigned port index. */
1017 nm_bdg_ctl_attach(struct nmreq_header *hdr, void *auth_token)
1019 struct nmreq_vale_attach *req =
1020 (struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
1021 struct netmap_vp_adapter * vpna;
1022 struct netmap_adapter *na;
1023 struct netmap_mem_d *nmd = NULL;
1024 struct nm_bridge *b = NULL;
1028 /* permission check for modified bridges */
1029 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */);
1030 if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
1035 if (req->reg.nr_mem_id) {
1036 nmd = netmap_mem_find(req->reg.nr_mem_id);
1043 /* check for existing one */
1044 error = netmap_get_bdg_na(hdr, &na, nmd, 0);
1049 error = netmap_get_bdg_na(hdr, &na,
1050 nmd, 1 /* create if not exists */);
1051 if (error) { /* no device */
1055 if (na == NULL) { /* VALE prefix missing */
1060 if (NETMAP_OWNED_BY_ANY(na)) {
1065 if (na->nm_bdg_ctl) {
1066 /* nop for VALE ports. The bwrap needs to put the hwna
1067 * in netmap mode (see netmap_bwrap_bdg_ctl)
1069 error = na->nm_bdg_ctl(hdr, na);
1072 ND("registered %s to netmap-mode", na->name);
1074 vpna = (struct netmap_vp_adapter *)na;
1075 req->port_index = vpna->bdg_port;
1080 netmap_adapter_put(na);
/* True if the adapter is a bwrap (a NIC wrapped for bridge attachment),
 * identified by its register callback. */
1087 nm_is_bwrap(struct netmap_adapter *na)
1089 return na->nm_register == netmap_bwrap_reg;
1092 /* Process NETMAP_REQ_VALE_DETACH.
/* NOTE(review): fragment -- locking, error assignments before the
 * gotos, and the final return are missing. Refuses to detach a bwrap
 * that is still in polling mode, and refuses a persistent port whose
 * reference is not the bridge's own (na->na_vp != vpna). */
1095 nm_bdg_ctl_detach(struct nmreq_header *hdr, void *auth_token)
1097 struct nmreq_vale_detach *nmreq_det = (void *)(uintptr_t)hdr->nr_body;
1098 struct netmap_vp_adapter *vpna;
1099 struct netmap_adapter *na;
1100 struct nm_bridge *b = NULL;
1104 /* permission check for modified bridges */
1105 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */);
1106 if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
1111 error = netmap_get_bdg_na(hdr, &na, NULL, 0 /* don't create */);
1112 if (error) { /* no device, or another bridge or user owns the device */
1116 if (na == NULL) { /* VALE prefix missing */
1119 } else if (nm_is_bwrap(na) &&
1120 ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
1121 /* Don't detach a NIC with polling */
1126 vpna = (struct netmap_vp_adapter *)na;
1127 if (na->na_vp != vpna) {
1128 /* trying to detach first attach of VALE persistent port attached
1134 nmreq_det->port_index = vpna->bdg_port;
1136 if (na->nm_bdg_ctl) {
1137 /* remove the port from bridge. The bwrap
1138 * also needs to put the hwna in normal mode
1140 error = na->nm_bdg_ctl(hdr, na);
1144 netmap_adapter_put(na);
1151 struct nm_bdg_polling_state;
1154 struct nm_kctx *nmk;
1157 struct nm_bdg_polling_state *bps;
1160 struct nm_bdg_polling_state {
1163 struct netmap_bwrap_adapter *bna;
1169 struct nm_bdg_kthread *kthreads;
/* Worker body for a polling kthread: notify the RX krings in the
 * [qfirst, qlast) range of the wrapped hw adapter.
 * NOTE(review): fragment -- the qlast load and the kring = kring0[i]
 * assignment inside the loop are missing from this listing. */
1173 netmap_bwrap_polling(void *data, int is_kthread)
1175 struct nm_bdg_kthread *nbk = data;
1176 struct netmap_bwrap_adapter *bna;
1177 u_int qfirst, qlast, i;
1178 struct netmap_kring **kring0, *kring;
1182 qfirst = nbk->qfirst;
1184 bna = nbk->bps->bna;
1185 kring0 = NMR(bna->hwna, NR_RX);
1187 for (i = qfirst; i < qlast; i++) {
1189 kring->nm_notify(kring, 0);
/* Allocate and configure one polling kernel context per CPU.
 * Each context polls either all rings (SINGLE_CPU with one core) or a
 * single ring chosen by its affinity.
 * FIX(review): in the error-unwind loop the listing indexed
 * "bps->kthreads + i" while iterating j, destroying the same
 * (never-created) slot repeatedly and leaking the contexts that were
 * created -- corrected to "+ j" on original line 1230.
 * NOTE(review): fragment -- the error label and the success return are
 * missing from this listing. */
1194 nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
1196 struct nm_kctx_cfg kcfg;
1199 bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus);
1200 if (bps->kthreads == NULL)
1203 bzero(&kcfg, sizeof(kcfg));
1204 kcfg.worker_fn = netmap_bwrap_polling;
1205 kcfg.use_kthread = 1;
1206 for (i = 0; i < bps->ncpus; i++) {
1207 struct nm_bdg_kthread *t = bps->kthreads + i;
1208 int all = (bps->ncpus == 1 &&
1209 bps->mode == NETMAP_POLLING_MODE_SINGLE_CPU);
1210 int affinity = bps->cpu_from + i;
1213 t->qfirst = all ? bps->qfirst /* must be 0 */: affinity;
1214 t->qlast = all ? bps->qlast : t->qfirst + 1;
1215 D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
1219 kcfg.worker_private = t;
1220 t->nmk = nm_os_kctx_create(&kcfg, NULL);
1221 if (t->nmk == NULL) {
1224 nm_os_kctx_worker_setaff(t->nmk, affinity);
1229 for (j = 0; j < i; j++) {
1230 struct nm_bdg_kthread *t = bps->kthreads + j;
1231 nm_os_kctx_destroy(t->nmk);
1233 nm_os_free(bps->kthreads);
1237 /* A variant of ptnetmap_start_kthreads() */
/* Start every configured polling context; on failure stop the ones
 * already started and mark the state stopped again.
 * FIX(review): the unwind loop indexed "bps->kthreads + i" while
 * iterating j, stopping the failed (never-started) context repeatedly
 * and leaving the started ones running -- corrected to "+ j" on
 * original line 1261.
 * NOTE(review): fragment -- the !bps guard body, the error label, and
 * the returns are missing from this listing. */
1239 nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
1244 D("polling is not configured");
1247 bps->stopped = false;
1249 for (i = 0; i < bps->ncpus; i++) {
1250 struct nm_bdg_kthread *t = bps->kthreads + i;
1251 error = nm_os_kctx_worker_start(t->nmk);
1253 D("error in nm_kthread_start()");
1260 for (j = 0; j < i; j++) {
1261 struct nm_bdg_kthread *t = bps->kthreads + j;
1262 nm_os_kctx_worker_stop(t->nmk);
1264 bps->stopped = true;
/* Stop and destroy all polling contexts, then mark the state stopped.
 * NOTE(review): fragment -- the NULL guard on bps->kthreads (original
 * lines 1271-1274) is missing from this listing. */
1269 nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
1276 for (i = 0; i < bps->ncpus; i++) {
1277 struct nm_bdg_kthread *t = bps->kthreads + i;
1278 nm_os_kctx_worker_stop(t->nmk);
1279 nm_os_kctx_destroy(t->nmk);
1281 bps->stopped = true;
/* Validate a VALE polling request and fill in the polling-state fields
 * (mode, ring range, first CPU, CPU count).
 * NOTE(review): fragment -- the error returns, the qfirst/core_from
 * assignments in each branch, and the success return are missing from
 * this listing; only the validation structure is visible. */
1285 get_polling_cfg(struct nmreq_vale_polling *req, struct netmap_adapter *na,
1286 struct nm_bdg_polling_state *bps)
1288 unsigned int avail_cpus, core_from;
1289 unsigned int qfirst, qlast;
1290 uint32_t i = req->nr_first_cpu_id;
1291 uint32_t req_cpus = req->nr_num_polling_cpus;
1293 avail_cpus = nm_os_ncpus();
1295 if (req_cpus == 0) {
1296 D("req_cpus must be > 0");
1298 } else if (req_cpus >= avail_cpus) {
1299 D("Cannot use all the CPUs in the system");
1303 if (req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU) {
1304 /* Use a separate core for each ring. If nr_num_polling_cpus>1
1305 * more consecutive rings are polled.
1306 * For example, if nr_first_cpu_id=2 and nr_num_polling_cpus=2,
1307 * ring 2 and 3 are polled by core 2 and 3, respectively. */
1308 if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
1309 D("Rings %u-%u not in range (have %d rings)",
1310 i, i + req_cpus, nma_get_nrings(na, NR_RX));
1314 qlast = qfirst + req_cpus;
1317 } else if (req->nr_mode == NETMAP_POLLING_MODE_SINGLE_CPU) {
1318 /* Poll all the rings using a core specified by nr_first_cpu_id.
1319 * the number of cores must be 1. */
1320 if (req_cpus != 1) {
1321 D("ncpus must be 1 for NETMAP_POLLING_MODE_SINGLE_CPU "
1322 "(was %d)", req_cpus);
1326 qlast = nma_get_nrings(na, NR_RX);
1329 D("Invalid polling mode");
1333 bps->mode = req->nr_mode;
1334 bps->qfirst = qfirst;
1336 bps->cpu_from = core_from;
1337 bps->ncpus = req_cpus;
1338 D("%s qfirst %u qlast %u cpu_from %u ncpus %u",
1339 req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU ?
1341 qfirst, qlast, core_from, req_cpus);
/* Enable kthread-based polling on a bwrap adapter: build the polling
 * state, create the kthreads, disable NIC interrupts, and start the
 * workers; unwind everything on failure.
 * NOTE(review): fragment -- the malloc-failure branch, the error
 * returns after each check, and the final return are missing from this
 * listing. */
1346 nm_bdg_ctl_polling_start(struct nmreq_vale_polling *req, struct netmap_adapter *na)
1348 struct nm_bdg_polling_state *bps;
1349 struct netmap_bwrap_adapter *bna;
1352 bna = (struct netmap_bwrap_adapter *)na;
1353 if (bna->na_polling_state) {
1354 D("ERROR adapter already in polling mode");
1358 bps = nm_os_malloc(sizeof(*bps));
1361 bps->configured = false;
1362 bps->stopped = true;
1364 if (get_polling_cfg(req, na, bps)) {
1369 if (nm_bdg_create_kthreads(bps)) {
1374 bps->configured = true;
1375 bna->na_polling_state = bps;
1378 /* disable interrupts if possible */
1379 nma_intr_enable(bna->hwna, 0);
1380 /* start kthread now */
1381 error = nm_bdg_polling_start_kthreads(bps);
1383 D("ERROR nm_bdg_polling_start_kthread()");
1384 nm_os_free(bps->kthreads);
1386 bna->na_polling_state = NULL;
1387 nma_intr_enable(bna->hwna, 1);
/* Disable polling on a bwrap adapter: stop and destroy the kthreads,
 * free the state, and re-enable NIC interrupts.
 * NOTE(review): fragment -- the error return, the nm_os_free(bps) call,
 * and the final return are missing from this listing. */
1393 nm_bdg_ctl_polling_stop(struct netmap_adapter *na)
1395 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na;
1396 struct nm_bdg_polling_state *bps;
1398 if (!bna->na_polling_state) {
1399 D("ERROR adapter is not in polling mode");
1402 bps = bna->na_polling_state;
1403 nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state);
1404 bps->configured = false;
1406 bna->na_polling_state = NULL;
1407 /* reenable interrupts */
1408 nma_intr_enable(bna->hwna, 1);
/* Dispatch a NETMAP_BDG_POLLING_ON/OFF request: look up the named
 * bridge port and start or stop polling on it. Only bwrap ports
 * (NICs attached to a VALE switch) support polling mode.
 */
1413 nm_bdg_polling(struct nmreq_header *hdr)
1415 struct nmreq_vale_polling *req =
1416 (struct nmreq_vale_polling *)(uintptr_t)hdr->nr_body;
1417 struct netmap_adapter *na = NULL;
1421 error = netmap_get_bdg_na(hdr, &na, NULL, /*create=*/0);
1423 if (!nm_is_bwrap(na)) {
1425 } else if (hdr->nr_reqtype == NETMAP_BDG_POLLING_ON) {
1426 error = nm_bdg_ctl_polling_start(req, na);
/* keep a reference while polling is active */
1428 netmap_adapter_get(na);
1430 error = nm_bdg_ctl_polling_stop(na);
1432 netmap_adapter_put(na);
1434 netmap_adapter_put(na);
1435 } else if (!na && !error) {
1436 /* Not VALE port. */
1444 /* Process NETMAP_REQ_VALE_LIST. */
1446 netmap_bdg_list(struct nmreq_header *hdr)
1448 struct nmreq_vale_list *req =
1449 (struct nmreq_vale_list *)(uintptr_t)hdr->nr_body;
1450 int namelen = strlen(hdr->nr_name);
1451 struct nm_bridge *b, *bridges;
1452 struct netmap_vp_adapter *vpna;
1453 int error = 0, i, j;
1456 netmap_bns_getbridges(&bridges, &num_bridges);
1458 /* this is used to enumerate bridges and ports */
1459 if (namelen) { /* look up indexes of bridge and port */
/* name must start with the VALE prefix to be a bridge name */
1460 if (strncmp(hdr->nr_name, NM_BDG_NAME,
1461 strlen(NM_BDG_NAME))) {
1465 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */);
1471 req->nr_bridge_idx = b - bridges; /* bridge index */
1472 req->nr_port_idx = NM_BDG_NOPORT;
/* scan only the active ports of the bridge */
1473 for (j = 0; j < b->bdg_active_ports; j++) {
1474 i = b->bdg_port_index[j];
1475 vpna = b->bdg_ports[i];
1477 D("This should not happen");
1480 /* the former and the latter identify a
1481 * virtual port and a NIC, respectively
1483 if (!strcmp(vpna->up.name, hdr->nr_name)) {
1484 req->nr_port_idx = i; /* port index */
1490 /* return the first non-empty entry starting from
1491 * bridge nr_arg1 and port nr_arg2.
1493 * Users can detect the end of the same bridge by
1494 * seeing the new and old value of nr_arg1, and can
1495 * detect the end of all the bridge by error != 0
1497 i = req->nr_bridge_idx;
1498 j = req->nr_port_idx;
1501 for (error = ENOENT; i < NM_BRIDGES; i++) {
1503 for ( ; j < NM_BDG_MAXPORTS; j++) {
1504 if (b->bdg_ports[j] == NULL)
1506 vpna = b->bdg_ports[j];
1507 /* write back the VALE switch name */
1508 strncpy(hdr->nr_name, vpna->up.name,
1513 j = 0; /* following bridges scan from 0 */
1516 req->nr_bridge_idx = i;
1517 req->nr_port_idx = j;
1524 /* Called by external kernel modules (e.g., Openvswitch).
1525 * to set configure/lookup/dtor functions of a VALE instance.
1526 * Register callbacks to the given bridge. 'name' may be just
1527 * bridge's name (including ':' if it is not just NM_BDG_NAME).
1529 * Called without NMG_LOCK.
1533 netmap_bdg_regops(const char *name, struct netmap_bdg_ops *bdg_ops, void *private_data, void *auth_token)
1535 struct nm_bridge *b;
/* don't create the bridge: a caller may only hook an existing one */
1539 b = nm_find_bridge(name, 0 /* don't create */);
/* the auth token protects exclusive bridges from foreign regops */
1544 if (!nm_bdg_valid_auth_token(b, auth_token)) {
1551 /* resetting the bridge */
1552 bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
1553 b->bdg_ops = &default_bdg_ops;
1554 b->private_data = b->ht;
1556 /* modifying the bridge */
1557 b->private_data = private_data;
1558 b->bdg_ops = bdg_ops;
1567 /* Called by external kernel modules (e.g., Openvswitch).
1568 * to modify the private data previously given to regops().
1569 * 'name' may be just bridge's name (including ':' if it
1570 * is not just NM_BDG_NAME).
1571 * Called without NMG_LOCK.
1574 nm_bdg_update_private_data(const char *name, bdg_update_private_data_fn_t callback,
1575 void *callback_data, void *auth_token)
1577 void *private_data = NULL;
1578 struct nm_bridge *b;
1582 b = nm_find_bridge(name, 0 /* don't create */);
1585 goto unlock_update_priv;
1587 if (!nm_bdg_valid_auth_token(b, auth_token)) {
1589 goto unlock_update_priv;
/* the callback computes the new private data from the old one;
 * it reports failures through the 'error' out parameter */
1592 private_data = callback(b->private_data, callback_data, &error);
1593 b->private_data = private_data;
/* Forward an nm_ifreq configuration request to the bridge's
 * config callback, if one was registered via regops().
 */
1602 netmap_bdg_config(struct nm_ifreq *nr)
1604 struct nm_bridge *b;
1608 b = nm_find_bridge(nr->nifr_name, 0);
1614 /* Don't call config() with NMG_LOCK() held */
1616 if (b->bdg_ops->config != NULL)
1617 error = b->bdg_ops->config(nr);
1623 /* nm_krings_create callback for VALE ports.
1624 * Calls the standard netmap_krings_create, then adds leases on rx
1625 * rings and bdgfwd on tx rings.
1628 netmap_vp_krings_create(struct netmap_adapter *na)
1633 u_int nrx = netmap_real_rings(na, NR_RX);
1636 * Leases are attached to RX rings on vale ports
/* one lease slot (uint32_t) per rx descriptor, for every rx ring */
1638 tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
1640 error = netmap_krings_create(na, tailroom);
1644 leases = na->tailroom;
1646 for (i = 0; i < nrx; i++) { /* Receive rings */
1647 na->rx_rings[i]->nkr_leases = leases;
1648 leases += na->num_rx_desc;
1651 error = nm_alloc_bdgfwd(na);
/* undo krings_create if the forwarding tables can't be allocated */
1653 netmap_krings_delete(na);
1661 /* nm_krings_delete callback for VALE ports. */
1663 netmap_vp_krings_delete(struct netmap_adapter *na)
1666 netmap_krings_delete(na);
1671 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
1672 struct netmap_vp_adapter *na, u_int ring_nr);
1676 * main dispatch routine for the bridge.
1677 * Grab packets from a kring, move them into the ft structure
1678 * associated to the tx (input) port. Max one instance per port,
1679 * filtered on input (ioctl, poll or XXX).
1680 * Returns the next position in the ring.
1683 nm_bdg_preflush(struct netmap_kring *kring, u_int end)
1685 struct netmap_vp_adapter *na =
1686 (struct netmap_vp_adapter*)kring->na;
1687 struct netmap_ring *ring = kring->ring;
1688 struct nm_bdg_fwd *ft;
1689 u_int ring_nr = kring->ring_id;
1690 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
1691 u_int ft_i = 0; /* start from 0 */
1692 u_int frags = 1; /* how many frags ? */
1693 struct nm_bridge *b = na->na_bdg;
1695 /* To protect against modifications to the bridge we acquire a
1696 * shared lock, waiting if we can sleep (if the source port is
1697 * attached to a user process) or with a trylock otherwise (NICs).
1699 ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
1700 if (na->up.na_flags & NAF_BDG_MAYSLEEP)
1702 else if (!BDG_RTRYLOCK(b))
1704 ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
/* collect slots [j..end) into the forwarding table ft */
1707 for (; likely(j != end); j = nm_next(j, lim)) {
1708 struct netmap_slot *slot = &ring->slot[j];
1711 ft[ft_i].ft_len = slot->len;
1712 ft[ft_i].ft_flags = slot->flags;
1713 ft[ft_i].ft_offset = 0;
1715 ND("flags is 0x%x", slot->flags);
1716 /* we do not use the buf changed flag, but we still need to reset it */
1717 slot->flags &= ~NS_BUF_CHANGED;
1719 /* this slot goes into a list so initialize the link field */
1720 ft[ft_i].ft_next = NM_FT_NULL;
1721 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
1722 (void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
1723 if (unlikely(buf == NULL)) {
1724 RD(5, "NULL %s buffer pointer from %s slot %d len %d",
1725 (slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
1726 kring->name, j, ft[ft_i].ft_len);
/* substitute a valid but empty buffer so the batch stays well-formed */
1727 buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
1728 ft[ft_i].ft_len = 0;
1729 ft[ft_i].ft_flags = 0;
1731 __builtin_prefetch(buf);
1733 if (slot->flags & NS_MOREFRAG) {
1737 if (unlikely(netmap_verbose && frags > 1))
1738 RD(5, "%d frags at %d", frags, ft_i - frags);
/* record the fragment count on the first fragment of the packet */
1739 ft[ft_i - frags].ft_frags = frags;
/* flush early when the batch limit is reached */
1741 if (unlikely((int)ft_i >= bridge_batch))
1742 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1745 /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
1746 * have to fix frags count. */
1748 ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
1749 ft[ft_i - frags].ft_frags = frags;
1750 D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
/* final flush of whatever is left in the table */
1753 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1759 /* ----- FreeBSD if_bridge hash function ------- */
1762 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
1763 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
1765 * http://www.burtleburtle.net/bob/hash/spooky.html
/* Jenkins mix step: reversibly mixes three 32-bit accumulators. */
1767 #define mix(a, b, c) \
1769 a -= b; a -= c; a ^= (c >> 13); \
1770 b -= c; b -= a; b ^= (a << 8); \
1771 c -= a; c -= b; c ^= (b >> 13); \
1772 a -= b; a -= c; a ^= (c >> 12); \
1773 b -= c; b -= a; b ^= (a << 16); \
1774 c -= a; c -= b; c ^= (b >> 5); \
1775 a -= b; a -= c; a ^= (c >> 3); \
1776 b -= c; b -= a; b ^= (a << 10); \
1777 c -= a; c -= b; c ^= (b >> 15); \
1778 } while (/*CONSTCOND*/0)
/* Hash a 6-byte MAC address into a forwarding-table bucket index
 * (range [0, NM_BDG_HASH)), using the Jenkins mix above.
 */
1781 static __inline uint32_t
1782 nm_bridge_rthash(const uint8_t *addr)
1784 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
1794 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1)
1795 return (c & BRIDGE_RTHASH_MASK);
1801 /* nm_register callback for VALE ports */
1803 netmap_vp_reg(struct netmap_adapter *na, int onoff)
1805 struct netmap_vp_adapter *vpna =
1806 (struct netmap_vp_adapter*)na;
1810 /* persistent ports may be put in netmap mode
1811 * before being attached to a bridge
/* exclude concurrent forwarding while the ring modes change */
1814 BDG_WLOCK(vpna->na_bdg);
1817 for (i = 0; i < netmap_real_rings(na, t); i++) {
1818 struct netmap_kring *kring = NMR(na, t)[i];
1820 if (nm_kring_pending_on(kring))
1821 kring->nr_mode = NKR_NETMAP_ON;
1824 if (na->active_fds == 0)
1825 na->na_flags |= NAF_NETMAP_ON;
1826 /* XXX on FreeBSD, persistent VALE ports should also
1827 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
1830 if (na->active_fds == 0)
1831 na->na_flags &= ~NAF_NETMAP_ON;
1833 for (i = 0; i < netmap_real_rings(na, t); i++) {
1834 struct netmap_kring *kring = NMR(na, t)[i];
1836 if (nm_kring_pending_off(kring))
1837 kring->nr_mode = NKR_NETMAP_OFF;
1842 BDG_WUNLOCK(vpna->na_bdg);
1848 * Lookup function for a learning bridge.
1849 * Update the hash table with the source address,
1850 * and then returns the destination port index, and the
1851 * ring in *dst_ring (at the moment, always use ring 0)
1854 netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
1855 struct netmap_vp_adapter *na, void *private_data)
1857 uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset;
1858 u_int buf_len = ft->ft_len - ft->ft_offset;
1859 struct nm_hash_ent *ht = private_data;
1861 u_int dst, mysrc = na->bdg_port;
1862 uint64_t smac, dmac;
1866 return NM_BDG_NOPORT;
1869 if (ft->ft_flags & NS_INDIRECT) {
/* userspace buffer: copy the ethernet header into a local buffer */
1870 if (copyin(buf, indbuf, sizeof(indbuf))) {
1871 return NM_BDG_NOPORT;
/* extract 48-bit destination and source MACs from the header */
1876 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
1877 smac = le64toh(*(uint64_t *)(buf + 4));
1881 * The hash is somewhat expensive, there might be some
1882 * worthwhile optimizations here.
1884 if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
1886 sh = nm_bridge_rthash(s); /* hash of source */
1887 /* update source port forwarding entry */
1888 na->last_smac = ht[sh].mac = smac; /* XXX expire ? */
1889 ht[sh].ports = mysrc;
1891 D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
1892 s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
/* default destination: flood, unless a unicast entry matches */
1894 dst = NM_BDG_BROADCAST;
1895 if ((buf[0] & 1) == 0) { /* unicast */
1896 dh = nm_bridge_rthash(buf); /* hash of dst */
1897 if (ht[dh].mac == dmac) { /* found dst */
1906 * Available space in the ring. Only used in VALE code
1907 * and only with is_rx = 1
1909 static inline uint32_t
1910 nm_kr_space(struct netmap_kring *k, int is_rx)
/* slots between nr_hwcur and nkr_hwlease are busy (leased) */
1915 int busy = k->nkr_hwlease - k->nr_hwcur;
1917 busy += k->nkr_num_slots;
1918 space = k->nkr_num_slots - 1 - busy;
1920 /* XXX never used in this branch */
1921 space = k->nr_hwtail - k->nkr_hwlease;
1923 space += k->nkr_num_slots;
/* sanity check on ring indexes; log and recover on corruption */
1927 if (k->nkr_hwlease >= k->nkr_num_slots ||
1928 k->nr_hwcur >= k->nkr_num_slots ||
1929 k->nr_tail >= k->nkr_num_slots ||
1931 busy >= k->nkr_num_slots) {
1932 D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1933 k->nkr_lease_idx, k->nkr_num_slots);
1942 /* make a lease on the kring for N positions. return the
1944 * XXX only used in VALE code and with is_rx = 1
1946 static inline uint32_t
1947 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
1949 uint32_t lim = k->nkr_num_slots - 1;
1950 uint32_t lease_idx = k->nkr_lease_idx;
/* mark the lease as not-yet-completed and advance the lease index */
1952 k->nkr_leases[lease_idx] = NR_NOSLOT;
1953 k->nkr_lease_idx = nm_next(lease_idx, lim);
1955 if (n > nm_kr_space(k, is_rx)) {
1956 D("invalid request for %d slots", n);
1959 /* XXX verify that there are n slots */
/* advance hwlease by n, wrapping around the ring */
1960 k->nkr_hwlease += n;
1961 if (k->nkr_hwlease > lim)
1962 k->nkr_hwlease -= lim + 1;
/* sanity check on ring indexes after the update */
1964 if (k->nkr_hwlease >= k->nkr_num_slots ||
1965 k->nr_hwcur >= k->nkr_num_slots ||
1966 k->nr_hwtail >= k->nkr_num_slots ||
1967 k->nkr_lease_idx >= k->nkr_num_slots) {
1968 D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
1970 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1971 k->nkr_lease_idx, k->nkr_num_slots);
1978 * This flush routine supports only unicast and broadcast but a large
1979 * number of ports, and lets us replace the learn and dispatch functions.
1982 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
1985 struct nm_bdg_q *dst_ents, *brddst;
1986 uint16_t num_dsts = 0, *dsts;
1987 struct nm_bridge *b = na->na_bdg;
1988 u_int i, me = na->bdg_port;
1991 * The work area (pointed by ft) is followed by an array of
1992 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS
1993 * queues per port plus one for the broadcast traffic.
1994 * Then we have an array of destination indexes.
1996 dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
1997 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
1999 /* first pass: find a destination for each packet in the batch */
2000 for (i = 0; likely(i < n); i += ft[i].ft_frags) {
2001 uint8_t dst_ring = ring_nr; /* default, same ring as origin */
2002 uint16_t dst_port, d_i;
2004 struct nm_bdg_fwd *start_ft = NULL;
2006 ND("slot %d frags %d", i, ft[i].ft_frags);
2008 if (na->up.virt_hdr_len < ft[i].ft_len) {
2009 ft[i].ft_offset = na->up.virt_hdr_len;
2011 } else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) {
/* virtio-net header fills the whole first fragment:
 * the ethernet frame starts at the second fragment */
2012 ft[i].ft_offset = ft[i].ft_len;
2013 start_ft = &ft[i+1];
2015 /* Drop the packet if the virtio-net header is not into the first
2016 * fragment nor at the very beginning of the second.
2020 dst_port = b->bdg_ops->lookup(start_ft, &dst_ring, na, b->private_data);
2021 if (netmap_verbose > 255)
2022 RD(5, "slot %d port %d -> %d", i, me, dst_port);
2023 if (dst_port >= NM_BDG_NOPORT)
2024 continue; /* this packet is identified to be dropped */
2025 else if (dst_port == NM_BDG_BROADCAST)
2026 dst_ring = 0; /* broadcasts always go to ring 0 */
2027 else if (unlikely(dst_port == me ||
2028 !b->bdg_ports[dst_port]))
2031 /* get a position in the scratch pad */
2032 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
2035 /* append the first fragment to the list */
2036 if (d->bq_head == NM_FT_NULL) { /* new destination */
2037 d->bq_head = d->bq_tail = i;
2038 /* remember this position to be scanned later */
2039 if (dst_port != NM_BDG_BROADCAST)
2040 dsts[num_dsts++] = d_i;
2042 ft[d->bq_tail].ft_next = i;
2045 d->bq_len += ft[i].ft_frags;
2049 * Broadcast traffic goes to ring 0 on all destinations.
2050 * So we need to add these rings to the list of ports to scan.
2051 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
2052 * expensive. We should keep a compact list of active destinations
2053 * so we could shorten this loop.
2055 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
2056 if (brddst->bq_head != NM_FT_NULL) {
2058 for (j = 0; likely(j < b->bdg_active_ports); j++) {
2060 i = b->bdg_port_index[j];
2061 if (unlikely(i == me))
2063 d_i = i * NM_BDG_MAXRINGS;
2064 if (dst_ents[d_i].bq_head == NM_FT_NULL)
2065 dsts[num_dsts++] = d_i;
2069 ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
2070 /* second pass: scan destinations */
2071 for (i = 0; i < num_dsts; i++) {
2072 struct netmap_vp_adapter *dst_na;
2073 struct netmap_kring *kring;
2074 struct netmap_ring *ring;
2075 u_int dst_nr, lim, j, d_i, next, brd_next;
2076 u_int needed, howmany;
2077 int retry = netmap_txsync_retry;
2079 uint32_t my_start = 0, lease_idx = 0;
2081 int virt_hdr_mismatch = 0;
2084 ND("second pass %d port %d", i, d_i);
2086 // XXX fix the division
2087 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
2088 /* protect from the lookup function returning an inactive
2091 if (unlikely(dst_na == NULL))
2093 if (dst_na->up.na_flags & NAF_SW_ONLY)
2096 * The interface may be in !netmap mode in two cases:
2097 * - when na is attached but not activated yet;
2098 * - when na is being deactivated but is still attached.
2100 if (unlikely(!nm_netmap_on(&dst_na->up))) {
2101 ND("not in netmap mode!");
2105 /* there is at least one either unicast or broadcast packet */
2106 brd_next = brddst->bq_head;
2108 /* we need to reserve this many slots. If fewer are
2109 * available, some packets will be dropped.
2110 * Packets may have multiple fragments, so there is
2111 * a chance that we may not use all of the slots
2112 * we have claimed, so we will need to handle the leftover
2113 * ones when we regain the lock.
2115 needed = d->bq_len + brddst->bq_len;
2117 if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
2118 if (netmap_verbose) {
2119 RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
2120 dst_na->up.virt_hdr_len);
2122 /* There is a virtio-net header/offloadings mismatch between
2123 * source and destination. The slower mismatch datapath will
2124 * be used to cope with all the mismatches.
2126 virt_hdr_mismatch = 1;
2127 if (dst_na->mfs < na->mfs) {
2128 /* We may need to do segmentation offloadings, and so
2129 * we may need a number of destination slots greater
2130 * than the number of input slots ('needed').
2131 * We look for the smallest integer 'x' which satisfies:
2132 * needed * na->mfs + x * H <= x * na->mfs
2133 * where 'H' is the length of the longest header that may
2134 * be replicated in the segmentation process (e.g. for
2135 * TCPv4 we must account for ethernet header, IP header
2136 * and TCPv4 header).
2138 KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0"));
2139 needed = (needed * na->mfs) /
2140 (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
2141 ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
2145 ND(5, "pass 2 dst %d is %x %s",
2146 i, d_i, is_vp ? "virtual" : "nic/host");
2147 dst_nr = d_i & (NM_BDG_MAXRINGS-1);
2148 nrings = dst_na->up.num_rx_rings;
2149 if (dst_nr >= nrings)
2150 dst_nr = dst_nr % nrings;
2151 kring = dst_na->up.rx_rings[dst_nr];
2153 /* the destination ring may have not been opened for RX */
2154 if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON))
2156 lim = kring->nkr_num_slots - 1;
2160 if (dst_na->retry && retry) {
2161 /* try to get some free slot from the previous run */
2162 kring->nm_notify(kring, 0);
2163 /* actually useful only for bwraps, since there
2164 * the notify will trigger a txsync on the hwna. VALE ports
2165 * have dst_na->retry == 0
2168 /* reserve the buffers in the queue and an entry
2169 * to report completion, and drop lock.
2170 * XXX this might become a helper function.
2172 mtx_lock(&kring->q_lock);
2173 if (kring->nkr_stopped) {
2174 mtx_unlock(&kring->q_lock);
2177 my_start = j = kring->nkr_hwlease;
2178 howmany = nm_kr_space(kring, 1);
2179 if (needed < howmany)
2181 lease_idx = nm_kr_lease(kring, howmany, 1);
2182 mtx_unlock(&kring->q_lock);
2184 /* only retry if we need more than available slots */
2185 if (retry && needed <= howmany)
2188 /* copy to the destination queue */
2189 while (howmany > 0) {
2190 struct netmap_slot *slot;
2191 struct nm_bdg_fwd *ft_p, *ft_end;
2194 /* find the queue from which we pick next packet.
2195 * NM_FT_NULL is always higher than valid indexes
2196 * so we never dereference it if the other list
2197 * has packets (and if both are empty we never
2200 if (next < brd_next) {
2202 next = ft_p->ft_next;
2203 } else { /* insert broadcast */
2204 ft_p = ft + brd_next;
2205 brd_next = ft_p->ft_next;
2207 cnt = ft_p->ft_frags; // cnt > 0
2208 if (unlikely(cnt > howmany))
2209 break; /* no more space */
2210 if (netmap_verbose && cnt > 1)
2211 RD(5, "rx %d frags to %d", cnt, j);
2212 ft_end = ft_p + cnt;
2213 if (unlikely(virt_hdr_mismatch)) {
2214 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
2218 char *dst, *src = ft_p->ft_buf;
2219 size_t copy_len = ft_p->ft_len, dst_len = copy_len;
2221 slot = &ring->slot[j];
2222 dst = NMB(&dst_na->up, slot);
2224 ND("send [%d] %d(%d) bytes at %s:%d",
2225 i, (int)copy_len, (int)dst_len,
2226 NM_IFPNAME(dst_ifp), j);
2227 /* round to a multiple of 64 */
2228 copy_len = (copy_len + 63) & ~63;
2230 if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
2231 copy_len > NETMAP_BUF_SIZE(&na->up))) {
2232 RD(5, "invalid len %d, down to 64", (int)copy_len);
2233 copy_len = dst_len = 64; // XXX
2235 if (ft_p->ft_flags & NS_INDIRECT) {
2236 if (copyin(src, dst, copy_len)) {
2237 // invalid user pointer, pretend len is 0
2241 //memcpy(dst, src, copy_len);
2242 pkt_copy(src, dst, (int)copy_len);
2244 slot->len = dst_len;
2245 slot->flags = (cnt << 8)| NS_MOREFRAG;
2246 j = nm_next(j, lim);
2249 } while (ft_p != ft_end);
2250 slot->flags = (cnt << 8); /* clear flag on last entry */
2253 if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
2257 /* current position */
2258 uint32_t *p = kring->nkr_leases; /* shorthand */
2259 uint32_t update_pos;
2260 int still_locked = 1;
2262 mtx_lock(&kring->q_lock);
2263 if (unlikely(howmany > 0)) {
2264 /* not used all bufs. If i am the last one
2265 * i can recover the slots, otherwise must
2266 * fill them with 0 to mark empty packets.
2268 ND("leftover %d bufs", howmany);
2269 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
2270 /* yes i am the last one */
2271 ND("roll back nkr_hwlease to %d", j);
2272 kring->nkr_hwlease = j;
2274 while (howmany-- > 0) {
2275 ring->slot[j].len = 0;
2276 ring->slot[j].flags = 0;
2277 j = nm_next(j, lim);
2281 p[lease_idx] = j; /* report I am done */
2283 update_pos = kring->nr_hwtail;
2285 if (my_start == update_pos) {
2286 /* all slots before my_start have been reported,
2287 * so scan subsequent leases to see if other ranges
2288 * have been completed, and do a selwakeup or txsync.
2290 while (lease_idx != kring->nkr_lease_idx &&
2291 p[lease_idx] != NR_NOSLOT) {
2293 p[lease_idx] = NR_NOSLOT;
2294 lease_idx = nm_next(lease_idx, lim);
2296 /* j is the new 'write' position. j != my_start
2297 * means there are new buffers to report
2299 if (likely(j != my_start)) {
2300 kring->nr_hwtail = j;
2302 mtx_unlock(&kring->q_lock);
2303 kring->nm_notify(kring, 0);
2304 /* this is netmap_notify for VALE ports and
2305 * netmap_bwrap_notify for bwrap. The latter will
2306 * trigger a txsync on the underlying hwna
2308 if (dst_na->retry && retry--) {
2309 /* XXX this is going to call nm_notify again.
2310 * Only useful for bwrap in virtual machines
2317 mtx_unlock(&kring->q_lock);
2320 d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
2323 brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
2328 /* nm_txsync callback for VALE ports */
2330 netmap_vp_txsync(struct netmap_kring *kring, int flags)
2332 struct netmap_vp_adapter *na =
2333 (struct netmap_vp_adapter *)kring->na;
2335 u_int const lim = kring->nkr_num_slots - 1;
2336 u_int const head = kring->rhead;
2338 if (bridge_batch <= 0) { /* testing only */
2339 done = head; // used all
/* clamp a too-large user-configured batch size */
2346 if (bridge_batch > NM_BDG_BATCH)
2347 bridge_batch = NM_BDG_BATCH;
2349 done = nm_bdg_preflush(kring, head);
2352 D("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
2354 * packets between 'done' and 'cur' are left unsent.
2356 kring->nr_hwcur = done;
2357 kring->nr_hwtail = nm_prev(done, lim);
2359 D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
2364 /* rxsync code used by VALE ports nm_rxsync callback and also
2365 * internally by the bwrap
2368 netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
2370 struct netmap_adapter *na = kring->na;
2371 struct netmap_ring *ring = kring->ring;
2372 u_int nm_i, lim = kring->nkr_num_slots - 1;
2373 u_int head = kring->rhead;
2377 D("ouch dangerous reset!!!");
2378 n = netmap_ring_reinit(kring);
2382 /* First part, import newly received packets. */
2383 /* actually nothing to do here, they are already in the kring */
2385 /* Second part, skip past packets that userspace has released. */
2386 nm_i = kring->nr_hwcur;
2388 /* consistency check, but nothing really important here */
2389 for (n = 0; likely(nm_i != head); n++) {
2390 struct netmap_slot *slot = &ring->slot[nm_i];
2391 void *addr = NMB(na, slot);
2393 if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
2394 D("bad buffer index %d, ignore ?",
2397 slot->flags &= ~NS_BUF_CHANGED;
2398 nm_i = nm_next(nm_i, lim);
2400 kring->nr_hwcur = head;
2409 * nm_rxsync callback for VALE ports
2410 * user process reading from a VALE switch.
2411 * Already protected against concurrent calls from userspace,
2412 * but we must acquire the queue's lock to protect against
2413 * writers on the same queue.
2416 netmap_vp_rxsync(struct netmap_kring *kring, int flags)
/* serialize against nm_bdg_flush writers on this rx queue */
2420 mtx_lock(&kring->q_lock);
2421 n = netmap_vp_rxsync_locked(kring, flags);
2422 mtx_unlock(&kring->q_lock);
2427 /* nm_bdg_attach callback for VALE ports
2428 * The na_vp port is this same netmap_adapter. There is no host port.
2431 netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
2433 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
/* non-VALE adapters are attached through a bwrap instead */
2436 return netmap_bwrap_attach(name, na);
2439 strncpy(na->name, name, sizeof(na->name));
2440 na->na_hostvp = NULL;
2444 /* create a netmap_vp_adapter that describes a VALE port.
2445 * Only persistent VALE ports have a non-null ifp.
2448 netmap_vp_create(struct nmreq_header *hdr, struct ifnet *ifp,
2449 struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret)
2451 struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
2452 struct netmap_vp_adapter *vpna;
2453 struct netmap_adapter *na;
2456 u_int extrabufs = 0;
2458 if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
2462 vpna = nm_os_malloc(sizeof(*vpna));
2469 strncpy(na->name, hdr->nr_name, sizeof(na->name));
2471 /* bound checking */
/* clamp ring counts to [1, NM_BDG_MAXRINGS] and report the
 * effective values back to the caller's request structure */
2472 na->num_tx_rings = req->nr_tx_rings;
2473 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
2474 req->nr_tx_rings = na->num_tx_rings; /* write back */
2475 na->num_rx_rings = req->nr_rx_rings;
2476 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
2477 req->nr_rx_rings = na->num_rx_rings; /* write back */
2478 nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE,
2479 1, NM_BDG_MAXSLOTS, NULL);
2480 na->num_tx_desc = req->nr_tx_slots;
2481 nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE,
2482 1, NM_BDG_MAXSLOTS, NULL);
2483 /* validate number of pipes. We want at least 1,
2484 * but probably can do with some more.
2485 * So let's use 2 as default (when 0 is supplied)
2487 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
2488 /* validate extra bufs */
2489 nm_bound_var(&extrabufs, 0, 0,
2490 128*NM_BDG_MAXSLOTS, NULL);
2491 req->nr_extra_bufs = extrabufs; /* write back */
2492 na->num_rx_desc = req->nr_rx_slots;
2493 /* Set the mfs to a default value, as it is needed on the VALE
2494 * mismatch datapath. XXX We should set it according to the MTU
2495 * known to the kernel. */
2496 vpna->mfs = NM_BDG_MFS_DEFAULT;
2497 vpna->last_smac = ~0llu;
2498 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero??
2499 vpna->mfs = netmap_buf_size; */
2501 D("max frame size %u", vpna->mfs);
2503 na->na_flags |= NAF_BDG_MAYSLEEP;
2504 /* persistent VALE ports look like hw devices
2505 * with a native netmap adapter
2508 na->na_flags |= NAF_NATIVE;
2509 na->nm_txsync = netmap_vp_txsync;
2510 na->nm_rxsync = netmap_vp_rxsync;
2511 na->nm_register = netmap_vp_reg;
2512 na->nm_krings_create = netmap_vp_krings_create;
2513 na->nm_krings_delete = netmap_vp_krings_delete;
2514 na->nm_dtor = netmap_vp_dtor;
2515 ND("nr_mem_id %d", req->nr_mem_id);
/* use the caller-supplied allocator if any, otherwise create a
 * private one sized for this port */
2517 netmap_mem_get(nmd):
2518 netmap_mem_private_new(
2519 na->num_tx_rings, na->num_tx_desc,
2520 na->num_rx_rings, na->num_rx_desc,
2521 req->nr_extra_bufs, npipes, &error);
2522 if (na->nm_mem == NULL)
2524 na->nm_bdg_attach = netmap_vp_bdg_attach;
2525 /* other nmd fields are set in the common routine */
2526 error = netmap_attach_common(na);
/* error path: release the allocator reference taken above */
2533 if (na->nm_mem != NULL)
2534 netmap_mem_put(na->nm_mem);
2539 /* Bridge wrapper code (bwrap).
2540 * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
2542 * The main task is to swap the meaning of tx and rx rings to match the
2543 * expectations of the VALE switch code (see nm_bdg_flush).
2545 * The bwrap works by interposing a netmap_bwrap_adapter between the
2546 * rest of the system and the hwna. The netmap_bwrap_adapter looks like
2547 * a netmap_vp_adapter to the rest the system, but, internally, it
2548 * translates all callbacks to what the hwna expects.
2550 * Note that we have to intercept callbacks coming from two sides:
2552 * - callbacks coming from the netmap module are intercepted by
2553 * passing around the netmap_bwrap_adapter instead of the hwna
2555 * - callbacks coming from outside of the netmap module only know
2556 * about the hwna. This, however, only happens in interrupt
2557 * handlers, where only the hwna->nm_notify callback is called.
2558 * What the bwrap does is to overwrite the hwna->nm_notify callback
2559 * with its own netmap_bwrap_intr_notify.
2560 * XXX This assumes that the hwna->nm_notify callback was the
2561 * standard netmap_notify(), as it is the case for nic adapters.
2562 * Any additional action performed by hwna->nm_notify will not be
2563 * performed by netmap_bwrap_intr_notify.
2565 * Additionally, the bwrap can optionally attach the host rings pair
2566 * of the wrapped adapter to a different port of the switch.
/* Destructor for a bwrap adapter: detach the (optional) host port and
 * the main port from the bridge, undo the hwna redirections installed
 * at attach time, and drop the reference on the wrapped hw adapter.
 */
2571 netmap_bwrap_dtor(struct netmap_adapter *na)
2573 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
2574 struct netmap_adapter *hwna = bna->hwna;
2575 struct nm_bridge *b = bna->up.na_bdg,
2576 *bh = bna->host.na_bdg;
2578 if (bna->host.up.nm_mem)
2579 netmap_mem_put(bna->host.up.nm_mem);
2582 netmap_bdg_detach_common(b, bna->up.bdg_port,
2583 (bh ? bna->host.bdg_port : -1));
2588 bna->host.up.ifp = NULL;
/* restore the hwna fields we hijacked when the bwrap was created */
2589 hwna->na_vp = bna->saved_na_vp;
2590 hwna->na_hostvp = NULL;
2591 hwna->na_private = NULL;
2592 hwna->na_flags &= ~NAF_BUSY;
2593 netmap_adapter_put(hwna);
2599 * Intr callback for NICs connected to a bridge.
2600 * Simply ignore tx interrupts (maybe we could try to recover space ?)
2601 * and pass received packets from nic to the bridge.
2603 * XXX TODO check locking: this is called from the interrupt
2604 * handler so we should make sure that the interface is not
2605 * disconnected while passing down an interrupt.
2607 * Note, no user process can access this NIC or the host stack.
2608 * The only part of the ring that is significant are the slots,
2609 * and head/cur/tail are set from the kring as needed
2610 * (part as a receive ring, part as a transmit ring).
2612 * callback that overwrites the hwna notify callback.
2613 * Packets come from the outside or from the host stack and are put on an
2615 * The bridge wrapper then sends the packets through the bridge.
/* Interrupt notify callback installed on the hwna rings of a bwrap:
 * pull newly received packets from the NIC ring and push them into
 * the VALE switch through the corresponding bwrap tx kring.
 */
2618 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
2620 struct netmap_adapter *na = kring->na;
2621 struct netmap_bwrap_adapter *bna = na->na_private;
2622 struct netmap_kring *bkring;
2623 struct netmap_vp_adapter *vpna = &bna->up;
2624 u_int ring_nr = kring->ring_id;
2625 int ret = NM_IRQ_COMPLETED;
2629 D("%s %s 0x%x", na->name, kring->name, flags);
/* hwna rx ring i feeds the bwrap (vp) tx ring i */
2631 bkring = vpna->up.tx_rings[ring_nr];
2633 /* make sure the ring is not disabled */
2634 if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) {
2639 D("%s head %d cur %d tail %d", na->name,
2640 kring->rhead, kring->rcur, kring->rtail);
2642 /* simulate a user wakeup on the rx ring
2643 * fetch packets that have arrived.
2645 error = kring->nm_sync(kring, 0);
2648 if (kring->nr_hwcur == kring->nr_hwtail) {
2650 D("how strange, interrupt with no packets on %s",
2655 /* new packets are kring->rcur to kring->nr_hwtail, and the bkring
2656 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
2657 * to push all packets out.
2659 bkring->rhead = bkring->rcur = kring->nr_hwtail;
2661 netmap_vp_txsync(bkring, flags);
2663 /* mark all buffers as released on this ring */
2664 kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
2665 /* another call to actually release the buffers */
2666 error = kring->nm_sync(kring, 0);
2668 /* The second rxsync may have further advanced hwtail. If this happens,
2669 * return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */
2670 if (kring->rcur != kring->nr_hwtail) {
2671 ret = NM_IRQ_RESCHED;
2676 return error ? error : ret;
2680 /* nm_register callback for bwrap */
/*
 * On register (onoff != 0): propagate the memory-allocator LUT down to
 * the hwna (and hostna if bridged), copy the pending ring state down,
 * forward the register to the hwna, copy the resulting ring state back
 * up, then hook netmap_bwrap_intr_notify on the hw rx rings so that
 * NIC interrupts feed the bridge.  On unregister: undo all of the above.
 * NOTE(review): some original lines (locals, braces, error paths) are
 * missing from this excerpt.
 */
2682 netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
2684 struct netmap_bwrap_adapter *bna =
2685 (struct netmap_bwrap_adapter *)na;
2686 struct netmap_adapter *hwna = bna->hwna;
2687 struct netmap_vp_adapter *hostna = &bna->host;
2691 ND("%s %s", na->name, onoff ? "on" : "off");
2694 /* netmap_do_regif has been called on the bwrap na.
2695 * We need to pass the information about the
2696 * memory allocator down to the hwna before
2697 * putting it in netmap mode
2699 hwna->na_lut = na->na_lut;
2701 if (hostna->na_bdg) {
2702 /* if the host rings have been attached to switch,
2703 * we need to copy the memory allocator information
2704 * in the hostna also
2706 hostna->up.na_lut = na->na_lut;
2711 /* pass down the pending ring state information */
2713 for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
2714 NMR(hwna, t)[i]->nr_pending_mode =
2715 NMR(na, t)[i]->nr_pending_mode;
2718 /* forward the request to the hwna */
2719 error = hwna->nm_register(hwna, onoff);
2723 /* copy up the current ring state information */
2725 for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
2726 struct netmap_kring *kring = NMR(hwna, t)[i];
2727 NMR(na, t)[i]->nr_mode = kring->nr_mode;
2731 /* impersonate a netmap_vp_adapter */
2732 netmap_vp_reg(na, onoff);
2734 netmap_vp_reg(&hostna->up, onoff);
2738 /* intercept the hwna nm_notify callback on the hw rings */
2739 for (i = 0; i < hwna->num_rx_rings; i++) {
2740 hwna->rx_rings[i]->save_notify = hwna->rx_rings[i]->nm_notify;
2741 hwna->rx_rings[i]->nm_notify = netmap_bwrap_intr_notify;
2743 i = hwna->num_rx_rings; /* for safety */
2744 /* save the host ring notify unconditionally */
2745 hwna->rx_rings[i]->save_notify = hwna->rx_rings[i]->nm_notify;
2746 if (hostna->na_bdg) {
2747 /* also intercept the host ring notify */
2748 hwna->rx_rings[i]->nm_notify = netmap_bwrap_intr_notify;
2750 if (na->active_fds == 0)
2751 na->na_flags |= NAF_NETMAP_ON;
/* --- unregister path --- */
2755 if (na->active_fds == 0)
2756 na->na_flags &= ~NAF_NETMAP_ON;
2758 /* reset all notify callbacks (including host ring) */
2759 for (i = 0; i <= hwna->num_rx_rings; i++) {
2760 hwna->rx_rings[i]->nm_notify = hwna->rx_rings[i]->save_notify;
2761 hwna->rx_rings[i]->save_notify = NULL;
/* drop our reference to the allocator LUT */
2763 hwna->na_lut.lut = NULL;
2764 hwna->na_lut.plut = NULL;
2765 hwna->na_lut.objtotal = 0;
2766 hwna->na_lut.objsize = 0;
2768 /* pass ownership of the netmap rings to the hwna */
2770 for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
2771 NMR(na, t)[i]->ring = NULL;
2780 /* nm_config callback for bwrap */
/*
 * Refresh the hwna configuration and report it back with tx/rx swapped:
 * the bwrap's tx side maps onto the hwna's rx side and vice versa,
 * because the bridge transmits into what the NIC receives from.
 */
2782 netmap_bwrap_config(struct netmap_adapter *na, struct nm_config_info *info)
2784 struct netmap_bwrap_adapter *bna =
2785 (struct netmap_bwrap_adapter *)na;
2786 struct netmap_adapter *hwna = bna->hwna;
2789 /* Forward the request to the hwna. It may happen that nobody
2790 * registered hwna yet, so netmap_mem_get_lut() may have not
2791 * been called yet. */
2792 error = netmap_mem_get_lut(hwna->nm_mem, &hwna->na_lut);
2795 netmap_update_config(hwna);
2796 /* swap the results and propagate */
2797 info->num_tx_rings = hwna->num_rx_rings;
2798 info->num_tx_descs = hwna->num_rx_desc;
2799 info->num_rx_rings = hwna->num_tx_rings;
2800 info->num_rx_descs = hwna->num_tx_desc;
2801 info->rx_buf_maxsize = hwna->rx_buf_maxsize;
2807 /* nm_krings_create callback for bwrap */
/*
 * Create the vp krings for the bwrap, then the hwna krings and rings,
 * and cross-link them: each bwrap kring shares the netmap ring of the
 * opposite-direction hwna kring.  On failure, tear down in reverse
 * order via the err_* labels (goto-cleanup pattern).
 * NOTE(review): some original lines (locals, braces, labels) are
 * missing from this excerpt.
 */
2809 netmap_bwrap_krings_create(struct netmap_adapter *na)
2811 struct netmap_bwrap_adapter *bna =
2812 (struct netmap_bwrap_adapter *)na;
2813 struct netmap_adapter *hwna = bna->hwna;
2814 struct netmap_adapter *hostna = &bna->host.up;
2820 /* impersonate a netmap_vp_adapter */
2821 error = netmap_vp_krings_create(na);
2825 /* also create the hwna krings */
2826 error = hwna->nm_krings_create(hwna);
2828 goto err_del_vp_rings;
2831 /* increment the usage counter for all the hwna krings */
2833 for (i = 0; i < nma_get_nrings(hwna, t) + 1; i++) {
2834 NMR(hwna, t)[i]->users++;
2838 /* now create the actual rings */
2839 error = netmap_mem_rings_create(hwna);
2844 /* cross-link the netmap rings
2845 * The original number of rings comes from hwna,
2846 * rx rings on one side equals tx rings on the other.
2849 enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2850 for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
2851 NMR(na, t)[i]->nkr_num_slots = NMR(hwna, r)[i]->nkr_num_slots;
2852 NMR(na, t)[i]->ring = NMR(hwna, r)[i]->ring;
2856 if (na->na_flags & NAF_HOST_RINGS) {
2857 /* the hostna rings are the host rings of the bwrap.
2858 * The corresponding krings must point back to the
/* host rings live just past the last hw ring in the kring arrays */
2861 hostna->tx_rings = &na->tx_rings[na->num_tx_rings];
2862 hostna->tx_rings[0]->na = hostna;
2863 hostna->rx_rings = &na->rx_rings[na->num_rx_rings];
2864 hostna->rx_rings[0]->na = hostna;
/* error unwinding: undo the users++ above, then delete krings */
2871 NMR(hwna, t)[i]->users--;
2873 hwna->nm_krings_delete(hwna);
2875 netmap_vp_krings_delete(na);
/*
 * nm_krings_delete callback for bwrap: mirror of
 * netmap_bwrap_krings_create() — drop the usage counters taken at
 * creation, then delete the hwna rings/krings and the vp krings.
 */
2882 netmap_bwrap_krings_delete(struct netmap_adapter *na)
2884 struct netmap_bwrap_adapter *bna =
2885 (struct netmap_bwrap_adapter *)na;
2886 struct netmap_adapter *hwna = bna->hwna;
2892 /* decrement the usage counter for all the hwna krings */
2894 for (i = 0; i < nma_get_nrings(hwna, t) + 1; i++) {
2895 NMR(hwna, t)[i]->users--;
2899 /* delete any netmap rings that are no longer needed */
2900 netmap_mem_rings_delete(hwna);
2901 hwna->nm_krings_delete(hwna);
2902 netmap_vp_krings_delete(na);
2906 /* notify method for the bridge-->hwna direction */
/*
 * Forward packets queued by the bridge on the bwrap rx kring out to the
 * NIC: rxsync to collect them, txsync on the paired hw tx kring (which
 * shares the same netmap ring) to transmit, then a second rxsync to
 * release the buffers.  Returns NM_IRQ_COMPLETED on success or the
 * nm_sync error.
 * NOTE(review): some original lines (locals, braces) are missing from
 * this excerpt.
 * Fix: the PRE debug statement referenced the undeclared identifier
 * `hw_ring`; the declared variable is `hw_kring` (as used in the
 * matching PST statement below).  Harmless while ND() discards its
 * arguments, but a compile error whenever ND is enabled.
 */
2908 netmap_bwrap_notify(struct netmap_kring *kring, int flags)
2910 struct netmap_adapter *na = kring->na;
2911 struct netmap_bwrap_adapter *bna = na->na_private;
2912 struct netmap_adapter *hwna = bna->hwna;
2913 u_int ring_n = kring->ring_id;
2914 u_int lim = kring->nkr_num_slots - 1;
2915 struct netmap_kring *hw_kring;
2918 ND("%s: na %s hwna %s",
2919 (kring ? kring->name : "NULL!"),
2920 (na ? na->name : "NULL!"),
2921 (hwna ? hwna->name : "NULL!"));
2922 hw_kring = hwna->tx_rings[ring_n];
2924 if (nm_kr_tryget(hw_kring, 0, NULL)) {
2928 /* first step: simulate a user wakeup on the rx ring */
2929 netmap_vp_rxsync(kring, flags);
2930 ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2932 kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2933 ring->head, ring->cur, ring->tail,
2934 hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2935 /* second step: the new packets are sent on the tx ring
2936 * (which is actually the same ring)
2938 hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
2939 error = hw_kring->nm_sync(hw_kring, flags);
2943 /* third step: now we are back the rx ring */
2944 /* claim ownership on all hw owned bufs */
2945 kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */
2947 /* fourth step: the user goes to sleep again, causing another rxsync */
2948 netmap_vp_rxsync(kring, flags);
2949 ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2951 kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2952 ring->head, ring->cur, ring->tail,
2953 hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2955 nm_kr_put(hw_kring);
2957 return error ? error : NM_IRQ_COMPLETED;
2961 /* nm_bdg_ctl callback for the bwrap.
2962 * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
2963 * On attach, it needs to provide a fake netmap_priv_d structure and
2964 * perform a netmap_do_regif() on the bwrap. This will put both the
2965 * bwrap and the hwna in netmap mode, with the netmap rings shared
2966 * and cross linked. Moreover, it will start intercepting interrupts
/*
 * NOTE(review): some original lines (error assignments, returns,
 * braces) are missing from this excerpt.
 */
2970 netmap_bwrap_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
2972 struct netmap_priv_d *npriv;
2973 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
2976 if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
2977 struct nmreq_vale_attach *req =
2978 (struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
2979 if (req->reg.nr_ringid != 0 ||
2980 (req->reg.nr_mode != NR_REG_ALL_NIC &&
2981 req->reg.nr_mode != NR_REG_NIC_SW)) {
2982 /* We only support attaching all the NIC rings
2983 * and/or the host stack. */
/* refuse if another user already owns the adapter */
2986 if (NETMAP_OWNED_BY_ANY(na)) {
/* refuse a double attach */
2989 if (bna->na_kpriv) {
/* fake priv: stands in for the user process that would normally
 * own the registration */
2993 npriv = netmap_priv_new();
2996 npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
2997 error = netmap_do_regif(npriv, na, req->reg.nr_mode,
2998 req->reg.nr_ringid, req->reg.nr_flags);
3000 netmap_priv_delete(npriv);
3003 bna->na_kpriv = npriv;
3004 na->na_flags |= NAF_BUSY;
/* --- detach path --- */
3006 if (na->active_fds == 0) /* not registered */
/* deleting the priv also unregisters the bwrap */
3008 netmap_priv_delete(bna->na_kpriv);
3009 bna->na_kpriv = NULL;
3010 na->na_flags &= ~NAF_BUSY;
3016 /* attach a bridge wrapper to the 'real' device */
/*
 * Allocate and initialize a netmap_bwrap_adapter around hwna so the NIC
 * can be plugged into a VALE switch.  The bwrap's ring geometry is the
 * hwna's with tx/rx swapped; a host-side vp adapter is also set up when
 * the hwna has host rings.  Takes a reference on hwna (released on the
 * error path) and marks it NAF_BUSY on success.
 * NOTE(review): some original lines (locals, error checks, returns,
 * braces) are missing from this excerpt.
 */
3018 netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
3020 struct netmap_bwrap_adapter *bna;
3021 struct netmap_adapter *na = NULL;
3022 struct netmap_adapter *hostna = NULL;
3026 /* make sure the NIC is not already in use */
3027 if (NETMAP_OWNED_BY_ANY(hwna)) {
3028 D("NIC %s busy, cannot attach to bridge", hwna->name);
3032 bna = nm_os_malloc(sizeof(*bna));
3038 /* make bwrap ifp point to the real ifp */
3039 na->ifp = hwna->ifp;
3041 na->na_private = bna;
3042 strncpy(na->name, nr_name, sizeof(na->name));
3043 /* fill the ring data for the bwrap adapter with rx/tx meanings
3044 * swapped. The real cross-linking will be done during register,
3045 * when all the krings will have been created.
3048 enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
3049 nma_set_nrings(na, t, nma_get_nrings(hwna, r));
3050 nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
/* install the bwrap callbacks defined above */
3052 na->nm_dtor = netmap_bwrap_dtor;
3053 na->nm_register = netmap_bwrap_reg;
3054 // na->nm_txsync = netmap_bwrap_txsync;
3055 // na->nm_rxsync = netmap_bwrap_rxsync;
3056 na->nm_config = netmap_bwrap_config;
3057 na->nm_krings_create = netmap_bwrap_krings_create;
3058 na->nm_krings_delete = netmap_bwrap_krings_delete;
3059 na->nm_notify = netmap_bwrap_notify;
3060 na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
3061 na->pdev = hwna->pdev;
3062 na->nm_mem = netmap_mem_get(hwna->nm_mem);
3063 na->virt_hdr_len = hwna->virt_hdr_len;
3064 na->rx_buf_maxsize = hwna->rx_buf_maxsize;
3065 bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
3066 /* Set the mfs, needed on the VALE mismatch datapath. */
3067 bna->up.mfs = NM_BDG_MFS_DEFAULT;
/* keep the hwna alive while the bwrap exists */
3070 netmap_adapter_get(hwna);
3071 hwna->na_private = bna; /* weak reference */
3072 bna->saved_na_vp = hwna->na_vp; /* restored by the dtor */
3073 hwna->na_vp = &bna->up;
3074 bna->up.up.na_vp = &(bna->up);
3076 if (hwna->na_flags & NAF_HOST_RINGS) {
3077 if (hwna->na_flags & NAF_SW_ONLY)
3078 na->na_flags |= NAF_SW_ONLY;
3079 na->na_flags |= NAF_HOST_RINGS;
3080 hostna = &bna->host.up;
/* the host adapter is named "<nic>^" */
3081 snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name);
3082 hostna->ifp = hwna->ifp;
3084 enum txrx r = nm_txrx_swap(t);
3085 nma_set_nrings(hostna, t, 1);
3086 nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
3088 // hostna->nm_txsync = netmap_bwrap_host_txsync;
3089 // hostna->nm_rxsync = netmap_bwrap_host_rxsync;
3090 hostna->nm_notify = netmap_bwrap_notify;
3091 hostna->nm_mem = netmap_mem_get(na->nm_mem);
3092 hostna->na_private = bna;
3093 hostna->na_vp = &bna->up;
3094 na->na_hostvp = hwna->na_hostvp =
3095 hostna->na_hostvp = &bna->host;
3096 hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
3097 hostna->rx_buf_maxsize = hwna->rx_buf_maxsize;
3098 bna->host.mfs = NM_BDG_MFS_DEFAULT;
3101 ND("%s<->%s txr %d txd %d rxr %d rxd %d",
3102 na->name, ifp->if_xname,
3103 na->num_tx_rings, na->num_tx_desc,
3104 na->num_rx_rings, na->num_rx_desc);
3106 error = netmap_attach_common(na);
3110 hwna->na_flags |= NAF_BUSY;
/* error unwinding: undo the na_vp/hostvp changes and drop the ref */
3114 hwna->na_vp = hwna->na_hostvp = NULL;
3115 netmap_adapter_put(hwna);
/*
 * Allocate and initialize an array of n VALE bridges.
 * NOTE(review): the loop body, error check on nm_os_malloc and the
 * return statement are missing from this excerpt; presumably each
 * bridge's lock is initialized in the loop — verify against the full
 * source.
 */
3122 netmap_init_bridges2(u_int n)
3125 struct nm_bridge *b;
3127 b = nm_os_malloc(sizeof(struct nm_bridge) * n);
3130 for (i = 0; i < n; i++)
/*
 * Destroy the per-bridge locks of an array of n bridges; counterpart of
 * netmap_init_bridges2().  (The excerpt does not show the array being
 * freed here — confirm against the full source.)
 */
3136 netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
3143 for (i = 0; i < n; i++)
3144 BDG_RWDESTROY(&b[i]);
/*
 * Module-init entry point for the VALE subsystem: with network
 * namespaces (CONFIG_NET_NS) bridge arrays are per-namespace, otherwise
 * allocate the single global nm_bridges array of NM_BRIDGES entries.
 */
3149 netmap_init_bridges(void)
3151 #ifdef CONFIG_NET_NS
3152 return netmap_bns_register();
3154 nm_bridges = netmap_init_bridges2(NM_BRIDGES);
3155 if (nm_bridges == NULL)
/*
 * Module-exit counterpart of netmap_init_bridges(): unregister the
 * per-namespace state or tear down the global bridge array.
 */
3162 netmap_uninit_bridges(void)
3164 #ifdef CONFIG_NET_NS
3165 netmap_bns_unregister();
3167 netmap_uninit_bridges2(nm_bridges, NM_BRIDGES);
3170 #endif /* WITH_VALE */