2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (C) 2013-2016 Universita` di Pisa
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * This module implements the VALE switch for netmap
35 NMG_LOCK() serializes all modifications to switches and ports.
36 A switch cannot be deleted until all ports are gone.
38 For each switch, an SX lock (RWlock on linux) protects
39 deletion of ports. When configuring or deleting a new port, the
40 lock is acquired in exclusive mode (after holding NMG_LOCK).
41 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
42 The lock is held throughout the entire forwarding cycle,
43 during which the thread may incur in a page fault.
44 Hence it is important that sleepable shared locks are used.
46 On the rx ring, the per-port lock is grabbed initially to reserve
47 a number of slot in the ring, then the lock is released,
48 packets are copied from source to destination, and then
49 the lock is acquired again and the receive ring is updated.
50 (A similar thing is done on the tx ring for NIC and host stack
51 ports attached to the switch)
56 * OS-specific code that is used only within this file.
57 * Other OS-specific code that must be accessed by drivers
58 * is present in netmap_kern.h
61 #if defined(__FreeBSD__)
62 #include <sys/cdefs.h> /* prerequisite */
63 __FBSDID("$FreeBSD$");
65 #include <sys/types.h>
66 #include <sys/errno.h>
67 #include <sys/param.h> /* defines used in kernel.h */
68 #include <sys/kernel.h> /* types used in module initialization */
69 #include <sys/conf.h> /* cdevsw struct, UID, GID */
70 #include <sys/sockio.h>
71 #include <sys/socketvar.h> /* struct socket */
72 #include <sys/malloc.h>
74 #include <sys/rwlock.h>
75 #include <sys/socket.h> /* sockaddrs */
76 #include <sys/selinfo.h>
77 #include <sys/sysctl.h>
79 #include <net/if_var.h>
80 #include <net/bpf.h> /* BIOCIMMEDIATE */
81 #include <machine/bus.h> /* bus_dmamap_* */
82 #include <sys/endian.h>
83 #include <sys/refcount.h>
/*
 * FreeBSD mapping of the per-bridge read/write lock primitives onto
 * rwlock(9). Writers (port attach/detach) take the exclusive lock;
 * the forwarding path takes the shared lock (see locking notes above).
 */
86 #define BDG_RWLOCK_T struct rwlock // struct rwlock
88 #define BDG_RWINIT(b) \
89 rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
90 #define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock)
91 #define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock)
92 #define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock)
/* try-variant used by non-sleepable (NIC) contexts, see nm_bdg_preflush() */
93 #define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock)
94 #define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock)
95 #define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock)
100 #include "bsd_glue.h"
102 #elif defined(__APPLE__)
104 #warning OSX support is only partial
105 #include "osx_glue.h"
107 #elif defined(_WIN32)
108 #include "win_glue.h"
112 #error Unsupported platform
114 #endif /* unsupported */
120 #include <net/netmap.h>
121 #include <dev/netmap/netmap_kern.h>
122 #include <dev/netmap/netmap_mem2.h>
127 * system parameters (most of them in netmap_kern.h)
128 * NM_BDG_NAME prefix for switch port names, default "vale"
129 * NM_BDG_MAXPORTS number of ports
130 * NM_BRIDGES max number of switches in the system.
131 * XXX should become a sysctl or tunable
133 * Switch ports are named valeX:Y where X is the switch name and Y
134 * is the port. If Y matches a physical interface name, the port is
135 * connected to a physical device.
137 * Unlike physical interfaces, switch ports use their own memory region
138 * for rings and buffers.
139 * The virtual interfaces use per-queue lock instead of core lock.
140 * In the tx loop, we aggregate traffic in batches to make all operations
141 * faster. The batch size is bridge_batch.
/* Compile-time sizing of the switch data structures. */
143 #define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */
144 #define NM_BDG_MAXSLOTS 4096 /* XXX same as above */
145 #define NM_BRIDGE_RINGSIZE 1024 /* in the device */
146 #define NM_BDG_HASH 1024 /* forwarding table entries */
147 #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */
148 #define NM_MULTISEG 64 /* max size of a chain of bufs */
149 /* actual size of the tables */
150 #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG)
151 /* NM_FT_NULL terminates a list of slots in the ft */
152 #define NM_FT_NULL NM_BDG_BATCH_MAX
156 * bridge_batch is set via sysctl to the max batch size to be
157 * used in the bridge. The actual value may be larger as the
158 * last packet in the block may overflow the size.
/* Tunable batch size for the forwarding loop, exported via sysctl
 * dev.netmap.bridge_batch. See the comment above: the effective batch
 * may exceed this by up to one packet. */
160 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
162 SYSCTL_DECL(_dev_netmap);
163 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , "");
/* forward declarations of functions defined later in this file */
166 static int netmap_vp_create(struct nmreq *, struct ifnet *,
167 struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
168 static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
169 static int netmap_bwrap_reg(struct netmap_adapter *, int onoff);
/*
 * NOTE(review): the struct declarations below (nm_bdg_q, the hash table
 * entry, and struct nm_bridge) look truncated in this extraction — the
 * "struct ... {" openers, several members and the closing braces are not
 * visible. Only comments were added here; recover the full text from the
 * upstream file before making code changes.
 */
172 * For each output interface, nm_bdg_q is used to construct a list.
173 * bq_len is the number of output buffers (we can have coalescing
179 uint32_t bq_len; /* number of buffers */
182 /* XXX revise this */
184 uint64_t mac; /* the top 2 bytes are the epoch */
189 * nm_bridge is a descriptor for a VALE switch.
190 * Interfaces for a bridge are all in bdg_ports[].
191 * The array has fixed size, an empty entry does not terminate
192 * the search, but lookups only occur on attach/detach so we
193 * don't mind if they are slow.
195 * The bridge is non blocking on the transmit ports: excess
196 * packets are dropped if there is no room on the output port.
198 * bdg_lock protects accesses to the bdg_ports array.
199 * This is a rw lock (or equivalent).
202 /* XXX what is the proper alignment/layout ? */
203 BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */
205 uint32_t bdg_active_ports; /* 0 means free */
206 char bdg_basename[IFNAMSIZ];
208 /* Indexes of active ports (up to active_ports)
209 * and all other remaining ports.
211 uint8_t bdg_port_index[NM_BDG_MAXPORTS];
213 struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];
217 * The function to decide the destination port.
218 * It returns either of an index of the destination port,
219 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
220 * forward this packet. ring_nr is the source ring index, and the
221 * function may overwrite this value to forward this packet to a
222 * different ring index.
223 * This function must be set by netmap_bdg_ctl().
225 struct netmap_bdg_ops bdg_ops;
227 /* the forwarding table, MAC+ports.
228 * XXX should be changed to an argument to be passed to
229 * the lookup function, and allocated on attach
231 struct nm_hash_ent ht[NM_BDG_HASH];
235 #endif /* CONFIG_NET_NS */
/* Return the name of the switch a port is attached to.
 * NOTE(review): return-type line and braces are missing from this
 * extraction; a NULL check on vp->na_bdg was presumably dropped too. */
239 netmap_bdg_name(struct netmap_vp_adapter *vp)
241 struct nm_bridge *b = vp->na_bdg;
244 return b->bdg_basename;
/* Without network-namespace support there is a single, global,
 * statically sized array of bridges (allocated elsewhere). */
248 #ifndef CONFIG_NET_NS
250 * XXX in principle nm_bridges could be created dynamically
251 * Right now we have a static array and deletions are protected
252 * by an exclusive lock.
254 static struct nm_bridge *nm_bridges;
255 #endif /* !CONFIG_NET_NS */
259 * this is a slightly optimized copy routine which rounds
260 * to multiple of 64 bytes and is often faster than dealing
261 * with other odd sizes. We assume there is enough room
262 * in the source and destination buffers.
264 * XXX only for multiples of 64 bytes, non overlapped.
267 pkt_copy(void *_src, void *_dst, int l)
269 uint64_t *src = _src;
270 uint64_t *dst = _dst;
/* Large packets: presumably falls through to memcpy (body not visible). */
271 if (unlikely(l >= 1024)) {
/* 64-byte-per-iteration unrolled copy; the eight 64-bit stores per
 * iteration are missing from this extraction. */
275 for (; likely(l > 0); l-=64) {
/* Character class test for valid VALE port-name characters.
 * NOTE(review): the final alternative of the OR chain (presumably
 * c == '_') and the closing brace are missing from this extraction. */
289 nm_is_id_char(const char c)
291 return (c >= 'a' && c <= 'z') ||
292 (c >= 'A' && c <= 'Z') ||
293 (c >= '0' && c <= '9') ||
297 /* Validate the name of a VALE bridge port and return the
298 * position of the ":" character. */
300 nm_vale_name_validate(const char *name)
/* reject NULL or names shorter than the mandatory "vale" prefix */
305 if (!name || strlen(name) < strlen(NM_BDG_NAME)) {
309 for (i = 0; name[i]; i++) {
310 if (name[i] == ':') {
/* a second ':' is an error (error-return line not visible here) */
311 if (colon_pos != -1) {
315 } else if (!nm_is_id_char(name[i])) {
328 * locate a bridge among the existing ones.
329 * MUST BE CALLED WITH NMG_LOCK()
331 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
332 * We assume that this is called with a name of at least NM_NAME chars.
334 static struct nm_bridge *
335 nm_find_bridge(const char *name, int create)
338 struct nm_bridge *b = NULL, *bridges;
343 netmap_bns_getbridges(&bridges, &num_bridges);
345 namelen = nm_vale_name_validate(name);
/* NOTE(review): %s with a NULL fallback is undefined behavior on most
 * libcs; upstream uses a literal fallback string — verify. */
347 D("invalid bridge name %s", name ? name : NULL);
351 /* lookup the name, remember empty slot if there is one */
352 for (i = 0; i < num_bridges; i++) {
353 struct nm_bridge *x = bridges + i;
355 if (x->bdg_active_ports == 0) {
356 if (create && b == NULL)
357 b = x; /* record empty slot */
358 } else if (x->bdg_namelen != namelen) {
360 } else if (strncmp(name, x->bdg_basename, namelen) == 0) {
361 ND("found '%.*s' at %d", namelen, name, i);
366 if (i == num_bridges && b) { /* name not found, can create entry */
367 /* initialize the bridge */
368 strncpy(b->bdg_basename, name, namelen)
369 ND("create new bridge %s with ports %d", b->bdg_basename,
370 b->bdg_active_ports);
371 b->bdg_namelen = namelen;
372 b->bdg_active_ports = 0;
373 for (i = 0; i < NM_BDG_MAXPORTS; i++)
374 b->bdg_port_index[i] = i;
375 /* set the default function */
376 b->bdg_ops.lookup = netmap_bdg_learning;
377 /* reset the MAC address table */
378 bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
386 * Free the forwarding tables for rings attached to switch ports.
389 nm_free_bdgfwd(struct netmap_adapter *na)
392 struct netmap_kring *kring;
/* one forwarding table per TX (input) ring; see nm_alloc_bdgfwd() */
395 nrings = na->num_tx_rings;
396 kring = na->tx_rings;
397 for (i = 0; i < nrings; i++) {
398 if (kring[i].nkr_ft) {
399 nm_os_free(kring[i].nkr_ft);
400 kring[i].nkr_ft = NULL; /* protect from freeing twice */
407 * Allocate the forwarding tables for the rings attached to the bridge ports.
410 nm_alloc_bdgfwd(struct netmap_adapter *na)
412 int nrings, l, i, num_dstq;
413 struct netmap_kring *kring;
416 /* all port:rings + broadcast */
417 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
/* single allocation holding: ft entries, per-destination queues,
 * and one uint16_t per ft entry (destination map) */
418 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
419 l += sizeof(struct nm_bdg_q) * num_dstq;
420 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
422 nrings = netmap_real_rings(na, NR_TX);
423 kring = na->tx_rings;
424 for (i = 0; i < nrings; i++) {
425 struct nm_bdg_fwd *ft;
426 struct nm_bdg_q *dstq;
429 ft = nm_os_malloc(l);
/* the dstq array lives right after the ft entries; every queue
 * starts out empty (NM_FT_NULL terminator) */
434 dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
435 for (j = 0; j < num_dstq; j++) {
436 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
439 kring[i].nkr_ft = ft;
445 /* remove from bridge b the ports in slots hw and sw
446 * (sw can be -1 if not needed)
449 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
451 int s_hw = hw, s_sw = sw;
452 int i, lim =b->bdg_active_ports;
453 uint8_t tmp[NM_BDG_MAXPORTS];
457 make a copy of bdg_port_index;
458 lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
459 in the array of bdg_port_index, replacing them with
460 entries from the bottom of the array;
461 decrement bdg_active_ports;
462 acquire BDG_WLOCK() and copy back the array.
466 D("detach %d and %d (lim %d)", hw, sw, lim);
467 /* make a copy of the list of active ports, update it,
468 * and then copy back within BDG_WLOCK().
470 memcpy(tmp, b->bdg_port_index, sizeof(tmp));
/* swap each matching slot with the last active one; note i is only
 * advanced on a miss (increment lines not visible in this extraction) */
471 for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
472 if (hw >= 0 && tmp[i] == hw) {
473 ND("detach hw %d at %d", hw, i);
474 lim--; /* point to last active port */
475 tmp[i] = tmp[lim]; /* swap with i */
476 tmp[lim] = hw; /* now this is inactive */
478 } else if (sw >= 0 && tmp[i] == sw) {
479 ND("detach sw %d at %d", sw, i);
/* both ports must have been found; otherwise the bridge state is corrupt */
488 if (hw >= 0 || sw >= 0) {
489 D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
/* run the per-port destructor (if registered) and clear the slots */
494 b->bdg_ops.dtor(b->bdg_ports[s_hw]);
495 b->bdg_ports[s_hw] = NULL;
497 b->bdg_ports[s_sw] = NULL;
/* publish the updated index array under BDG_WLOCK (lock calls not
 * visible in this extraction) */
499 memcpy(b->bdg_port_index, tmp, sizeof(tmp));
500 b->bdg_active_ports = lim;
503 ND("now %d active ports", lim);
505 ND("marking bridge %s as free", b->bdg_basename);
506 bzero(&b->bdg_ops, sizeof(b->bdg_ops));
511 /* nm_bdg_ctl callback for VALE ports */
513 netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
515 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
516 struct nm_bridge *b = vpna->na_bdg;
518 (void)nmr; // XXX merge ?
/* attach is a no-op for VALE ports: the port is already on the bridge */
520 return 0; /* nothing to do */
/* detach: quiesce the rings, unlink from the bridge, re-enable */
522 netmap_set_all_rings(na, 0 /* disable */);
523 netmap_bdg_detach_common(b, vpna->bdg_port, -1);
525 netmap_set_all_rings(na, 1 /* enable */);
527 /* I have took reference just for attach */
528 netmap_adapter_put(na);
532 /* nm_dtor callback for ephemeral VALE ports */
534 netmap_vp_dtor(struct netmap_adapter *na)
536 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
537 struct nm_bridge *b = vpna->na_bdg;
539 ND("%s has %d references", na->name, na->na_refcount);
/* unlink from the bridge if still attached (guard not visible here) */
542 netmap_bdg_detach_common(b, vpna->bdg_port, -1);
/* autodelete ports also tear down their backing OS interface */
545 if (vpna->autodelete && na->ifp != NULL) {
546 ND("releasing %s", na->ifp->if_xname);
548 nm_os_vi_detach(na->ifp);
553 /* remove a persistent VALE port from the system */
555 nm_vi_destroy(const char *name)
558 struct netmap_vp_adapter *vpna;
/* ifunit_ref() takes a reference on the ifnet; it must be released
 * on every exit path (error paths not visible in this extraction) */
561 ifp = ifunit_ref(name);
565 /* make sure this is actually a VALE port */
566 if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
571 vpna = (struct netmap_vp_adapter *)NA(ifp);
573 /* we can only destroy ports that were created via NETMAP_BDG_NEWIF */
574 if (vpna->autodelete) {
579 /* also make sure that nobody is using the inferface */
580 if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
581 vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
588 D("destroying a persistent vale interface %s", ifp->if_xname);
589 /* Linux requires all the references are released
594 nm_os_vi_detach(ifp);
/* Fill the nmreq with the adapter's ring geometry and memory-region
 * info so userspace sees up-to-date values. Returns the error code
 * from netmap_mem_get_info(). */
604 nm_update_info(struct nmreq *nmr, struct netmap_adapter *na)
606 nmr->nr_rx_rings = na->num_rx_rings;
607 nmr->nr_tx_rings = na->num_tx_rings;
608 nmr->nr_rx_slots = na->num_rx_desc;
609 nmr->nr_tx_slots = na->num_tx_desc;
610 return netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, NULL, &nmr->nr_arg2);
614 * Create a virtual interface registered to the system.
615 * The interface will be attached to a bridge later.
618 netmap_vi_create(struct nmreq *nmr, int autodelete)
621 struct netmap_vp_adapter *vpna;
622 struct netmap_mem_d *nmd = NULL;
625 /* don't include VALE prefix */
626 if (!strncmp(nmr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
628 ifp = ifunit_ref(nmr->nr_name);
629 if (ifp) { /* already exist, cannot create new one */
/* if it is already a netmap port, refresh the info for the caller */
632 if (NM_NA_VALID(ifp)) {
633 int update_err = nm_update_info(nmr, NA(ifp));
/* create the OS-level interface backing the port */
641 error = nm_os_vi_persist(nmr->nr_name, &ifp);
/* nr_arg2 != 0 selects a specific memory region (reference taken;
 * release path not visible in this extraction) */
647 nmd = netmap_mem_find(nmr->nr_arg2);
653 /* netmap_vp_create creates a struct netmap_vp_adapter */
654 error = netmap_vp_create(nmr, ifp, nmd, &vpna);
656 D("error %d", error);
659 /* persist-specific routines */
660 vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
662 netmap_adapter_get(&vpna->up);
664 vpna->autodelete = 1;
666 NM_ATTACH_NA(ifp, &vpna->up);
667 /* return the updated info */
668 error = nm_update_info(nmr, &vpna->up);
672 D("returning nr_arg2 %d", nmr->nr_arg2);
676 D("created %s", ifp->if_xname);
/* error path: tear down the OS interface again */
685 nm_os_vi_detach(ifp);
690 /* Try to get a reference to a netmap adapter attached to a VALE switch.
691 * If the adapter is found (or is created), this function returns 0, a
692 * non NULL pointer is returned into *na, and the caller holds a
693 * reference to the adapter.
694 * If an adapter is not found, then no reference is grabbed and the
695 * function returns an error code, or 0 if there is just a VALE prefix
696 * mismatch. Therefore the caller holds a reference when
697 * (*na != NULL && return == 0).
700 netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na,
701 struct netmap_mem_d *nmd, int create)
703 char *nr_name = nmr->nr_name;
705 struct ifnet *ifp = NULL;
707 struct netmap_vp_adapter *vpna, *hostna = NULL;
709 int i, j, cand = -1, cand2 = -1;
712 *na = NULL; /* default return value */
714 /* first try to see if this is a bridge port. */
716 if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) {
717 return 0; /* no error, but no VALE prefix */
720 b = nm_find_bridge(nr_name, create);
722 D("no bridges available for '%s'", nr_name);
723 return (create ? ENOMEM : ENXIO);
725 if (strlen(nr_name) < b->bdg_namelen) /* impossible */
728 /* Now we are sure that name starts with the bridge's name,
729 * lookup the port in the bridge. We need to scan the entire
730 * list. It is not important to hold a WLOCK on the bridge
731 * during the search because NMG_LOCK already guarantees
732 * that there are no other possible writers.
735 /* lookup in the local list of ports */
736 for (j = 0; j < b->bdg_active_ports; j++) {
737 i = b->bdg_port_index[j];
738 vpna = b->bdg_ports[i];
739 // KASSERT(na != NULL);
740 ND("checking %s", vpna->up.name);
741 if (!strcmp(vpna->up.name, nr_name)) {
742 netmap_adapter_get(&vpna->up);
743 ND("found existing if %s refs %d", nr_name)
748 /* not found, should we create it? */
751 /* yes we should, see if we have space to attach entries */
752 needed = 2; /* in some cases we only need 1 */
753 if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
754 D("bridge full %d, cannot create new port", b->bdg_active_ports);
757 /* record the next two ports available, but do not allocate yet */
758 cand = b->bdg_port_index[b->bdg_active_ports];
759 cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
760 ND("+++ bridge %s port %s used %d avail %d %d",
761 b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);
764 * try see if there is a matching NIC with this name
765 * (after the bridge's name)
767 ifname = nr_name + b->bdg_namelen + 1;
768 ifp = ifunit_ref(ifname);
770 /* Create an ephemeral virtual port
771 * This block contains all the ephemeral-specific logics
774 /* nr_cmd must be 0 for a virtual port */
779 /* bdg_netmap_attach creates a struct netmap_adapter */
780 error = netmap_vp_create(nmr, NULL, nmd, &vpna);
782 D("error %d", error);
785 /* shortcut - we can skip get_hw_na(),
786 * ownership check and nm_bdg_attach()
789 struct netmap_adapter *hw;
/* NIC case: wrap the hardware adapter in a bwrap so it can be a port */
791 error = netmap_get_hw_na(ifp, nmd, &hw);
792 if (error || hw == NULL)
795 /* host adapter might not be created */
796 error = hw->nm_bdg_attach(nr_name, hw);
800 hostna = hw->na_hostvp;
801 if (nmr->nr_arg1 != NETMAP_BDG_HOST)
806 vpna->bdg_port = cand;
807 ND("NIC %p to bridge port %d", vpna, cand);
808 /* bind the port to the bridge (virtual ports are not active) */
809 b->bdg_ports[cand] = vpna;
811 b->bdg_active_ports++;
812 if (hostna != NULL) {
813 /* also bind the host stack to the bridge */
814 b->bdg_ports[cand2] = hostna;
815 hostna->bdg_port = cand2;
817 b->bdg_active_ports++;
818 ND("host %p to bridge port %d", hostna, cand2);
820 ND("if %s refs %d", ifname, vpna->up.na_refcount);
/* hand the (referenced) adapter back to the caller */
823 netmap_adapter_get(*na);
833 /* Process NETMAP_BDG_ATTACH */
835 nm_bdg_ctl_attach(struct nmreq *nmr)
837 struct netmap_adapter *na;
838 struct netmap_mem_d *nmd = NULL;
/* optional: select a specific memory region via nr_arg2 */
844 nmd = netmap_mem_find(nmr->nr_arg2);
851 error = netmap_get_bdg_na(nmr, &na, nmd, 1 /* create if not exists */);
852 if (error) /* no device */
855 if (na == NULL) { /* VALE prefix missing */
/* refuse to attach a port someone else already owns */
860 if (NETMAP_OWNED_BY_ANY(na)) {
865 if (na->nm_bdg_ctl) {
866 /* nop for VALE ports. The bwrap needs to put the hwna
867 * in netmap mode (see netmap_bwrap_bdg_ctl)
869 error = na->nm_bdg_ctl(na, nmr, 1);
872 ND("registered %s to netmap-mode", na->name);
/* error path: drop the reference taken by netmap_get_bdg_na() */
878 netmap_adapter_put(na);
/* True iff the adapter is a bwrap (a NIC wrapped as a bridge port),
 * identified by its nm_register callback. */
885 nm_is_bwrap(struct netmap_adapter *na)
887 return na->nm_register == netmap_bwrap_reg;
890 /* process NETMAP_BDG_DETACH */
892 nm_bdg_ctl_detach(struct nmreq *nmr)
894 struct netmap_adapter *na;
898 error = netmap_get_bdg_na(nmr, &na, NULL, 0 /* don't create */);
899 if (error) { /* no device, or another bridge or user owns the device */
903 if (na == NULL) { /* VALE prefix missing */
906 } else if (nm_is_bwrap(na) &&
907 ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
908 /* Don't detach a NIC with polling */
910 netmap_adapter_put(na);
913 if (na->nm_bdg_ctl) {
914 /* remove the port from bridge. The bwrap
915 * also needs to put the hwna in normal mode
917 error = na->nm_bdg_ctl(na, nmr, 0);
/* balance the reference taken by netmap_get_bdg_na() */
920 netmap_adapter_put(na);
/* Per-adapter polling support: one kernel thread per polled ring range.
 * NOTE(review): both struct bodies below are truncated in this
 * extraction (members such as qfirst/qlast/nmk and
 * reg/qfirst/qlast/cpu_from/ncpus/stopped/configured are not visible). */
927 struct nm_bdg_polling_state;
933 struct nm_bdg_polling_state *bps;
936 struct nm_bdg_polling_state {
939 struct netmap_bwrap_adapter *bna;
945 struct nm_bdg_kthread *kthreads;
/* Worker body for a polling kthread: repeatedly notify the RX krings
 * in the [qfirst, qlast) range assigned to this thread. */
949 netmap_bwrap_polling(void *data, int is_kthread)
951 struct nm_bdg_kthread *nbk = data;
952 struct netmap_bwrap_adapter *bna;
953 u_int qfirst, qlast, i;
954 struct netmap_kring *kring0, *kring;
958 qfirst = nbk->qfirst;
961 kring0 = NMR(bna->hwna, NR_RX);
963 for (i = qfirst; i < qlast; i++) {
965 kring->nm_notify(kring, 0);
/* Allocate and configure one polling kthread per requested CPU.
 * Returns 0 on success; on failure tears down already-created threads. */
970 nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
972 struct nm_kctx_cfg kcfg;
975 bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus);
976 if (bps->kthreads == NULL)
979 bzero(&kcfg, sizeof(kcfg));
980 kcfg.worker_fn = netmap_bwrap_polling;
981 kcfg.use_kthread = 1;
982 for (i = 0; i < bps->ncpus; i++) {
983 struct nm_bdg_kthread *t = bps->kthreads + i;
984 int all = (bps->ncpus == 1 && bps->reg == NR_REG_ALL_NIC);
985 int affinity = bps->cpu_from + i;
988 t->qfirst = all ? bps->qfirst /* must be 0 */: affinity;
989 t->qlast = all ? bps->qlast : t->qfirst + 1;
990 D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
994 kcfg.worker_private = t;
995 t->nmk = nm_os_kctx_create(&kcfg, 0, NULL);
996 if (t->nmk == NULL) {
999 nm_os_kctx_worker_setaff(t->nmk, affinity);
/* error path: destroy the contexts created so far.
 * NOTE(review): the loop iterates j but indexes with i, so it destroys
 * the same (failed) slot repeatedly — should be bps->kthreads + j. */
1004 for (j = 0; j < i; j++) {
1005 struct nm_bdg_kthread *t = bps->kthreads + i;
1006 nm_os_kctx_destroy(t->nmk);
1008 nm_os_free(bps->kthreads);
1012 /* A variant of ptnetmap_start_kthreads() */
1014 nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
1019 D("polling is not configured");
1022 bps->stopped = false;
1024 for (i = 0; i < bps->ncpus; i++) {
1025 struct nm_bdg_kthread *t = bps->kthreads + i;
1026 error = nm_os_kctx_worker_start(t->nmk);
1028 D("error in nm_kthread_start()");
/* error path: stop the workers already started.
 * NOTE(review): same i-vs-j mixup as in nm_bdg_create_kthreads —
 * should presumably be bps->kthreads + j. */
1035 for (j = 0; j < i; j++) {
1036 struct nm_bdg_kthread *t = bps->kthreads + i;
1037 nm_os_kctx_worker_stop(t->nmk);
1039 bps->stopped = true;
/* Stop and destroy every polling kthread, then mark polling stopped. */
1044 nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
1051 for (i = 0; i < bps->ncpus; i++) {
1052 struct nm_bdg_kthread *t = bps->kthreads + i;
1053 nm_os_kctx_worker_stop(t->nmk);
1054 nm_os_kctx_destroy(t->nmk);
1056 bps->stopped = true;
/* Validate the polling request in nmr and fill bps with the ring range
 * [qfirst, qlast), starting CPU and number of CPUs. Returns 0 on success. */
1060 get_polling_cfg(struct nmreq *nmr, struct netmap_adapter *na,
1061 struct nm_bdg_polling_state *bps)
1063 int req_cpus, avail_cpus, core_from;
1064 u_int reg, i, qfirst, qlast;
1066 avail_cpus = nm_os_ncpus();
1067 req_cpus = nmr->nr_arg1;
1069 if (req_cpus == 0) {
1070 D("req_cpus must be > 0");
1072 } else if (req_cpus >= avail_cpus) {
1073 D("for safety, we need at least one core left in the system");
1076 reg = nmr->nr_flags & NR_REG_MASK;
1077 i = nmr->nr_ringid & NETMAP_RING_MASK;
1079 * ONE_NIC: dedicate one core to one ring. If multiple cores
1080 * are specified, consecutive rings are also polled.
1081 * For example, if ringid=2 and 2 cores are given,
1082 * ring 2 and 3 are polled by core 2 and 3, respectively.
1083 * ALL_NIC: poll all the rings using a core specified by ringid.
1084 * the number of cores must be 1.
1086 if (reg == NR_REG_ONE_NIC) {
1087 if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
1088 D("only %d rings exist (ring %u-%u is given)",
1089 nma_get_nrings(na, NR_RX), i, i+req_cpus);
1093 qlast = qfirst + req_cpus;
1095 } else if (reg == NR_REG_ALL_NIC) {
1096 if (req_cpus != 1) {
1097 D("ncpus must be 1 not %d for REG_ALL_NIC", req_cpus);
1101 qlast = nma_get_nrings(na, NR_RX);
1104 D("reg must be ALL_NIC or ONE_NIC");
1109 bps->qfirst = qfirst;
1111 bps->cpu_from = core_from;
1112 bps->ncpus = req_cpus;
1113 D("%s qfirst %u qlast %u cpu_from %u ncpus %u",
1114 reg == NR_REG_ALL_NIC ? "REG_ALL_NIC" : "REG_ONE_NIC",
1115 qfirst, qlast, core_from, req_cpus);
/* Process NETMAP_BDG_POLLING_ON: configure, create and start the
 * polling kthreads for a bwrap adapter, disabling its interrupts. */
1120 nm_bdg_ctl_polling_start(struct nmreq *nmr, struct netmap_adapter *na)
1122 struct nm_bdg_polling_state *bps;
1123 struct netmap_bwrap_adapter *bna;
1126 bna = (struct netmap_bwrap_adapter *)na;
1127 if (bna->na_polling_state) {
1128 D("ERROR adapter already in polling mode");
1132 bps = nm_os_malloc(sizeof(*bps));
1135 bps->configured = false;
1136 bps->stopped = true;
1138 if (get_polling_cfg(nmr, na, bps)) {
1143 if (nm_bdg_create_kthreads(bps)) {
1148 bps->configured = true;
1149 bna->na_polling_state = bps;
1152 /* disable interrupt if possible */
1153 if (bna->hwna->nm_intr)
1154 bna->hwna->nm_intr(bna->hwna, 0);
1155 /* start kthread now */
1156 error = nm_bdg_polling_start_kthreads(bps);
/* on failure roll everything back and re-enable interrupts
 * (NOTE(review): bps itself appears to leak here — a free(bps) line
 * may be missing from this extraction; verify against upstream) */
1158 D("ERROR nm_bdg_polling_start_kthread()");
1159 nm_os_free(bps->kthreads);
1161 bna->na_polling_state = NULL;
1162 if (bna->hwna->nm_intr)
1163 bna->hwna->nm_intr(bna->hwna, 1);
/* Process NETMAP_BDG_POLLING_OFF: stop/destroy the kthreads, free the
 * polling state and re-enable the NIC interrupts. */
1169 nm_bdg_ctl_polling_stop(struct nmreq *nmr, struct netmap_adapter *na)
1171 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na;
1172 struct nm_bdg_polling_state *bps;
1174 if (!bna->na_polling_state) {
1175 D("ERROR adapter is not in polling mode");
1178 bps = bna->na_polling_state;
1179 nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state);
1180 bps->configured = false;
1182 bna->na_polling_state = NULL;
1183 /* reenable interrupt */
1184 if (bna->hwna->nm_intr)
1185 bna->hwna->nm_intr(bna->hwna, 1);
1189 /* Called by either user's context (netmap_ioctl())
1190 * or external kernel modules (e.g., Openvswitch).
1191 * Operation is indicated in nmr->nr_cmd.
1192 * NETMAP_BDG_OPS that sets configure/lookup/dtor functions to the bridge
1193 * requires bdg_ops argument; the other commands ignore this argument.
1195 * Called without NMG_LOCK.
1198 netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
1200 struct nm_bridge *b, *bridges;
1201 struct netmap_adapter *na;
1202 struct netmap_vp_adapter *vpna;
1203 char *name = nmr->nr_name;
1204 int cmd = nmr->nr_cmd, namelen = strlen(name);
1205 int error = 0, i, j;
1208 netmap_bns_getbridges(&bridges, &num_bridges);
/* NOTE(review): the switch(cmd) opener and the NMG_LOCK/UNLOCK pairs
 * around each case are not visible in this extraction. */
1211 case NETMAP_BDG_NEWIF:
1212 error = netmap_vi_create(nmr, 0 /* no autodelete */);
1215 case NETMAP_BDG_DELIF:
1216 error = nm_vi_destroy(nmr->nr_name);
1219 case NETMAP_BDG_ATTACH:
1220 error = nm_bdg_ctl_attach(nmr);
1223 case NETMAP_BDG_DETACH:
1224 error = nm_bdg_ctl_detach(nmr);
1227 case NETMAP_BDG_LIST:
1228 /* this is used to enumerate bridges and ports */
1229 if (namelen) { /* look up indexes of bridge and port */
1230 if (strncmp(name, NM_BDG_NAME, strlen(NM_BDG_NAME))) {
1235 b = nm_find_bridge(name, 0 /* don't create */);
1243 nmr->nr_arg1 = b - bridges; /* bridge index */
1244 nmr->nr_arg2 = NM_BDG_NOPORT;
1245 for (j = 0; j < b->bdg_active_ports; j++) {
1246 i = b->bdg_port_index[j];
1247 vpna = b->bdg_ports[i];
1249 D("---AAAAAAAAARGH-------");
1252 /* the former and the latter identify a
1253 * virtual port and a NIC, respectively
1255 if (!strcmp(vpna->up.name, name)) {
1256 nmr->nr_arg2 = i; /* port index */
1262 /* return the first non-empty entry starting from
1263 * bridge nr_arg1 and port nr_arg2.
1265 * Users can detect the end of the same bridge by
1266 * seeing the new and old value of nr_arg1, and can
1267 * detect the end of all the bridge by error != 0
1273 for (error = ENOENT; i < NM_BRIDGES; i++) {
1275 for ( ; j < NM_BDG_MAXPORTS; j++) {
1276 if (b->bdg_ports[j] == NULL)
1278 vpna = b->bdg_ports[j];
/* NOTE(review): strncpy here may leave name unterminated if the
 * port name fills IFNAMSIZ; upstream relies on names being shorter */
1279 strncpy(name, vpna->up.name, (size_t)IFNAMSIZ);
1283 j = 0; /* following bridges scan from 0 */
1292 case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */
1293 /* register callbacks to the given bridge.
1294 * nmr->nr_name may be just bridge's name (including ':'
1295 * if it is not just NM_NAME).
1302 b = nm_find_bridge(name, 0 /* don't create */);
1306 b->bdg_ops = *bdg_ops;
1311 case NETMAP_BDG_VNET_HDR:
1312 /* Valid lengths for the virtio-net header are 0 (no header),
1314 if (nmr->nr_arg1 != 0 &&
1315 nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
1316 nmr->nr_arg1 != 12) {
1321 error = netmap_get_bdg_na(nmr, &na, NULL, 0);
1323 vpna = (struct netmap_vp_adapter *)na;
1324 na->virt_hdr_len = nmr->nr_arg1;
1325 if (na->virt_hdr_len) {
1326 vpna->mfs = NETMAP_BUF_SIZE(na);
1328 D("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
1329 netmap_adapter_put(na);
1336 case NETMAP_BDG_POLLING_ON:
1337 case NETMAP_BDG_POLLING_OFF:
1339 error = netmap_get_bdg_na(nmr, &na, NULL, 0);
1341 if (!nm_is_bwrap(na)) {
1343 } else if (cmd == NETMAP_BDG_POLLING_ON) {
1344 error = nm_bdg_ctl_polling_start(nmr, na);
1346 netmap_adapter_get(na);
1348 error = nm_bdg_ctl_polling_stop(nmr, na);
1350 netmap_adapter_put(na);
1352 netmap_adapter_put(na);
1358 D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
/* Forward a configuration request to the bridge's config callback,
 * if one is registered via NETMAP_BDG_REGOPS. */
1366 netmap_bdg_config(struct nmreq *nmr)
1368 struct nm_bridge *b;
1372 b = nm_find_bridge(nmr->nr_name, 0);
1378 /* Don't call config() with NMG_LOCK() held */
1380 if (b->bdg_ops.config != NULL)
1381 error = b->bdg_ops.config((struct nm_ifreq *)nmr);
1387 /* nm_krings_create callback for VALE ports.
1388 * Calls the standard netmap_krings_create, then adds leases on rx
1389 * rings and bdgfwd on tx rings.
1392 netmap_vp_krings_create(struct netmap_adapter *na)
1397 u_int nrx = netmap_real_rings(na, NR_RX);
1400 * Leases are attached to RX rings on vale ports
1402 tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
/* tailroom is carved out of the same allocation as the krings */
1404 error = netmap_krings_create(na, tailroom);
1408 leases = na->tailroom;
1410 for (i = 0; i < nrx; i++) { /* Receive rings */
1411 na->rx_rings[i].nkr_leases = leases;
1412 leases += na->num_rx_desc;
1415 error = nm_alloc_bdgfwd(na);
/* undo the kring creation if the forwarding tables can't be allocated */
1417 netmap_krings_delete(na);
1425 /* nm_krings_delete callback for VALE ports. */
1427 netmap_vp_krings_delete(struct netmap_adapter *na)
/* NOTE(review): the nm_free_bdgfwd(na) call that should precede this
 * is not visible in this extraction. */
1430 netmap_krings_delete(na);
/* forward declaration: flush up to n entries of ft out of ring ring_nr */
1435 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
1436 struct netmap_vp_adapter *na, u_int ring_nr);
1440 * main dispatch routine for the bridge.
1441 * Grab packets from a kring, move them into the ft structure
1442 * associated to the tx (input) port. Max one instance per port,
1443 * filtered on input (ioctl, poll or XXX).
1444 * Returns the next position in the ring.
1447 nm_bdg_preflush(struct netmap_kring *kring, u_int end)
1449 struct netmap_vp_adapter *na =
1450 (struct netmap_vp_adapter*)kring->na;
1451 struct netmap_ring *ring = kring->ring;
1452 struct nm_bdg_fwd *ft;
1453 u_int ring_nr = kring->ring_id;
1454 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
1455 u_int ft_i = 0; /* start from 0 */
1456 u_int frags = 1; /* how many frags ? */
1457 struct nm_bridge *b = na->na_bdg;
1459 /* To protect against modifications to the bridge we acquire a
1460 * shared lock, waiting if we can sleep (if the source port is
1461 * attached to a user process) or with a trylock otherwise (NICs).
1463 ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
1464 if (na->up.na_flags & NAF_BDG_MAYSLEEP)
1466 else if (!BDG_RTRYLOCK(b))
1468 ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
/* walk the ring from hwcur to end, filling the forwarding table */
1471 for (; likely(j != end); j = nm_next(j, lim)) {
1472 struct netmap_slot *slot = &ring->slot[j];
1475 ft[ft_i].ft_len = slot->len;
1476 ft[ft_i].ft_flags = slot->flags;
1478 ND("flags is 0x%x", slot->flags);
1479 /* we do not use the buf changed flag, but we still need to reset it */
1480 slot->flags &= ~NS_BUF_CHANGED;
1482 /* this slot goes into a list so initialize the link field */
1483 ft[ft_i].ft_next = NM_FT_NULL;
1484 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
1485 (void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
/* defensive: substitute a zero-length buffer for a NULL pointer */
1486 if (unlikely(buf == NULL)) {
1487 RD(5, "NULL %s buffer pointer from %s slot %d len %d",
1488 (slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
1489 kring->name, j, ft[ft_i].ft_len);
1490 buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
1491 ft[ft_i].ft_len = 0;
1492 ft[ft_i].ft_flags = 0;
1494 __builtin_prefetch(buf);
/* multi-segment packets: count frags until NS_MOREFRAG clears,
 * then record the count in the first fragment's ft entry */
1496 if (slot->flags & NS_MOREFRAG) {
1500 if (unlikely(netmap_verbose && frags > 1))
1501 RD(5, "%d frags at %d", frags, ft_i - frags);
1502 ft[ft_i - frags].ft_frags = frags;
/* flush once the configured batch size is reached */
1504 if (unlikely((int)ft_i >= bridge_batch))
1505 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1508 /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
1509 * have to fix frags count. */
1511 ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
1512 ft[ft_i - frags].ft_frags = frags;
1513 D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
/* final flush of whatever is left in the table */
1516 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1522 /* ----- FreeBSD if_bridge hash function ------- */
1525 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
1526 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
1528 * http://www.burtleburtle.net/bob/hash/spooky.html
/* Jenkins mix step: reversibly mixes three 32-bit words. */
1530 #define mix(a, b, c) \
1532 a -= b; a -= c; a ^= (c >> 13); \
1533 b -= c; b -= a; b ^= (a << 8); \
1534 c -= a; c -= b; c ^= (b >> 13); \
1535 a -= b; a -= c; a ^= (c >> 12); \
1536 b -= c; b -= a; b ^= (a << 16); \
1537 c -= a; c -= b; c ^= (b >> 5); \
1538 a -= b; a -= c; a ^= (c >> 3); \
1539 b -= c; b -= a; b ^= (a << 10); \
1540 c -= a; c -= b; c ^= (b >> 15); \
1541 } while (/*CONSTCOND*/0)
/* Hash a 6-byte MAC address into a bridge routing-table index
 * (the byte-loading steps are elided from this view). */
1544 static __inline uint32_t
1545 nm_bridge_rthash(const uint8_t *addr)
1547 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key
1557 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1)
1558 return (c & BRIDGE_RTHASH_MASK);
1564 /* nm_register callback for VALE ports */
1566 netmap_vp_reg(struct netmap_adapter *na, int onoff)
1568 struct netmap_vp_adapter *vpna =
1569 (struct netmap_vp_adapter*)na;
1573 /* persistent ports may be put in netmap mode
1574 * before being attached to a bridge
/* Take the bridge lock in exclusive mode while toggling ring state,
 * so no forwarding runs concurrently with the mode change. */
1577 BDG_WLOCK(vpna->na_bdg);
/* onoff path: turn on every kring with a pending-on request
 * (the +1 covers the host ring). */
1580 for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
1581 struct netmap_kring *kring = &NMR(na, t)[i];
1583 if (nm_kring_pending_on(kring))
1584 kring->nr_mode = NKR_NETMAP_ON;
1587 if (na->active_fds == 0)
1588 na->na_flags |= NAF_NETMAP_ON;
1589 /* XXX on FreeBSD, persistent VALE ports should also
1590 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
/* off path: mirror of the above, clearing flags and modes. */
1593 if (na->active_fds == 0)
1594 na->na_flags &= ~NAF_NETMAP_ON;
1596 for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
1597 struct netmap_kring *kring = &NMR(na, t)[i];
1599 if (nm_kring_pending_off(kring))
1600 kring->nr_mode = NKR_NETMAP_OFF;
1605 BDG_WUNLOCK(vpna->na_bdg);
1611 * Lookup function for a learning bridge.
1612 * Update the hash table with the source address,
1613 * and then returns the destination port index, and the
1614 * ring in *dst_ring (at the moment, always use ring 0)
1617 netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
1618 struct netmap_vp_adapter *na)
1620 uint8_t *buf = ft->ft_buf;
1621 u_int buf_len = ft->ft_len;
1622 struct nm_hash_ent *ht = na->na_bdg->ht;
1624 u_int dst, mysrc = na->bdg_port;
1625 uint64_t smac, dmac;
1628 /* safety check, unfortunately we have many cases */
1629 if (buf_len >= 14 + na->up.virt_hdr_len) {
1630 /* virthdr + mac_hdr in the same slot */
1631 buf += na->up.virt_hdr_len;
1632 buf_len -= na->up.virt_hdr_len;
1633 } else if (buf_len == na->up.virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
1634 /* only header in first fragment */
1637 buf_len = ft->ft_len;
1639 RD(5, "invalid buf format, length %d", buf_len);
1640 return NM_BDG_NOPORT;
/* Indirect buffers live in userspace; copy the header into a
 * local bounce buffer before parsing. Failure -> drop. */
1643 if (ft->ft_flags & NS_INDIRECT) {
1644 if (copyin(buf, indbuf, sizeof(indbuf))) {
1645 return NM_BDG_NOPORT;
/* Extract destination and source MACs from the Ethernet header.
 * NOTE(review): unaligned uint64_t loads through a cast — presumably
 * the platform tolerates this (x86) or buffers are aligned; verify
 * on strict-alignment targets. A shift of smac likely follows in an
 * elided line. */
1650 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
1651 smac = le64toh(*(uint64_t *)(buf + 4));
1655 * The hash is somewhat expensive, there might be some
1656 * worthwhile optimizations here.
/* Learn the source address: non-multicast src (low bit of first
 * octet clear) that differs from the last one seen on this port. */
1658 if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
1660 sh = nm_bridge_rthash(s); // XXX hash of source
1661 /* update source port forwarding entry */
1662 na->last_smac = ht[sh].mac = smac; /* XXX expire ? */
1663 ht[sh].ports = mysrc;
1665 D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
1666 s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
/* Default to broadcast; refine to a learned port for unicast dst. */
1668 dst = NM_BDG_BROADCAST;
1669 if ((buf[0] & 1) == 0) { /* unicast */
1670 dh = nm_bridge_rthash(buf); // XXX hash of dst
1671 if (ht[dh].mac == dmac) { /* found dst */
1674 /* XXX otherwise return NM_BDG_UNKNOWN ? */
1681 * Available space in the ring. Only used in VALE code
1682 * and only with is_rx = 1
1684 static inline uint32_t
1685 nm_kr_space(struct netmap_kring *k, int is_rx)
/* busy = slots between hwcur and hwlease (already leased, not yet
 * reported); wrap-around handled by adding the ring size. */
1690 int busy = k->nkr_hwlease - k->nr_hwcur;
1692 busy += k->nkr_num_slots;
/* Keep one slot unused, as usual in ring buffers. */
1693 space = k->nkr_num_slots - 1 - busy;
1695 /* XXX never used in this branch */
1696 space = k->nr_hwtail - k->nkr_hwlease;
1698 space += k->nkr_num_slots;
/* Sanity checks: every index must stay below the ring size. */
1702 if (k->nkr_hwlease >= k->nkr_num_slots ||
1703 k->nr_hwcur >= k->nkr_num_slots ||
1704 k->nr_tail >= k->nkr_num_slots ||
1706 busy >= k->nkr_num_slots) {
1707 D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1708 k->nkr_lease_idx, k->nkr_num_slots);
1717 /* make a lease on the kring for N positions. return the
1719 * XXX only used in VALE code and with is_rx = 1
1721 static inline uint32_t
1722 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
1724 uint32_t lim = k->nkr_num_slots - 1;
1725 uint32_t lease_idx = k->nkr_lease_idx;
/* Mark this lease entry as outstanding and advance the lease index. */
1727 k->nkr_leases[lease_idx] = NR_NOSLOT;
1728 k->nkr_lease_idx = nm_next(lease_idx, lim);
/* Caller must never ask for more than the available space. */
1730 if (n > nm_kr_space(k, is_rx)) {
1731 D("invalid request for %d slots", n);
1734 /* XXX verify that there are n slots */
/* Reserve n slots, wrapping hwlease around the ring. */
1735 k->nkr_hwlease += n;
1736 if (k->nkr_hwlease > lim)
1737 k->nkr_hwlease -= lim + 1;
/* Sanity checks mirroring nm_kr_space(). */
1739 if (k->nkr_hwlease >= k->nkr_num_slots ||
1740 k->nr_hwcur >= k->nkr_num_slots ||
1741 k->nr_hwtail >= k->nkr_num_slots ||
1742 k->nkr_lease_idx >= k->nkr_num_slots) {
1743 D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
1745 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1746 k->nkr_lease_idx, k->nkr_num_slots);
1753 * This flush routine supports only unicast and broadcast but a large
1754 * number of ports, and lets us replace the learn and dispatch functions.
1757 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
1760 struct nm_bdg_q *dst_ents, *brddst;
1761 uint16_t num_dsts = 0, *dsts;
1762 struct nm_bridge *b = na->na_bdg;
1763 u_int i, me = na->bdg_port;
1766 * The work area (pointed by ft) is followed by an array of
1767 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS
1768 * queues per port plus one for the broadcast traffic.
1769 * Then we have an array of destination indexes.
1771 dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
1772 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
1774 /* first pass: find a destination for each packet in the batch */
1775 for (i = 0; likely(i < n); i += ft[i].ft_frags) {
1776 uint8_t dst_ring = ring_nr; /* default, same ring as origin */
1777 uint16_t dst_port, d_i;
1780 ND("slot %d frags %d", i, ft[i].ft_frags);
1781 /* Drop the packet if the virtio-net header is not into the first
1782 fragment nor at the very beginning of the second. */
1783 if (unlikely(na->up.virt_hdr_len > ft[i].ft_len))
/* Ask the bridge lookup (e.g. netmap_bdg_learning) where to send. */
1785 dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na);
1786 if (netmap_verbose > 255)
1787 RD(5, "slot %d port %d -> %d", i, me, dst_port);
1788 if (dst_port == NM_BDG_NOPORT)
1789 continue; /* this packet is identified to be dropped */
1790 else if (unlikely(dst_port > NM_BDG_MAXPORTS))
1792 else if (dst_port == NM_BDG_BROADCAST)
1793 dst_ring = 0; /* broadcasts always go to ring 0 */
/* Never loop traffic back to the sending port; skip dead ports. */
1794 else if (unlikely(dst_port == me ||
1795 !b->bdg_ports[dst_port]))
1798 /* get a position in the scratch pad */
1799 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
1802 /* append the first fragment to the list */
1803 if (d->bq_head == NM_FT_NULL) { /* new destination */
1804 d->bq_head = d->bq_tail = i;
1805 /* remember this position to be scanned later */
1806 if (dst_port != NM_BDG_BROADCAST)
1807 dsts[num_dsts++] = d_i;
1809 ft[d->bq_tail].ft_next = i;
1812 d->bq_len += ft[i].ft_frags;
1816 * Broadcast traffic goes to ring 0 on all destinations.
1817 * So we need to add these rings to the list of ports to scan.
1818 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
1819 * expensive. We should keep a compact list of active destinations
1820 * so we could shorten this loop.
1822 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
1823 if (brddst->bq_head != NM_FT_NULL) {
1825 for (j = 0; likely(j < b->bdg_active_ports); j++) {
1827 i = b->bdg_port_index[j];
1828 if (unlikely(i == me))
1830 d_i = i * NM_BDG_MAXRINGS;
1831 if (dst_ents[d_i].bq_head == NM_FT_NULL)
1832 dsts[num_dsts++] = d_i;
1836 ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
1837 /* second pass: scan destinations */
1838 for (i = 0; i < num_dsts; i++) {
1839 struct netmap_vp_adapter *dst_na;
1840 struct netmap_kring *kring;
1841 struct netmap_ring *ring;
1842 u_int dst_nr, lim, j, d_i, next, brd_next;
1843 u_int needed, howmany;
1844 int retry = netmap_txsync_retry;
1846 uint32_t my_start = 0, lease_idx = 0;
1848 int virt_hdr_mismatch = 0;
1851 ND("second pass %d port %d", i, d_i);
1853 // XXX fix the division
1854 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
1855 /* protect from the lookup function returning an inactive
1858 if (unlikely(dst_na == NULL))
1860 if (dst_na->up.na_flags & NAF_SW_ONLY)
1863 * The interface may be in !netmap mode in two cases:
1864 * - when na is attached but not activated yet;
1865 * - when na is being deactivated but is still attached.
1867 if (unlikely(!nm_netmap_on(&dst_na->up))) {
1868 ND("not in netmap mode!");
1872 /* there is at least one either unicast or broadcast packet */
1873 brd_next = brddst->bq_head;
1875 /* we need to reserve this many slots. If fewer are
1876 * available, some packets will be dropped.
1877 * Packets may have multiple fragments, so there is
1878 * a chance that we may not use all of the slots
1879 * we have claimed, so we will need to handle the leftover
1880 * ones when we regain the lock.
1882 needed = d->bq_len + brddst->bq_len;
1884 if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
1885 if (netmap_verbose) {
1886 RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
1887 dst_na->up.virt_hdr_len);
1889 /* There is a virtio-net header/offloadings mismatch between
1890 * source and destination. The slower mismatch datapath will
1891 * be used to cope with all the mismatches.
1893 virt_hdr_mismatch = 1;
1894 if (dst_na->mfs < na->mfs) {
1895 /* We may need to do segmentation offloadings, and so
1896 * we may need a number of destination slots greater
1897 * than the number of input slots ('needed').
1898 * We look for the smallest integer 'x' which satisfies:
1899 * needed * na->mfs + x * H <= x * na->mfs
1900 * where 'H' is the length of the longest header that may
1901 * be replicated in the segmentation process (e.g. for
1902 * TCPv4 we must account for ethernet header, IP header
1903 * and TCPv4 header).
1905 needed = (needed * na->mfs) /
1906 (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
1907 ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
1911 ND(5, "pass 2 dst %d is %x %s",
1912 i, d_i, is_vp ? "virtual" : "nic/host");
/* Map the destination index back to a concrete rx ring. */
1913 dst_nr = d_i & (NM_BDG_MAXRINGS-1);
1914 nrings = dst_na->up.num_rx_rings;
1915 if (dst_nr >= nrings)
1916 dst_nr = dst_nr % nrings;
1917 kring = &dst_na->up.rx_rings[dst_nr];
1919 lim = kring->nkr_num_slots - 1;
1923 if (dst_na->retry && retry) {
1924 /* try to get some free slot from the previous run */
1925 kring->nm_notify(kring, 0);
1926 /* actually useful only for bwraps, since there
1927 * the notify will trigger a txsync on the hwna. VALE ports
1928 * have dst_na->retry == 0
1931 /* reserve the buffers in the queue and an entry
1932 * to report completion, and drop lock.
1933 * XXX this might become a helper function.
1935 mtx_lock(&kring->q_lock);
1936 if (kring->nkr_stopped) {
1937 mtx_unlock(&kring->q_lock);
1940 my_start = j = kring->nkr_hwlease;
1941 howmany = nm_kr_space(kring, 1);
1942 if (needed < howmany)
1944 lease_idx = nm_kr_lease(kring, howmany, 1);
1945 mtx_unlock(&kring->q_lock);
1947 /* only retry if we need more than available slots */
1948 if (retry && needed <= howmany)
1951 /* copy to the destination queue */
1952 while (howmany > 0) {
1953 struct netmap_slot *slot;
1954 struct nm_bdg_fwd *ft_p, *ft_end;
1957 /* find the queue from which we pick next packet.
1958 * NM_FT_NULL is always higher than valid indexes
1959 * so we never dereference it if the other list
1960 * has packets (and if both are empty we never
1963 if (next < brd_next) {
1965 next = ft_p->ft_next;
1966 } else { /* insert broadcast */
1967 ft_p = ft + brd_next;
1968 brd_next = ft_p->ft_next;
1970 cnt = ft_p->ft_frags; // cnt > 0
1971 if (unlikely(cnt > howmany))
1972 break; /* no more space */
1973 if (netmap_verbose && cnt > 1)
1974 RD(5, "rx %d frags to %d", cnt, j);
1975 ft_end = ft_p + cnt;
1976 if (unlikely(virt_hdr_mismatch)) {
1977 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
/* Fast path: copy each fragment straight into the
 * destination ring slots. */
1981 char *dst, *src = ft_p->ft_buf;
1982 size_t copy_len = ft_p->ft_len, dst_len = copy_len;
1984 slot = &ring->slot[j];
1985 dst = NMB(&dst_na->up, slot);
1987 ND("send [%d] %d(%d) bytes at %s:%d",
1988 i, (int)copy_len, (int)dst_len,
1989 NM_IFPNAME(dst_ifp), j);
1990 /* round to a multiple of 64 */
1991 copy_len = (copy_len + 63) & ~63;
1993 if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
1994 copy_len > NETMAP_BUF_SIZE(&na->up))) {
1995 RD(5, "invalid len %d, down to 64", (int)copy_len);
1996 copy_len = dst_len = 64; // XXX
1998 if (ft_p->ft_flags & NS_INDIRECT) {
1999 if (copyin(src, dst, copy_len)) {
2000 // invalid user pointer, pretend len is 0
2004 //memcpy(dst, src, copy_len);
2005 pkt_copy(src, dst, (int)copy_len);
2007 slot->len = dst_len;
2008 slot->flags = (cnt << 8)| NS_MOREFRAG;
2009 j = nm_next(j, lim);
2012 } while (ft_p != ft_end);
2013 slot->flags = (cnt << 8); /* clear flag on last entry */
2016 if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
2020 /* current position */
2021 uint32_t *p = kring->nkr_leases; /* shorthand */
2022 uint32_t update_pos;
2023 int still_locked = 1;
2025 mtx_lock(&kring->q_lock);
2026 if (unlikely(howmany > 0)) {
2027 /* not used all bufs. If i am the last one
2028 * i can recover the slots, otherwise must
2029 * fill them with 0 to mark empty packets.
2031 ND("leftover %d bufs", howmany);
2032 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
2033 /* yes i am the last one */
2034 ND("roll back nkr_hwlease to %d", j);
2035 kring->nkr_hwlease = j;
2037 while (howmany-- > 0) {
2038 ring->slot[j].len = 0;
2039 ring->slot[j].flags = 0;
2040 j = nm_next(j, lim);
2044 p[lease_idx] = j; /* report I am done */
2046 update_pos = kring->nr_hwtail;
2048 if (my_start == update_pos) {
2049 /* all slots before my_start have been reported,
2050 * so scan subsequent leases to see if other ranges
2051 * have been completed, and to a selwakeup or txsync.
2053 while (lease_idx != kring->nkr_lease_idx &&
2054 p[lease_idx] != NR_NOSLOT) {
2056 p[lease_idx] = NR_NOSLOT;
2057 lease_idx = nm_next(lease_idx, lim);
2059 /* j is the new 'write' position. j != my_start
2060 * means there are new buffers to report
2062 if (likely(j != my_start)) {
2063 kring->nr_hwtail = j;
2065 mtx_unlock(&kring->q_lock);
2066 kring->nm_notify(kring, 0);
2067 /* this is netmap_notify for VALE ports and
2068 * netmap_bwrap_notify for bwrap. The latter will
2069 * trigger a txsync on the underlying hwna
2071 if (dst_na->retry && retry--) {
2072 /* XXX this is going to call nm_notify again.
2073 * Only useful for bwrap in virtual machines
2080 mtx_unlock(&kring->q_lock);
2083 d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
2086 brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
2091 /* nm_txsync callback for VALE ports */
2093 netmap_vp_txsync(struct netmap_kring *kring, int flags)
2095 struct netmap_vp_adapter *na =
2096 (struct netmap_vp_adapter *)kring->na;
2098 u_int const lim = kring->nkr_num_slots - 1;
2099 u_int const head = kring->rhead;
2101 if (bridge_batch <= 0) { /* testing only */
2102 done = head; // used all
/* Clamp the sysctl-controlled batch size to the compile-time max. */
2109 if (bridge_batch > NM_BDG_BATCH)
2110 bridge_batch = NM_BDG_BATCH;
/* Forward everything up to 'head' through the bridge. */
2112 done = nm_bdg_preflush(kring, head);
2115 D("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
2117 * packets between 'done' and 'cur' are left unsent.
/* Advance hwcur to what was actually forwarded; hwtail trails by one. */
2119 kring->nr_hwcur = done;
2120 kring->nr_hwtail = nm_prev(done, lim);
2122 D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
2127 /* rxsync code used by VALE ports nm_rxsync callback and also
2128 * internally by the bwrap
2131 netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
2133 struct netmap_adapter *na = kring->na;
2134 struct netmap_ring *ring = kring->ring;
2135 u_int nm_i, lim = kring->nkr_num_slots - 1;
2136 u_int head = kring->rhead;
2140 D("ouch dangerous reset!!!");
2141 n = netmap_ring_reinit(kring);
2145 /* First part, import newly received packets. */
2146 /* actually nothing to do here, they are already in the kring */
2148 /* Second part, skip past packets that userspace has released. */
2149 nm_i = kring->nr_hwcur;
2151 /* consistency check, but nothing really important here */
2152 for (n = 0; likely(nm_i != head); n++) {
2153 struct netmap_slot *slot = &ring->slot[nm_i];
2154 void *addr = NMB(na, slot);
2156 if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
2157 D("bad buffer index %d, ignore ?",
2160 slot->flags &= ~NS_BUF_CHANGED;
2161 nm_i = nm_next(nm_i, lim);
2163 kring->nr_hwcur = head;
2172 * nm_rxsync callback for VALE ports
2173 * user process reading from a VALE switch.
2174 * Already protected against concurrent calls from userspace,
2175 * but we must acquire the queue's lock to protect against
2176 * writers on the same queue.
2179 netmap_vp_rxsync(struct netmap_kring *kring, int flags)
/* Thin locking wrapper around netmap_vp_rxsync_locked(). */
2183 mtx_lock(&kring->q_lock);
2184 n = netmap_vp_rxsync_locked(kring, flags);
2185 mtx_unlock(&kring->q_lock);
2190 /* nm_bdg_attach callback for VALE ports
2191 * The na_vp port is this same netmap_adapter. There is no host port.
2194 netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
2196 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
/* NOTE(review): strncpy does not guarantee NUL-termination if 'name'
 * fills the buffer; presumably na->name sizing makes this safe here. */
2201 strncpy(na->name, name, sizeof(na->name));
2202 na->na_hostvp = NULL;
2206 /* create a netmap_vp_adapter that describes a VALE port.
2207 * Only persistent VALE ports have a non-null ifp.
2210 netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp,
2211 struct netmap_mem_d *nmd,
2212 struct netmap_vp_adapter **ret)
2214 struct netmap_vp_adapter *vpna;
2215 struct netmap_adapter *na;
2219 vpna = nm_os_malloc(sizeof(*vpna))
2226 strncpy(na->name, nmr->nr_name, sizeof(na->name));
2228 /* bound checking */
2229 na->num_tx_rings = nmr->nr_tx_rings;
2230 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
2231 nmr->nr_tx_rings = na->num_tx_rings; // write back
2232 na->num_rx_rings = nmr->nr_rx_rings;
2233 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
2234 nmr->nr_rx_rings = na->num_rx_rings; // write back
2235 nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
2236 1, NM_BDG_MAXSLOTS, NULL);
2237 na->num_tx_desc = nmr->nr_tx_slots;
2238 nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
2239 1, NM_BDG_MAXSLOTS, NULL);
2240 /* validate number of pipes. We want at least 1,
2241 * but probably can do with some more.
2242 * So let's use 2 as default (when 0 is supplied)
2244 npipes = nmr->nr_arg1;
2245 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
2246 nmr->nr_arg1 = npipes; /* write back */
2247 /* validate extra bufs */
2248 nm_bound_var(&nmr->nr_arg3, 0, 0,
2249 128*NM_BDG_MAXSLOTS, NULL);
2250 na->num_rx_desc = nmr->nr_rx_slots;
/* ~0 so the first real source MAC always differs (see learning fn). */
2252 vpna->last_smac = ~0llu;
2253 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero??
2254 vpna->mfs = netmap_buf_size; */
2256 D("max frame size %u", vpna->mfs);
2258 na->na_flags |= NAF_BDG_MAYSLEEP;
2259 /* persistent VALE ports look like hw devices
2260 * with a native netmap adapter
2263 na->na_flags |= NAF_NATIVE;
/* Wire up the VALE-port callbacks defined above in this file. */
2264 na->nm_txsync = netmap_vp_txsync;
2265 na->nm_rxsync = netmap_vp_rxsync;
2266 na->nm_register = netmap_vp_reg;
2267 na->nm_krings_create = netmap_vp_krings_create;
2268 na->nm_krings_delete = netmap_vp_krings_delete;
2269 na->nm_dtor = netmap_vp_dtor;
2270 D("nr_arg2 %d", nmr->nr_arg2);
/* NOTE(review): this looks like the right-hand side of a ternary,
 * "na->nm_mem = nmd ? netmap_mem_get(nmd) : netmap_mem_private_new(...)";
 * the assignment line appears to be missing from this extraction. */
2272 netmap_mem_get(nmd):
2273 netmap_mem_private_new(
2274 na->num_tx_rings, na->num_tx_desc,
2275 na->num_rx_rings, na->num_rx_desc,
2276 nmr->nr_arg3, npipes, &error);
2277 if (na->nm_mem == NULL)
2279 na->nm_bdg_attach = netmap_vp_bdg_attach;
2280 /* other nmd fields are set in the common routine */
2281 error = netmap_attach_common(na);
/* Error path: release the memory allocator reference. */
2288 if (na->nm_mem != NULL)
2289 netmap_mem_put(na->nm_mem);
2294 /* Bridge wrapper code (bwrap).
2295 * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
2297 * The main task is to swap the meaning of tx and rx rings to match the
2298 * expectations of the VALE switch code (see nm_bdg_flush).
2300 * The bwrap works by interposing a netmap_bwrap_adapter between the
2301 * rest of the system and the hwna. The netmap_bwrap_adapter looks like
2302 * a netmap_vp_adapter to the rest of the system, but, internally, it
2303 * translates all callbacks to what the hwna expects.
2305 * Note that we have to intercept callbacks coming from two sides:
2307 * - callbacks coming from the netmap module are intercepted by
2308 * passing around the netmap_bwrap_adapter instead of the hwna
2310 * - callbacks coming from outside of the netmap module only know
2311 * about the hwna. This, however, only happens in interrupt
2312 * handlers, where only the hwna->nm_notify callback is called.
2313 * What the bwrap does is to overwrite the hwna->nm_notify callback
2314 * with its own netmap_bwrap_intr_notify.
2315 * XXX This assumes that the hwna->nm_notify callback was the
2316 * standard netmap_notify(), as it is the case for nic adapters.
2317 * Any additional action performed by hwna->nm_notify will not be
2318 * performed by netmap_bwrap_intr_notify.
2320 * Additionally, the bwrap can optionally attach the host rings pair
2321 * of the wrapped adapter to a different port of the switch.
/* Destructor for a bwrap adapter: detach from the bridge(s) and
 * release the reference on the wrapped hardware adapter. */
2326 netmap_bwrap_dtor(struct netmap_adapter *na)
2328 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
2329 struct netmap_adapter *hwna = bna->hwna;
2330 struct nm_bridge *b = bna->up.na_bdg,
2331 *bh = bna->host.na_bdg;
2333 netmap_mem_put(bna->host.up.nm_mem);
/* Detach both the main port and (if attached) the host port. */
2336 netmap_bdg_detach_common(b, bna->up.bdg_port,
2337 (bh ? bna->host.bdg_port : -1));
/* Break the cross-links so the hwna no longer points at us. */
2342 bna->host.up.ifp = NULL;
2343 hwna->na_private = NULL;
2344 hwna->na_vp = hwna->na_hostvp = NULL;
2345 hwna->na_flags &= ~NAF_BUSY;
2346 netmap_adapter_put(hwna);
2352 * Intr callback for NICs connected to a bridge.
2353 * Simply ignore tx interrupts (maybe we could try to recover space ?)
2354 * and pass received packets from nic to the bridge.
2356 * XXX TODO check locking: this is called from the interrupt
2357 * handler so we should make sure that the interface is not
2358 * disconnected while passing down an interrupt.
2360 * Note, no user process can access this NIC or the host stack.
2361 * The only part of the ring that is significant are the slots,
2362 * and head/cur/tail are set from the kring as needed
2363 * (part as a receive ring, part as a transmit ring).
2365 * callback that overwrites the hwna notify callback.
2366 * Packets come from the outside or from the host stack and are put on an
2368 * The bridge wrapper then sends the packets through the bridge.
2371 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
2373 struct netmap_adapter *na = kring->na;
2374 struct netmap_bwrap_adapter *bna = na->na_private;
2375 struct netmap_kring *bkring;
2376 struct netmap_vp_adapter *vpna = &bna->up;
2377 u_int ring_nr = kring->ring_id;
2378 int ret = NM_IRQ_COMPLETED;
2382 D("%s %s 0x%x", na->name, kring->name, flags);
/* The hwna rx ring maps to the bwrap's tx ring of the same index. */
2384 bkring = &vpna->up.tx_rings[ring_nr];
2386 /* make sure the ring is not disabled */
2387 if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) {
2392 D("%s head %d cur %d tail %d", na->name,
2393 kring->rhead, kring->rcur, kring->rtail);
2395 /* simulate a user wakeup on the rx ring
2396 * fetch packets that have arrived.
2398 error = kring->nm_sync(kring, 0);
2401 if (kring->nr_hwcur == kring->nr_hwtail) {
2403 D("how strange, interrupt with no packets on %s",
2408 /* new packets are kring->rcur to kring->nr_hwtail, and the bkring
2409 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
2410 * to push all packets out.
2412 bkring->rhead = bkring->rcur = kring->nr_hwtail;
2414 netmap_vp_txsync(bkring, flags);
2416 /* mark all buffers as released on this ring */
2417 kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
2418 /* another call to actually release the buffers */
2419 error = kring->nm_sync(kring, 0);
2421 /* The second rxsync may have further advanced hwtail. If this happens,
2422 * return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */
2423 if (kring->rcur != kring->nr_hwtail) {
2424 ret = NM_IRQ_RESCHED;
2429 return error ? error : ret;
2433 /* nm_register callback for bwrap */
2435 netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
2437 struct netmap_bwrap_adapter *bna =
2438 (struct netmap_bwrap_adapter *)na;
2439 struct netmap_adapter *hwna = bna->hwna;
2440 struct netmap_vp_adapter *hostna = &bna->host;
2444 ND("%s %s", na->name, onoff ? "on" : "off");
2447 /* netmap_do_regif has been called on the bwrap na.
2448 * We need to pass the information about the
2449 * memory allocator down to the hwna before
2450 * putting it in netmap mode
2452 hwna->na_lut = na->na_lut;
2454 if (hostna->na_bdg) {
2455 /* if the host rings have been attached to switch,
2456 * we need to copy the memory allocator information
2457 * in the hostna also
2459 hostna->up.na_lut = na->na_lut;
2462 /* cross-link the netmap rings
2463 * The original number of rings comes from hwna,
2464 * rx rings on one side equals tx rings on the other.
2467 enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2468 for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
2469 NMR(hwna, r)[i].ring = NMR(na, t)[i].ring;
2473 if (na->na_flags & NAF_HOST_RINGS) {
2474 struct netmap_adapter *hna = &hostna->up;
2475 /* the hostna rings are the host rings of the bwrap.
2476 * The corresponding krings must point back to the
2479 hna->tx_rings = &na->tx_rings[na->num_tx_rings];
2480 hna->tx_rings[0].na = hna;
2481 hna->rx_rings = &na->rx_rings[na->num_rx_rings];
2482 hna->rx_rings[0].na = hna;
2486 /* pass down the pending ring state information */
2488 for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
2489 NMR(hwna, t)[i].nr_pending_mode =
2490 NMR(na, t)[i].nr_pending_mode;
2493 /* forward the request to the hwna */
2494 error = hwna->nm_register(hwna, onoff);
2498 /* copy up the current ring state information */
2500 for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
2501 NMR(na, t)[i].nr_mode =
2502 NMR(hwna, t)[i].nr_mode;
2505 /* impersonate a netmap_vp_adapter */
2506 netmap_vp_reg(na, onoff);
2508 netmap_vp_reg(&hostna->up, onoff);
2512 /* intercept the hwna nm_nofify callback on the hw rings */
2513 for (i = 0; i < hwna->num_rx_rings; i++) {
2514 hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
2515 hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
2517 i = hwna->num_rx_rings; /* for safety */
2518 /* save the host ring notify unconditionally */
2519 hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
2520 if (hostna->na_bdg) {
2521 /* also intercept the host ring notify */
2522 hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
2524 if (na->active_fds == 0)
2525 na->na_flags |= NAF_NETMAP_ON;
/* off path: clear the flag and restore the saved notify callbacks. */
2529 if (na->active_fds == 0)
2530 na->na_flags &= ~NAF_NETMAP_ON;
2532 /* reset all notify callbacks (including host ring) */
2533 for (i = 0; i <= hwna->num_rx_rings; i++) {
2534 hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify;
2535 hwna->rx_rings[i].save_notify = NULL;
/* Drop the memory lookup table references on the hwna. */
2537 hwna->na_lut.lut = NULL;
2538 hwna->na_lut.objtotal = 0;
2539 hwna->na_lut.objsize = 0;
2545 /* nm_config callback for bwrap.
 * Report the bwrap's geometry by querying the wrapped hwna and
 * swapping tx <-> rx: the bwrap's tx side faces the hwna's rx side
 * and vice versa (see nm_bdg_flush / the bwrap comment above). */
2547 netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
2548 u_int *rxr, u_int *rxd)
2550 struct netmap_bwrap_adapter *bna =
2551 (struct netmap_bwrap_adapter *)na;
2552 struct netmap_adapter *hwna = bna->hwna;
2554 /* forward the request */
2555 netmap_update_config(hwna);
2556 /* swap the results */
2557 *txr = hwna->num_rx_rings;
2558 *txd = hwna->num_rx_desc;
2559 *rxr = hwna->num_tx_rings;
/* BUGFIX: was hwna->num_rx_desc, breaking the tx/rx swap symmetry —
 * the bwrap rx descriptor count must mirror the hwna TX side. */
2560 *rxd = hwna->num_tx_desc;
2566 /* nm_krings_create callback for bwrap */
2568 netmap_bwrap_krings_create(struct netmap_adapter *na)
2570 struct netmap_bwrap_adapter *bna =
2571 (struct netmap_bwrap_adapter *)na;
2572 struct netmap_adapter *hwna = bna->hwna;
2578 /* impersonate a netmap_vp_adapter */
2579 error = netmap_vp_krings_create(na);
2583 /* also create the hwna krings */
2584 error = hwna->nm_krings_create(hwna);
2586 goto err_del_vp_rings;
2589 /* get each ring slot number from the corresponding hwna ring */
2591 enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2592 for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
2593 NMR(na, t)[i].nkr_num_slots = NMR(hwna, r)[i].nkr_num_slots;
/* Error path: undo the vp krings created above. */
2600 netmap_vp_krings_delete(na);
/* nm_krings_delete callback for bwrap: tear down both the wrapped
 * hwna krings and the vp-side krings. */
2607 netmap_bwrap_krings_delete(struct netmap_adapter *na)
2609 struct netmap_bwrap_adapter *bna =
2610 (struct netmap_bwrap_adapter *)na;
2611 struct netmap_adapter *hwna = bna->hwna;
2615 hwna->nm_krings_delete(hwna);
2616 netmap_vp_krings_delete(na);
2620 /* notify method for the bridge-->hwna direction.
 * Simulates a user rxsync on the bwrap rx ring, then pushes the new
 * packets out through the corresponding hwna tx ring. */
2622 netmap_bwrap_notify(struct netmap_kring *kring, int flags)
2624 struct netmap_adapter *na = kring->na;
2625 struct netmap_bwrap_adapter *bna = na->na_private;
2626 struct netmap_adapter *hwna = bna->hwna;
2627 u_int ring_n = kring->ring_id;
2628 u_int lim = kring->nkr_num_slots - 1;
2629 struct netmap_kring *hw_kring;
2632 ND("%s: na %s hwna %s",
2633 (kring ? kring->name : "NULL!"),
2634 (na ? na->name : "NULL!"),
2635 (hwna ? hwna->name : "NULL!"));
2636 hw_kring = &hwna->tx_rings[ring_n];
2638 if (nm_kr_tryget(hw_kring, 0, NULL)) {
2642 /* first step: simulate a user wakeup on the rx ring */
2643 netmap_vp_rxsync(kring, flags);
2644 ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2646 kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2647 ring->head, ring->cur, ring->tail,
/* BUGFIX: was 'hw_ring->rtail' — 'hw_ring' is not declared anywhere
 * in this function; the parallel PST debug line below correctly uses
 * 'hw_kring->rtail'. */
2648 hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2649 /* second step: the new packets are sent on the tx ring
2650 * (which is actually the same ring)
2652 hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
2653 error = hw_kring->nm_sync(hw_kring, flags);
2657 /* third step: now we are back the rx ring */
2658 /* claim ownership on all hw owned bufs */
2659 kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */
2661 /* fourth step: the user goes to sleep again, causing another rxsync */
2662 netmap_vp_rxsync(kring, flags);
2663 ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2665 kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2666 ring->head, ring->cur, ring->tail,
2667 hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2669 nm_kr_put(hw_kring);
2671 return error ? error : NM_IRQ_COMPLETED;
2675 /* nm_bdg_ctl callback for the bwrap.
2676  * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
2677  * On attach, it needs to provide a fake netmap_priv_d structure and
2678  * perform a netmap_do_regif() on the bwrap. This will put both the
2679  * bwrap and the hwna in netmap mode, with the netmap rings shared
2680  * and cross linked. Moreover, it will start intercepting interrupts
2684 netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
2686 	struct netmap_priv_d *npriv;
2687 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	/* --- attach path (attach != 0, judging from the operations below;
	 * the branch structure itself is elided) --- */
	/* refuse if some other user already owns the adapter */
2691 	if (NETMAP_OWNED_BY_ANY(na)) {
	/* refuse a double attach: a kernel priv already exists */
2694 	if (bna->na_kpriv) {
	/* fake priv standing in for a userspace file descriptor */
2698 	npriv = netmap_priv_new();
2701 	npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
	/* register both NIC and host rings so they get cross-linked */
2702 	error = netmap_do_regif(npriv, na, 0, NR_REG_NIC_SW);
	/* on regif failure the fake priv is released again (error path
	 * partially elided here) */
2704 	netmap_priv_delete(npriv);
	/* success: remember the kernel priv and mark the bwrap busy so
	 * nobody else can NIOCREGIF it */
2707 	bna->na_kpriv = npriv;
2708 	na->na_flags |= NAF_BUSY;
	/* --- detach path: only allowed once all user fds are gone --- */
2710 	if (na->active_fds == 0) /* not registered */
	/* dropping the kernel priv unregisters the bwrap (and the hwna) */
2712 	netmap_priv_delete(bna->na_kpriv);
2713 	bna->na_kpriv = NULL;
2714 	na->na_flags &= ~NAF_BUSY;
2720 /* attach a bridge wrapper to the 'real' device.
 * Allocates a netmap_bwrap_adapter around hwna, with tx/rx ring counts
 * and descriptors swapped (the bwrap's rx side faces the hwna's tx side
 * and vice versa), optionally sets up a host-ring companion adapter,
 * and registers the result with the netmap core. On success the hwna is
 * marked NAF_BUSY; on failure the references taken on hwna are dropped. */
2722 netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
2724 	struct netmap_bwrap_adapter *bna;
2725 	struct netmap_adapter *na = NULL;
2726 	struct netmap_adapter *hostna = NULL;
2730 	/* make sure the NIC is not already in use */
2731 	if (NETMAP_OWNED_BY_ANY(hwna)) {
2732 	D("NIC %s busy, cannot attach to bridge", hwna->name);
2736 	bna = nm_os_malloc(sizeof(*bna));
2742 	/* make bwrap ifp point to the real ifp */
2743 	na->ifp = hwna->ifp;
2745 	na->na_private = bna;
	/* fix: was strncpy(), which does not guarantee NUL-termination;
	 * snprintf always terminates (matches the hostna naming below) */
2746 	snprintf(na->name, sizeof(na->name), "%s", nr_name);
2747 	/* fill the ring data for the bwrap adapter with rx/tx meanings
2748 	 * swapped. The real cross-linking will be done during register,
2749 	 * when all the krings will have been created.
2752 	enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2753 	nma_set_nrings(na, t, nma_get_nrings(hwna, r));
2754 	nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
2756 	na->nm_dtor = netmap_bwrap_dtor;
2757 	na->nm_register = netmap_bwrap_reg;
2758 	// na->nm_txsync = netmap_bwrap_txsync;
2759 	// na->nm_rxsync = netmap_bwrap_rxsync;
2760 	na->nm_config = netmap_bwrap_config;
2761 	na->nm_krings_create = netmap_bwrap_krings_create;
2762 	na->nm_krings_delete = netmap_bwrap_krings_delete;
2763 	na->nm_notify = netmap_bwrap_notify;
2764 	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
2765 	na->pdev = hwna->pdev;
	/* share the memory allocator with the wrapped adapter */
2766 	na->nm_mem = netmap_mem_get(hwna->nm_mem);
2767 	na->virt_hdr_len = hwna->virt_hdr_len;
2768 	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
	/* cross references: hwna --> bwrap (weak, no refcount on na) */
2771 	netmap_adapter_get(hwna);
2772 	hwna->na_private = bna; /* weak reference */
2773 	hwna->na_vp = &bna->up;
	/* if the NIC has host rings, build a companion adapter for them */
2775 	if (hwna->na_flags & NAF_HOST_RINGS) {
2776 	if (hwna->na_flags & NAF_SW_ONLY)
2777 	na->na_flags |= NAF_SW_ONLY;
2778 	na->na_flags |= NAF_HOST_RINGS;
2779 	hostna = &bna->host.up;
2780 	snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name);
2781 	hostna->ifp = hwna->ifp;
2783 	enum txrx r = nm_txrx_swap(t);
	/* one ring per direction for the host stack side */
2784 	nma_set_nrings(hostna, t, 1);
2785 	nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
2787 	// hostna->nm_txsync = netmap_bwrap_host_txsync;
2788 	// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
2789 	hostna->nm_notify = netmap_bwrap_notify;
2790 	hostna->nm_mem = netmap_mem_get(na->nm_mem);
2791 	hostna->na_private = bna;
2792 	hostna->na_vp = &bna->up;
2793 	na->na_hostvp = hwna->na_hostvp =
2794 	hostna->na_hostvp = &bna->host;
2795 	hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
	/* fix: was 'ifp->if_xname' — 'ifp' is not declared in this scope
	 * (inert only because ND() discards its arguments) */
2798 	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
2799 	na->name, hwna->name,
2800 	na->num_tx_rings, na->num_tx_desc,
2801 	na->num_rx_rings, na->num_rx_desc);
2803 	error = netmap_attach_common(na);
	/* success: the NIC is now owned by the bridge machinery */
2807 	hwna->na_flags |= NAF_BUSY;
	/* error path: undo the cross references and drop the hwna ref */
2811 	hwna->na_vp = hwna->na_hostvp = NULL;
2812 	netmap_adapter_put(hwna);
/* Allocate and initialize an array of n nm_bridge descriptors.
 * Returns the array, or (presumably, error path elided) NULL if the
 * allocation fails. The per-bridge lock init in the loop body is elided
 * from this view.
 */
2819 netmap_init_bridges2(u_int n)
2822 	struct nm_bridge *b;
	/* one contiguous allocation for all n bridges */
2824 	b = nm_os_malloc(sizeof(struct nm_bridge) * n);
2827 	for (i = 0; i < n; i++)
/* Release an array of n nm_bridge descriptors created by
 * netmap_init_bridges2(): destroy each bridge's rw lock, then
 * (presumably, elided here) free the array itself.
 */
2833 netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
2840 	for (i = 0; i < n; i++)
2841 	BDG_RWDESTROY(&b[i]);
/* Module-init hook for the VALE subsystem.
 * With CONFIG_NET_NS (Linux network namespaces) bridge arrays are
 * managed per-namespace by netmap_bns_register(); otherwise a single
 * global array of NM_BRIDGES bridges is allocated (the #else and the
 * error return are elided from this view).
 */
2846 netmap_init_bridges(void)
2848 #ifdef CONFIG_NET_NS
2849 	return netmap_bns_register();
2851 	nm_bridges = netmap_init_bridges2(NM_BRIDGES);
2852 	if (nm_bridges == NULL)
/* Module-teardown counterpart of netmap_init_bridges(): per-namespace
 * unregistration under CONFIG_NET_NS, otherwise release of the global
 * bridge array (#else elided from this view).
 */
2859 netmap_uninit_bridges(void)
2861 #ifdef CONFIG_NET_NS
2862 	netmap_bns_unregister();
2864 	netmap_uninit_bridges2(nm_bridges, NM_BRIDGES);
2867 #endif /* WITH_VALE */