2 * Copyright (C) 2013-2016 Universita` di Pisa
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * This module implements the VALE switch for netmap
33 NMG_LOCK() serializes all modifications to switches and ports.
34 A switch cannot be deleted until all ports are gone.
36 For each switch, an SX lock (RWlock on linux) protects
37 deletion of ports. When configuring or deleting a new port, the
38 lock is acquired in exclusive mode (after holding NMG_LOCK).
39 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
40 The lock is held throughout the entire forwarding cycle,
41 during which the thread may incur a page fault.
42 Hence it is important that sleepable shared locks are used.
44 On the rx ring, the per-port lock is grabbed initially to reserve
45 a number of slot in the ring, then the lock is released,
46 packets are copied from source to destination, and then
47 the lock is acquired again and the receive ring is updated.
48 (A similar thing is done on the tx ring for NIC and host stack
49 ports attached to the switch)
54 * OS-specific code that is used only within this file.
55 * Other OS-specific code that must be accessed by drivers
56 * is present in netmap_kern.h
59 #if defined(__FreeBSD__)
60 #include <sys/cdefs.h> /* prerequisite */
61 __FBSDID("$FreeBSD$");
63 #include <sys/types.h>
64 #include <sys/errno.h>
65 #include <sys/param.h> /* defines used in kernel.h */
66 #include <sys/kernel.h> /* types used in module initialization */
67 #include <sys/conf.h> /* cdevsw struct, UID, GID */
68 #include <sys/sockio.h>
69 #include <sys/socketvar.h> /* struct socket */
70 #include <sys/malloc.h>
72 #include <sys/rwlock.h>
73 #include <sys/socket.h> /* sockaddrs */
74 #include <sys/selinfo.h>
75 #include <sys/sysctl.h>
77 #include <net/if_var.h>
78 #include <net/bpf.h> /* BIOCIMMEDIATE */
79 #include <machine/bus.h> /* bus_dmamap_* */
80 #include <sys/endian.h>
81 #include <sys/refcount.h>
84 #define BDG_RWLOCK_T struct rwlock // struct rwlock
86 #define BDG_RWINIT(b) \
87 rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
88 #define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock)
89 #define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock)
90 #define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock)
91 #define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock)
92 #define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock)
93 #define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock)
100 #elif defined(__APPLE__)
102 #warning OSX support is only partial
103 #include "osx_glue.h"
105 #elif defined(_WIN32)
106 #include "win_glue.h"
110 #error Unsupported platform
112 #endif /* unsupported */
118 #include <net/netmap.h>
119 #include <dev/netmap/netmap_kern.h>
120 #include <dev/netmap/netmap_mem2.h>
125 * system parameters (most of them in netmap_kern.h)
126 * NM_BDG_NAME prefix for switch port names, default "vale"
127 * NM_BDG_MAXPORTS number of ports
128 * NM_BRIDGES max number of switches in the system.
129 * XXX should become a sysctl or tunable
131 * Switch ports are named valeX:Y where X is the switch name and Y
132 * is the port. If Y matches a physical interface name, the port is
133 * connected to a physical device.
135 * Unlike physical interfaces, switch ports use their own memory region
136 * for rings and buffers.
137 * The virtual interfaces use per-queue lock instead of core lock.
138 * In the tx loop, we aggregate traffic in batches to make all operations
139 * faster. The batch size is bridge_batch.
141 #define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */
142 #define NM_BDG_MAXSLOTS 4096 /* XXX same as above */
143 #define NM_BRIDGE_RINGSIZE 1024 /* in the device */
144 #define NM_BDG_HASH 1024 /* forwarding table entries */
145 #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */
146 #define NM_MULTISEG 64 /* max size of a chain of bufs */
147 /* actual size of the tables */
148 #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG)
149 /* NM_FT_NULL terminates a list of slots in the ft */
150 #define NM_FT_NULL NM_BDG_BATCH_MAX
154 * bridge_batch is set via sysctl to the max batch size to be
155 * used in the bridge. The actual value may be larger as the
156 * last packet in the block may overflow the size.
158 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
160 SYSCTL_DECL(_dev_netmap);
161 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , "");
164 static int netmap_vp_create(struct nmreq *, struct ifnet *,
165 struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
166 static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
167 static int netmap_bwrap_reg(struct netmap_adapter *, int onoff);
170 * For each output interface, nm_bdg_q is used to construct a list.
171 * bq_len is the number of output buffers (we can have coalescing
177 uint32_t bq_len; /* number of buffers */
180 /* XXX revise this */
182 uint64_t mac; /* the top 2 bytes are the epoch */
187 * nm_bridge is a descriptor for a VALE switch.
188 * Interfaces for a bridge are all in bdg_ports[].
189 * The array has fixed size, an empty entry does not terminate
190 * the search, but lookups only occur on attach/detach so we
191 * don't mind if they are slow.
193 * The bridge is non blocking on the transmit ports: excess
194 * packets are dropped if there is no room on the output port.
196 * bdg_lock protects accesses to the bdg_ports array.
197 * This is a rw lock (or equivalent).
200 /* XXX what is the proper alignment/layout ? */
201 BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */
203 uint32_t bdg_active_ports; /* 0 means free */
204 char bdg_basename[IFNAMSIZ];
206 /* Indexes of active ports (up to active_ports)
207 * and all other remaining ports.
209 uint8_t bdg_port_index[NM_BDG_MAXPORTS];
211 struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];
215 * The function to decide the destination port.
216 * It returns either of an index of the destination port,
217 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
218 * forward this packet. ring_nr is the source ring index, and the
219 * function may overwrite this value to forward this packet to a
220 * different ring index.
221 * This function must be set by netmap_bdg_ctl().
223 struct netmap_bdg_ops bdg_ops;
225 /* the forwarding table, MAC+ports.
226 * XXX should be changed to an argument to be passed to
227 * the lookup function, and allocated on attach
229 struct nm_hash_ent ht[NM_BDG_HASH];
233 #endif /* CONFIG_NET_NS */
/* Return the basename of the bridge that VALE port "vp" is attached to.
 * NOTE(review): this excerpt is missing interior lines (braces, and
 * presumably a NULL check on b) — code left byte-identical. */
237 netmap_bdg_name(struct netmap_vp_adapter *vp)
239 struct nm_bridge *b = vp->na_bdg;
242 return b->bdg_basename;
246 #ifndef CONFIG_NET_NS
248 * XXX in principle nm_bridges could be created dynamically
249 * Right now we have a static array and deletions are protected
250 * by an exclusive lock.
252 static struct nm_bridge *nm_bridges;
253 #endif /* !CONFIG_NET_NS */
257 * this is a slightly optimized copy routine which rounds
258 * to multiple of 64 bytes and is often faster than dealing
259 * with other odd sizes. We assume there is enough room
260 * in the source and destination buffers.
262 * XXX only for multiples of 64 bytes, non overlapped.
/* Copy "l" bytes from _src to _dst in 64-byte chunks; both buffers are
 * assumed large enough (rounded up to a multiple of 64).
 * NOTE(review): the loop body and the >=1024 fast path (presumably a
 * plain memcpy) are not visible in this excerpt. */
265 pkt_copy(void *_src, void *_dst, int l)
267 uint64_t *src = _src;
268 uint64_t *dst = _dst;
/* large copies: fall through to a bulk copy (body not shown here) */
269 if (unlikely(l >= 1024)) {
/* small copies: unrolled 64-bytes-at-a-time loop (body not shown) */
273 for (; likely(l > 0); l-=64) {
/* True if "c" is a character allowed in a VALE bridge/port identifier:
 * letters, digits, and (per the trailing "||", whose continuation is not
 * visible here) presumably '_' — TODO confirm against the full source. */
287 nm_is_id_char(const char c)
289 return (c >= 'a' && c <= 'z') ||
290 (c >= 'A' && c <= 'Z') ||
291 (c >= '0' && c <= '9') ||
295 /* Validate the name of a VALE bridge port and return the
296 * position of the ":" character. */
/* Returns a negative value (error paths not visible here) for: NULL name,
 * name shorter than the NM_BDG_NAME prefix, more than one ':', or any
 * character that is not a valid identifier character. */
298 nm_vale_name_validate(const char *name)
303 if (!name || strlen(name) < strlen(NM_BDG_NAME)) {
307 for (i = 0; name[i]; i++) {
308 if (name[i] == ':') {
/* a second ':' is an error — only one separator allowed */
309 if (colon_pos != -1) {
313 } else if (!nm_is_id_char(name[i])) {
326 * locate a bridge among the existing ones.
327 * MUST BE CALLED WITH NMG_LOCK()
329 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
330 * We assume that this is called with a name of at least NM_NAME chars.
/* If "create" is non-zero and the name is not found, the first free slot
 * (bdg_active_ports == 0) is initialized and returned; otherwise returns
 * the matching bridge (return statements not visible in this excerpt). */
332 static struct nm_bridge *
333 nm_find_bridge(const char *name, int create)
336 struct nm_bridge *b = NULL, *bridges;
341 netmap_bns_getbridges(&bridges, &num_bridges);
343 namelen = nm_vale_name_validate(name);
/* NOTE(review): passing NULL for a %s argument is undefined in printf-
 * style formatting — "(null)" string literal would be safer; confirm
 * what D() does with NULL on all supported platforms. */
345 D("invalid bridge name %s", name ? name : NULL);
349 /* lookup the name, remember empty slot if there is one */
350 for (i = 0; i < num_bridges; i++) {
351 struct nm_bridge *x = bridges + i;
353 if (x->bdg_active_ports == 0) {
354 if (create && b == NULL)
355 b = x; /* record empty slot */
356 } else if (x->bdg_namelen != namelen) {
358 } else if (strncmp(name, x->bdg_basename, namelen) == 0) {
359 ND("found '%.*s' at %d", namelen, name, i);
364 if (i == num_bridges && b) { /* name not found, can create entry */
365 /* initialize the bridge */
/* NOTE(review): strncpy does not NUL-terminate when the source fills
 * the buffer; readers must rely on bdg_namelen (as the lookup above
 * does) — verify no caller treats bdg_basename as a C string. */
366 strncpy(b->bdg_basename, name, namelen);
367 ND("create new bridge %s with ports %d", b->bdg_basename,
368 b->bdg_active_ports);
369 b->bdg_namelen = namelen;
370 b->bdg_active_ports = 0;
/* identity permutation: slot i initially holds port index i */
371 for (i = 0; i < NM_BDG_MAXPORTS; i++)
372 b->bdg_port_index[i] = i;
373 /* set the default function */
374 b->bdg_ops.lookup = netmap_bdg_learning;
375 /* reset the MAC address table */
376 bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
384 * Free the forwarding tables for rings attached to switch ports.
/* Walks the TX krings and releases each per-ring forwarding table
 * (allocated as one blob by nm_alloc_bdgfwd). Safe to call twice:
 * freed entries are NULLed. Locking requirements not visible here —
 * presumably NMG_LOCK, as for the other (de)config paths. */
387 nm_free_bdgfwd(struct netmap_adapter *na)
390 struct netmap_kring *kring;
393 nrings = na->num_tx_rings;
394 kring = na->tx_rings;
395 for (i = 0; i < nrings; i++) {
396 if (kring[i].nkr_ft) {
397 nm_os_free(kring[i].nkr_ft);
398 kring[i].nkr_ft = NULL; /* protect from freeing twice */
405 * Allocate the forwarding tables for the rings attached to the bridge ports.
/* For each TX ring, allocate one contiguous blob holding:
 *   - NM_BDG_BATCH_MAX nm_bdg_fwd entries (the batch),
 *   - one nm_bdg_q per destination queue (all port:rings + broadcast),
 *   - NM_BDG_BATCH_MAX uint16_t (layout implied by the size computation).
 * Error handling for a failed nm_os_malloc is not visible in this excerpt. */
408 nm_alloc_bdgfwd(struct netmap_adapter *na)
410 int nrings, l, i, num_dstq;
411 struct netmap_kring *kring;
414 /* all port:rings + broadcast */
415 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
416 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
417 l += sizeof(struct nm_bdg_q) * num_dstq;
418 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
420 nrings = netmap_real_rings(na, NR_TX);
421 kring = na->tx_rings;
422 for (i = 0; i < nrings; i++) {
423 struct nm_bdg_fwd *ft;
424 struct nm_bdg_q *dstq;
427 ft = nm_os_malloc(l);
/* destination queues live right after the batch entries */
432 dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
/* every queue starts empty (NM_FT_NULL terminates a slot list) */
433 for (j = 0; j < num_dstq; j++) {
434 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
437 kring[i].nkr_ft = ft;
443 /* remove from bridge b the ports in slots hw and sw
444 * (sw can be -1 if not needed)
/* Compacts the active-port index array outside the write lock, then
 * (in lines not visible here) copies it back under BDG_WLOCK().
 * If the bridge ends up empty, its ops are cleared so the slot becomes
 * reusable by nm_find_bridge(). */
447 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
449 int s_hw = hw, s_sw = sw;
450 int i, lim =b->bdg_active_ports;
451 uint8_t tmp[NM_BDG_MAXPORTS];
/* algorithm sketch (original pseudo-code): */
455 make a copy of bdg_port_index;
456 lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
457 in the array of bdg_port_index, replacing them with
458 entries from the bottom of the array;
459 decrement bdg_active_ports;
460 acquire BDG_WLOCK() and copy back the array.
464 D("detach %d and %d (lim %d)", hw, sw, lim);
465 /* make a copy of the list of active ports, update it,
466 * and then copy back within BDG_WLOCK().
468 memcpy(tmp, b->bdg_port_index, sizeof(tmp));
/* swap each found port with the last active entry; note i is only
 * advanced when no match occurs (increment not visible in excerpt) */
469 for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
470 if (hw >= 0 && tmp[i] == hw) {
471 ND("detach hw %d at %d", hw, i);
472 lim--; /* point to last active port */
473 tmp[i] = tmp[lim]; /* swap with i */
474 tmp[lim] = hw; /* now this is inactive */
476 } else if (sw >= 0 && tmp[i] == sw) {
477 ND("detach sw %d at %d", sw, i);
/* if either index survived the scan, the caller passed a port that
 * is not on this bridge — should not happen */
486 if (hw >= 0 || sw >= 0) {
487 D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
/* per-port destructor; a guard on bdg_ops.dtor being non-NULL is
 * presumably present in the elided lines — TODO confirm */
492 b->bdg_ops.dtor(b->bdg_ports[s_hw]);
493 b->bdg_ports[s_hw] = NULL;
495 b->bdg_ports[s_sw] = NULL;
497 memcpy(b->bdg_port_index, tmp, sizeof(tmp));
498 b->bdg_active_ports = lim;
501 ND("now %d active ports", lim);
503 ND("marking bridge %s as free", b->bdg_basename);
/* zeroed bdg_ops marks the bridge slot free for reuse */
504 bzero(&b->bdg_ops, sizeof(b->bdg_ops));
509 /* nm_bdg_ctl callback for VALE ports */
/* attach is a no-op for a VALE port (it is created already attached);
 * on detach: quiesce the rings, unlink the port from the bridge, then
 * re-enable rings and drop the reference taken at attach time. */
511 netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
513 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
514 struct nm_bridge *b = vpna->na_bdg;
516 (void)nmr; // XXX merge ?
518 return 0; /* nothing to do */
520 netmap_set_all_rings(na, 0 /* disable */);
521 netmap_bdg_detach_common(b, vpna->bdg_port, -1);
523 netmap_set_all_rings(na, 1 /* enable */);
525 /* I have took reference just for attach */
526 netmap_adapter_put(na);
530 /* nm_dtor callback for ephemeral VALE ports */
/* Final teardown when the last reference goes away: detach from the
 * bridge (the conditional guarding this is not visible here) and, for
 * autodelete ports backed by an ifnet, release the virtual interface. */
532 netmap_vp_dtor(struct netmap_adapter *na)
534 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
535 struct nm_bridge *b = vpna->na_bdg;
537 ND("%s has %d references", na->name, na->na_refcount);
540 netmap_bdg_detach_common(b, vpna->bdg_port, -1);
543 if (vpna->autodelete && na->ifp != NULL) {
544 ND("releasing %s", na->ifp->if_xname);
546 nm_os_vi_detach(na->ifp);
551 /* remove a persistent VALE port from the system */
/* Looks up the ifnet by name and destroys it only if it really is a
 * netmap VALE port, was created persistently (NETMAP_BDG_NEWIF), and is
 * not in use. Error returns and the ifunit_ref release on the failure
 * paths are in lines not visible in this excerpt. */
553 nm_vi_destroy(const char *name)
556 struct netmap_vp_adapter *vpna;
559 ifp = ifunit_ref(name);
563 /* make sure this is actually a VALE port */
564 if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
569 vpna = (struct netmap_vp_adapter *)NA(ifp);
571 /* we can only destroy ports that were created via NETMAP_BDG_NEWIF */
/* NOTE(review): given the comment above, this condition looks inverted
 * — autodelete set should mean "ephemeral, refuse to destroy"; the
 * elided body of this if probably returns an error. TODO confirm. */
572 if (vpna->autodelete) {
577 /* also make sure that nobody is using the interface */
578 if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
579 vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
586 D("destroying a persistent vale interface %s", ifp->if_xname);
587 /* Linux requires all the references are released
592 nm_os_vi_detach(ifp);
/* Copy the adapter's current ring/slot geometry into the request and
 * fill in memory-region size and id (nr_arg2) from the allocator. */
602 nm_update_info(struct nmreq *nmr, struct netmap_adapter *na)
604 nmr->nr_rx_rings = na->num_rx_rings;
605 nmr->nr_tx_rings = na->num_tx_rings;
606 nmr->nr_rx_slots = na->num_rx_desc;
607 nmr->nr_tx_slots = na->num_tx_desc;
608 return netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, NULL, &nmr->nr_arg2);
612 * Create a virtual interface registered to the system.
613 * The interface will be attached to a bridge later.
/* autodelete != 0 makes the port go away automatically when the last
 * reference is dropped (used by NETMAP_BDG_ATTACH-created ports);
 * persistent ports (autodelete == 0) must be removed via nm_vi_destroy().
 * Several error-handling branches are elided in this excerpt. */
616 netmap_vi_create(struct nmreq *nmr, int autodelete)
619 struct netmap_vp_adapter *vpna;
620 struct netmap_mem_d *nmd = NULL;
623 /* don't include VALE prefix */
624 if (!strncmp(nmr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
625 /* reject names carrying the "vale" prefix (error return elided) */
626 ifp = ifunit_ref(nmr->nr_name);
627 if (ifp) { /* already exist, cannot create new one */
/* if the existing ifp is already a netmap port, refresh the caller's
 * view of its parameters before failing */
630 if (NM_NA_VALID(ifp)) {
631 int update_err = nm_update_info(nmr, NA(ifp));
/* create the OS-level (persistent) virtual interface */
639 error = nm_os_vi_persist(nmr->nr_name, &ifp);
/* nr_arg2 selects the memory allocator to use for this port */
645 nmd = netmap_mem_find(nmr->nr_arg2);
651 /* netmap_vp_create creates a struct netmap_vp_adapter */
652 error = netmap_vp_create(nmr, ifp, nmd, &vpna);
654 D("error %d", error);
657 /* persist-specific routines */
658 vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
660 netmap_adapter_get(&vpna->up);
662 vpna->autodelete = 1;
664 NM_ATTACH_NA(ifp, &vpna->up);
665 /* return the updated info */
666 error = nm_update_info(nmr, &vpna->up);
670 D("returning nr_arg2 %d", nmr->nr_arg2);
674 D("created %s", ifp->if_xname);
/* error unwind: release the OS virtual interface */
683 nm_os_vi_detach(ifp);
688 /* Try to get a reference to a netmap adapter attached to a VALE switch.
689 * If the adapter is found (or is created), this function returns 0, a
690 * non NULL pointer is returned into *na, and the caller holds a
691 * reference to the adapter.
692 * If an adapter is not found, then no reference is grabbed and the
693 * function returns an error code, or 0 if there is just a VALE prefix
694 * mismatch. Therefore the caller holds a reference when
695 * (*na != NULL && return == 0).
/* MUST be called with NMG_LOCK held (implied by the writer-exclusion
 * argument below — TODO confirm). Numerous branches (error unwinds,
 * the "create" gate, final *na assignment) are elided in this excerpt. */
698 netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na,
699 struct netmap_mem_d *nmd, int create)
701 char *nr_name = nmr->nr_name;
703 struct ifnet *ifp = NULL;
705 struct netmap_vp_adapter *vpna, *hostna = NULL;
707 int i, j, cand = -1, cand2 = -1;
710 *na = NULL; /* default return value */
712 /* first try to see if this is a bridge port. */
714 if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) {
715 return 0; /* no error, but no VALE prefix */
718 b = nm_find_bridge(nr_name, create);
720 D("no bridges available for '%s'", nr_name);
721 return (create ? ENOMEM : ENXIO);
723 if (strlen(nr_name) < b->bdg_namelen) /* impossible */
726 /* Now we are sure that name starts with the bridge's name,
727 * lookup the port in the bridge. We need to scan the entire
728 * list. It is not important to hold a WLOCK on the bridge
729 * during the search because NMG_LOCK already guarantees
730 * that there are no other possible writers.
733 /* lookup in the local list of ports */
734 for (j = 0; j < b->bdg_active_ports; j++) {
735 i = b->bdg_port_index[j];
736 vpna = b->bdg_ports[i];
737 // KASSERT(na != NULL);
738 ND("checking %s", vpna->up.name);
739 if (!strcmp(vpna->up.name, nr_name)) {
/* found: take a reference for the caller */
740 netmap_adapter_get(&vpna->up);
741 ND("found existing if %s refs %d", nr_name)
746 /* not found, should we create it? */
749 /* yes we should, see if we have space to attach entries */
/* a NIC port may also bring its host-stack companion, hence 2 slots */
750 needed = 2; /* in some cases we only need 1 */
751 if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
752 D("bridge full %d, cannot create new port", b->bdg_active_ports);
755 /* record the next two ports available, but do not allocate yet */
756 cand = b->bdg_port_index[b->bdg_active_ports];
757 cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
758 ND("+++ bridge %s port %s used %d avail %d %d",
759 b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);
762 * try see if there is a matching NIC with this name
763 * (after the bridge's name)
765 ifname = nr_name + b->bdg_namelen + 1;
766 ifp = ifunit_ref(ifname);
768 /* Create an ephemeral virtual port
769 * This block contains all the ephemeral-specific logics
772 /* nr_cmd must be 0 for a virtual port */
777 /* bdg_netmap_attach creates a struct netmap_adapter */
778 error = netmap_vp_create(nmr, NULL, nmd, &vpna);
780 D("error %d", error);
783 /* shortcut - we can skip get_hw_na(),
784 * ownership check and nm_bdg_attach()
/* NIC path: wrap the hardware adapter so it can join the bridge */
787 struct netmap_adapter *hw;
789 error = netmap_get_hw_na(ifp, nmd, &hw);
790 if (error || hw == NULL)
793 /* host adapter might not be created */
794 error = hw->nm_bdg_attach(nr_name, hw);
798 hostna = hw->na_hostvp;
/* only bind the host stack port when explicitly requested */
799 if (nmr->nr_arg1 != NETMAP_BDG_HOST)
804 vpna->bdg_port = cand;
805 ND("NIC %p to bridge port %d", vpna, cand);
806 /* bind the port to the bridge (virtual ports are not active) */
807 b->bdg_ports[cand] = vpna;
809 b->bdg_active_ports++;
810 if (hostna != NULL) {
811 /* also bind the host stack to the bridge */
812 b->bdg_ports[cand2] = hostna;
813 hostna->bdg_port = cand2;
815 b->bdg_active_ports++;
816 ND("host %p to bridge port %d", hostna, cand2);
818 ND("if %s refs %d", ifname, vpna->up.na_refcount);
/* hand the caller its reference on the (newly created) adapter */
821 netmap_adapter_get(*na);
831 /* Process NETMAP_BDG_ATTACH */
/* Resolve (or create) the named bridge port, reject it if some other
 * user owns it, then let the port-specific nm_bdg_ctl hook finish the
 * attach (a no-op for pure VALE ports, netmap-mode entry for bwraps).
 * Lock acquisition/release and some error paths are elided here. */
833 nm_bdg_ctl_attach(struct nmreq *nmr)
835 struct netmap_adapter *na;
836 struct netmap_mem_d *nmd = NULL;
/* nr_arg2, when set, selects a specific memory allocator */
842 nmd = netmap_mem_find(nmr->nr_arg2);
849 error = netmap_get_bdg_na(nmr, &na, nmd, 1 /* create if not exists */);
850 if (error) /* no device */
853 if (na == NULL) { /* VALE prefix missing */
858 if (NETMAP_OWNED_BY_ANY(na)) {
863 if (na->nm_bdg_ctl) {
864 /* nop for VALE ports. The bwrap needs to put the hwna
865 * in netmap mode (see netmap_bwrap_bdg_ctl)
867 error = na->nm_bdg_ctl(na, nmr, 1);
870 ND("registered %s to netmap-mode", na->name);
/* error unwind: drop the reference from netmap_get_bdg_na() */
876 netmap_adapter_put(na);
/* True if "na" is a bwrap (a hardware NIC wrapped for VALE attachment),
 * identified by its nm_register hook. */
883 nm_is_bwrap(struct netmap_adapter *na)
885 return na->nm_register == netmap_bwrap_reg;
888 /* process NETMAP_BDG_DETACH */
/* Look up the named port (never creating it) and detach it from its
 * bridge via the port's nm_bdg_ctl hook; refuse if the port is a NIC
 * that still has kernel polling enabled. */
890 nm_bdg_ctl_detach(struct nmreq *nmr)
892 struct netmap_adapter *na;
896 error = netmap_get_bdg_na(nmr, &na, NULL, 0 /* don't create */);
897 if (error) { /* no device, or another bridge or user owns the device */
901 if (na == NULL) { /* VALE prefix missing */
904 } else if (nm_is_bwrap(na) &&
905 ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
906 /* Don't detach a NIC with polling */
908 netmap_adapter_put(na);
911 if (na->nm_bdg_ctl) {
912 /* remove the port from bridge. The bwrap
913 * also needs to put the hwna in normal mode
915 error = na->nm_bdg_ctl(na, nmr, 0);
/* drop the lookup reference in all cases */
918 netmap_adapter_put(na);
925 struct nm_bdg_polling_state;
931 struct nm_bdg_polling_state *bps;
934 struct nm_bdg_polling_state {
937 struct netmap_bwrap_adapter *bna;
943 struct nm_bdg_kthread *kthreads;
/* Worker body for a polling kthread: repeatedly notify the RX krings in
 * this thread's [qfirst, qlast) range on the wrapped hardware adapter,
 * emulating interrupts. Runs in kthread context (is_kthread flag). */
947 netmap_bwrap_polling(void *data, int is_kthread)
949 struct nm_bdg_kthread *nbk = data;
950 struct netmap_bwrap_adapter *bna;
951 u_int qfirst, qlast, i;
952 struct netmap_kring *kring0, *kring;
956 qfirst = nbk->qfirst;
959 kring0 = NMR(bna->hwna, NR_RX);
961 for (i = qfirst; i < qlast; i++) {
963 kring->nm_notify(kring, 0);
/* Allocate and configure one polling kthread context per requested CPU.
 * With ncpus == 1 and REG_ALL_NIC, the single thread covers all rings;
 * otherwise each thread polls exactly one ring, pinned to consecutive
 * CPUs starting at cpu_from. Returns 0 on success (returns elided). */
968 nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
970 struct nm_kctx_cfg kcfg;
973 bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus);
974 if (bps->kthreads == NULL)
977 bzero(&kcfg, sizeof(kcfg));
978 kcfg.worker_fn = netmap_bwrap_polling;
979 kcfg.use_kthread = 1;
980 for (i = 0; i < bps->ncpus; i++) {
981 struct nm_bdg_kthread *t = bps->kthreads + i;
982 int all = (bps->ncpus == 1 && bps->reg == NR_REG_ALL_NIC);
983 int affinity = bps->cpu_from + i;
986 t->qfirst = all ? bps->qfirst /* must be 0 */: affinity;
987 t->qlast = all ? bps->qlast : t->qfirst + 1;
988 D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
992 kcfg.worker_private = t;
993 t->nmk = nm_os_kctx_create(&kcfg, 0, NULL);
994 if (t->nmk == NULL) {
997 nm_os_kctx_worker_setaff(t->nmk, affinity);
/* error unwind: destroy the contexts created so far */
1002 for (j = 0; j < i; j++) {
/* XXX(review): BUG — loop variable is j but this indexes with i,
 * so the same (failed/uninitialized) slot is destroyed repeatedly
 * and slots 0..i-1 leak. Should be "bps->kthreads + j"; cannot
 * safely rewrite here because surrounding lines are elided. */
1003 struct nm_bdg_kthread *t = bps->kthreads + i;
1004 nm_os_kctx_destroy(t->nmk);
1006 nm_os_free(bps->kthreads);
1010 /* A variant of ptnetmap_start_kthreads() */
/* Start every configured polling kthread; on failure, stop the ones
 * already started and mark the state stopped again. */
1012 nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
1017 D("polling is not configured");
1020 bps->stopped = false;
1022 for (i = 0; i < bps->ncpus; i++) {
1023 struct nm_bdg_kthread *t = bps->kthreads + i;
1024 error = nm_os_kctx_worker_start(t->nmk);
1026 D("error in nm_kthread_start()");
/* error unwind: stop the workers started so far */
1033 for (j = 0; j < i; j++) {
/* XXX(review): BUG — same off-by-index as in
 * nm_bdg_create_kthreads(): should be "bps->kthreads + j"
 * (loop variable is j); left untouched because surrounding
 * lines are elided from this excerpt. */
1034 struct nm_bdg_kthread *t = bps->kthreads + i;
1035 nm_os_kctx_worker_stop(t->nmk);
1037 bps->stopped = true;
/* Stop and destroy all polling kthreads for this state and mark it
 * stopped. Early-return guard (state not configured) is elided here. */
1042 nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
1049 for (i = 0; i < bps->ncpus; i++) {
1050 struct nm_bdg_kthread *t = bps->kthreads + i;
1051 nm_os_kctx_worker_stop(t->nmk);
1052 nm_os_kctx_destroy(t->nmk);
1054 bps->stopped = true;
/* Validate the polling request in nmr (CPU count nr_arg1, ring spec in
 * nr_flags/nr_ringid) against the adapter's RX ring count and the number
 * of online CPUs, and fill bps with qfirst/qlast/cpu_from/ncpus.
 * Error returns (and assignments to qfirst/core_from on some paths) are
 * elided in this excerpt. */
1058 get_polling_cfg(struct nmreq *nmr, struct netmap_adapter *na,
1059 struct nm_bdg_polling_state *bps)
1061 int req_cpus, avail_cpus, core_from;
1062 u_int reg, i, qfirst, qlast;
1064 avail_cpus = nm_os_ncpus();
1065 req_cpus = nmr->nr_arg1;
1067 if (req_cpus == 0) {
1068 D("req_cpus must be > 0");
/* leave at least one CPU to the rest of the system */
1070 } else if (req_cpus >= avail_cpus) {
1071 D("for safety, we need at least one core left in the system");
1074 reg = nmr->nr_flags & NR_REG_MASK;
1075 i = nmr->nr_ringid & NETMAP_RING_MASK;
1077 * ONE_NIC: dedicate one core to one ring. If multiple cores
1078 * are specified, consecutive rings are also polled.
1079 * For example, if ringid=2 and 2 cores are given,
1080 * ring 2 and 3 are polled by core 2 and 3, respectively.
1081 * ALL_NIC: poll all the rings using a core specified by ringid.
1082 * the number of cores must be 1.
1084 if (reg == NR_REG_ONE_NIC) {
1085 if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
1086 D("only %d rings exist (ring %u-%u is given)",
1087 nma_get_nrings(na, NR_RX), i, i+req_cpus);
1091 qlast = qfirst + req_cpus;
1093 } else if (reg == NR_REG_ALL_NIC) {
1094 if (req_cpus != 1) {
1095 D("ncpus must be 1 not %d for REG_ALL_NIC", req_cpus);
1099 qlast = nma_get_nrings(na, NR_RX);
1102 D("reg must be ALL_NIC or ONE_NIC");
1107 bps->qfirst = qfirst;
1109 bps->cpu_from = core_from;
1110 bps->ncpus = req_cpus;
1111 D("%s qfirst %u qlast %u cpu_from %u ncpus %u",
1112 reg == NR_REG_ALL_NIC ? "REG_ALL_NIC" : "REG_ONE_NIC",
1113 qfirst, qlast, core_from, req_cpus);
/* Handle NETMAP_BDG_POLLING_ON for a bwrap adapter: allocate and
 * configure polling state, create the kthreads, disable the NIC
 * interrupt if supported, and start the workers; unwind everything on
 * failure (some cleanup/return lines are elided in this excerpt). */
1118 nm_bdg_ctl_polling_start(struct nmreq *nmr, struct netmap_adapter *na)
1120 struct nm_bdg_polling_state *bps;
1121 struct netmap_bwrap_adapter *bna;
1124 bna = (struct netmap_bwrap_adapter *)na;
1125 if (bna->na_polling_state) {
1126 D("ERROR adapter already in polling mode");
1130 bps = nm_os_malloc(sizeof(*bps));
1133 bps->configured = false;
1134 bps->stopped = true;
1136 if (get_polling_cfg(nmr, na, bps)) {
1141 if (nm_bdg_create_kthreads(bps)) {
1146 bps->configured = true;
1147 bna->na_polling_state = bps;
1150 /* disable interrupt if possible */
1151 if (bna->hwna->nm_intr)
1152 bna->hwna->nm_intr(bna->hwna, 0);
1153 /* start kthread now */
1154 error = nm_bdg_polling_start_kthreads(bps);
1156 D("ERROR nm_bdg_polling_start_kthread()");
/* unwind: free kthread contexts, detach state, re-enable interrupts */
1157 nm_os_free(bps->kthreads);
1159 bna->na_polling_state = NULL;
1160 if (bna->hwna->nm_intr)
1161 bna->hwna->nm_intr(bna->hwna, 1);
/* Handle NETMAP_BDG_POLLING_OFF: stop and destroy the polling kthreads,
 * release the polling state, and re-enable the NIC interrupt.
 * (The nm_os_free of bps itself is presumably in an elided line.) */
1167 nm_bdg_ctl_polling_stop(struct nmreq *nmr, struct netmap_adapter *na)
1169 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na;
1170 struct nm_bdg_polling_state *bps;
1172 if (!bna->na_polling_state) {
1173 D("ERROR adapter is not in polling mode");
1176 bps = bna->na_polling_state;
1177 nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state);
1178 bps->configured = false;
1180 bna->na_polling_state = NULL;
1181 /* reenable interrupt */
1182 if (bna->hwna->nm_intr)
1183 bna->hwna->nm_intr(bna->hwna, 1);
1187 /* Called by either user's context (netmap_ioctl())
1188 * or external kernel modules (e.g., Openvswitch).
1189 * Operation is indicated in nmr->nr_cmd.
1190 * NETMAP_BDG_OPS that sets configure/lookup/dtor functions to the bridge
1191 * requires bdg_ops argument; the other commands ignore this argument.
1193 * Called without NMG_LOCK.
/* Top-level dispatcher for all bridge-control commands. NMG_LOCK is
 * taken/released around the lookups inside the cases (lock lines are
 * elided in this excerpt, as are the break statements and the switch
 * statement line itself). Returns 0 or an errno value. */
1196 netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
1198 struct nm_bridge *b, *bridges;
1199 struct netmap_adapter *na;
1200 struct netmap_vp_adapter *vpna;
1201 char *name = nmr->nr_name;
1202 int cmd = nmr->nr_cmd, namelen = strlen(name);
1203 int error = 0, i, j;
1206 netmap_bns_getbridges(&bridges, &num_bridges);
1209 case NETMAP_BDG_NEWIF:
1210 error = netmap_vi_create(nmr, 0 /* no autodelete */);
1213 case NETMAP_BDG_DELIF:
1214 error = nm_vi_destroy(nmr->nr_name);
1217 case NETMAP_BDG_ATTACH:
1218 error = nm_bdg_ctl_attach(nmr);
1221 case NETMAP_BDG_DETACH:
1222 error = nm_bdg_ctl_detach(nmr);
1225 case NETMAP_BDG_LIST:
1226 /* this is used to enumerate bridges and ports */
1227 if (namelen) { /* look up indexes of bridge and port */
1228 if (strncmp(name, NM_BDG_NAME, strlen(NM_BDG_NAME))) {
1233 b = nm_find_bridge(name, 0 /* don't create */);
1241 nmr->nr_arg1 = b - bridges; /* bridge index */
1242 nmr->nr_arg2 = NM_BDG_NOPORT;
1243 for (j = 0; j < b->bdg_active_ports; j++) {
1244 i = b->bdg_port_index[j];
1245 vpna = b->bdg_ports[i];
/* an active index pointing at a NULL port is a corrupted table */
1247 D("---AAAAAAAAARGH-------");
1250 /* the former and the latter identify a
1251 * virtual port and a NIC, respectively
1253 if (!strcmp(vpna->up.name, name)) {
1254 nmr->nr_arg2 = i; /* port index */
1260 /* return the first non-empty entry starting from
1261 * bridge nr_arg1 and port nr_arg2.
1263 * Users can detect the end of the same bridge by
1264 * seeing the new and old value of nr_arg1, and can
1265 * detect the end of all the bridge by error != 0
1271 for (error = ENOENT; i < NM_BRIDGES; i++) {
1273 for ( ; j < NM_BDG_MAXPORTS; j++) {
1274 if (b->bdg_ports[j] == NULL)
1276 vpna = b->bdg_ports[j];
/* NOTE(review): strncpy may leave name without a NUL when the
 * source is exactly IFNAMSIZ long — confirm up.name is always
 * shorter than IFNAMSIZ. */
1277 strncpy(name, vpna->up.name, (size_t)IFNAMSIZ);
1281 j = 0; /* following bridges scan from 0 */
1290 case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */
1291 /* register callbacks to the given bridge.
1292 * nmr->nr_name may be just bridge's name (including ':'
1293 * if it is not just NM_NAME).
1300 b = nm_find_bridge(name, 0 /* don't create */);
1304 b->bdg_ops = *bdg_ops;
1309 case NETMAP_BDG_VNET_HDR:
1310 /* Valid lengths for the virtio-net header are 0 (no header),
/* ... sizeof(struct nm_vnet_hdr) (10), or 12 (with mergeable bufs) */
1312 if (nmr->nr_arg1 != 0 &&
1313 nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
1314 nmr->nr_arg1 != 12) {
1319 error = netmap_get_bdg_na(nmr, &na, NULL, 0);
1321 vpna = (struct netmap_vp_adapter *)na;
1322 na->virt_hdr_len = nmr->nr_arg1;
1323 if (na->virt_hdr_len) {
/* with a vnet header, frames may span buffers: raise max frame size */
1324 vpna->mfs = NETMAP_BUF_SIZE(na);
1326 D("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
1327 netmap_adapter_put(na);
1334 case NETMAP_BDG_POLLING_ON:
1335 case NETMAP_BDG_POLLING_OFF:
1337 error = netmap_get_bdg_na(nmr, &na, NULL, 0);
1339 if (!nm_is_bwrap(na)) {
1341 } else if (cmd == NETMAP_BDG_POLLING_ON) {
/* keep an extra reference while polling is active */
1342 netmap_adapter_get(na);
1344 error = nm_bdg_ctl_polling_start(nmr, na);
1346 error = nm_bdg_ctl_polling_stop(nmr, na);
1348 netmap_adapter_put(na);
1350 netmap_adapter_put(na);
1356 D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
/* Forward a configuration request to the bridge's registered config()
 * callback, if any. Lock handling lines are elided from this excerpt;
 * per the comment below, config() runs without NMG_LOCK held. */
1364 netmap_bdg_config(struct nmreq *nmr)
1366 struct nm_bridge *b;
1370 b = nm_find_bridge(nmr->nr_name, 0);
1376 /* Don't call config() with NMG_LOCK() held */
1378 if (b->bdg_ops.config != NULL)
1379 error = b->bdg_ops.config((struct nm_ifreq *)nmr);
1385 /* nm_krings_create callback for VALE ports.
1386 * Calls the standard netmap_krings_create, then adds leases on rx
1387 * rings and bdgfwd on tx rings.
1390 netmap_vp_krings_create(struct netmap_adapter *na)
1395 u_int nrx = netmap_real_rings(na, NR_RX);
1398 * Leases are attached to RX rings on vale ports
/* one uint32_t lease per RX descriptor, carved from the krings tailroom */
1400 tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
1402 error = netmap_krings_create(na, tailroom);
1406 leases = na->tailroom;
1408 for (i = 0; i < nrx; i++) { /* Receive rings */
1409 na->rx_rings[i].nkr_leases = leases;
1410 leases += na->num_rx_desc;
1413 error = nm_alloc_bdgfwd(na);
/* if forwarding-table allocation fails, undo the krings */
1415 netmap_krings_delete(na);
1423 /* nm_krings_delete callback for VALE ports. */
/* Mirror of netmap_vp_krings_create(): the nm_free_bdgfwd() call that
 * pairs with nm_alloc_bdgfwd() is presumably in an elided line. */
1425 netmap_vp_krings_delete(struct netmap_adapter *na)
1428 netmap_krings_delete(na);
1433 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
1434 struct netmap_vp_adapter *na, u_int ring_nr);
1438 * main dispatch routine for the bridge.
1439 * Grab packets from a kring, move them into the ft structure
1440 * associated to the tx (input) port. Max one instance per port,
1441 * filtered on input (ioctl, poll or XXX).
1442 * Returns the next position in the ring.
/* Collects slots [nr_hwcur, end) into the ring's forwarding table,
 * flushing via nm_bdg_flush() whenever bridge_batch entries accumulate
 * (respecting multi-fragment chains) and once more at the end. The
 * BDG_RUNLOCK release and the return statement are elided here. */
1445 nm_bdg_preflush(struct netmap_kring *kring, u_int end)
1447 struct netmap_vp_adapter *na =
1448 (struct netmap_vp_adapter*)kring->na;
1449 struct netmap_ring *ring = kring->ring;
1450 struct nm_bdg_fwd *ft;
1451 u_int ring_nr = kring->ring_id;
1452 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
1453 u_int ft_i = 0; /* start from 0 */
1454 u_int frags = 1; /* how many frags ? */
1455 struct nm_bridge *b = na->na_bdg;
1457 /* To protect against modifications to the bridge we acquire a
1458 * shared lock, waiting if we can sleep (if the source port is
1459 * attached to a user process) or with a trylock otherwise (NICs).
1461 ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
/* BDG_RLOCK(b) call is in an elided line after this test */
1462 if (na->up.na_flags & NAF_BDG_MAYSLEEP)
/* trylock failure path (return without forwarding) is elided */
1464 else if (!BDG_RTRYLOCK(b))
1466 ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
1469 for (; likely(j != end); j = nm_next(j, lim)) {
1470 struct netmap_slot *slot = &ring->slot[j];
1473 ft[ft_i].ft_len = slot->len;
1474 ft[ft_i].ft_flags = slot->flags;
1476 ND("flags is 0x%x", slot->flags);
1477 /* we do not use the buf changed flag, but we still need to reset it */
1478 slot->flags &= ~NS_BUF_CHANGED;
1480 /* this slot goes into a list so initialize the link field */
1481 ft[ft_i].ft_next = NM_FT_NULL;
/* NS_INDIRECT slots carry a user pointer to the payload */
1482 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
1483 (void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
1484 if (unlikely(buf == NULL)) {
1485 RD(5, "NULL %s buffer pointer from %s slot %d len %d",
1486 (slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
1487 kring->name, j, ft[ft_i].ft_len);
/* substitute a valid zero-length buffer so the batch stays sane */
1488 buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
1489 ft[ft_i].ft_len = 0;
1490 ft[ft_i].ft_flags = 0;
1492 __builtin_prefetch(buf);
/* NS_MOREFRAG: this slot continues in the next one */
1494 if (slot->flags & NS_MOREFRAG) {
1498 if (unlikely(netmap_verbose && frags > 1))
1499 RD(5, "%d frags at %d", frags, ft_i - frags);
/* record the chain length in its first fragment */
1500 ft[ft_i - frags].ft_frags = frags;
/* flush when the batch limit is reached */
1502 if (unlikely((int)ft_i >= bridge_batch))
1503 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1506 /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
1507 * have to fix frags count. */
1509 ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
1510 ft[ft_i - frags].ft_frags = frags;
1511 D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
/* final flush of whatever remains in the batch */
1514 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1520 /* ----- FreeBSD if_bridge hash function ------- */
1523 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
1524 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
1526 * http://www.burtleburtle.net/bob/hash/spooky.html
/*
 * Bob Jenkins' 96-bit mixing step: reversibly mixes a, b and c so that
 * every input bit affects the output. Used by nm_bridge_rthash() below.
 * (Macro continuation lines left untouched.)
 */
#define mix(a, b, c) \
1530 a -= b; a -= c; a ^= (c >> 13); \
1531 b -= c; b -= a; b ^= (a << 8); \
1532 c -= a; c -= b; c ^= (b >> 13); \
1533 a -= b; a -= c; a ^= (c >> 12); \
1534 b -= c; b -= a; b ^= (a << 16); \
1535 c -= a; c -= b; c ^= (b >> 5); \
1536 a -= b; a -= c; a ^= (c >> 3); \
1537 b -= c; b -= a; b ^= (a << 10); \
1538 c -= a; c -= b; c ^= (b >> 15); \
1539 } while (/*CONSTCOND*/0)
/*
 * Hash a 6-byte MAC address into a forwarding-table bucket
 * (numbered listing excerpt; the middle of the function is missing).
 * Returns the Jenkins hash of the address masked to NM_BDG_HASH-1.
 */
static __inline uint32_t
1543 nm_bridge_rthash(const uint8_t *addr)
1545 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
1555 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1)
1556 return (c & BRIDGE_RTHASH_MASK);
1562 /* nm_register callback for VALE ports */
/*
 * nm_register callback for VALE ports (listing excerpt; some lines
 * missing). Turns pending kring modes into effective ones, under the
 * bridge write lock when the port is attached to a bridge (persistent
 * ports may be registered before attachment, hence the lock is
 * presumably conditional — confirm against full source).
 */
1564 netmap_vp_reg(struct netmap_adapter *na, int onoff)
1566 struct netmap_vp_adapter *vpna =
1567 (struct netmap_vp_adapter*)na;
1571 /* persistent ports may be put in netmap mode
1572 * before being attached to a bridge
1575 BDG_WLOCK(vpna->na_bdg);
/* on: commit every kring with a pending 'on' request */
1578 for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
1579 struct netmap_kring *kring = &NMR(na, t)[i];
1581 if (nm_kring_pending_on(kring))
1582 kring->nr_mode = NKR_NETMAP_ON;
1585 if (na->active_fds == 0)
1586 na->na_flags |= NAF_NETMAP_ON;
1587 /* XXX on FreeBSD, persistent VALE ports should also
1588 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
/* off: clear the adapter flag and commit pending 'off' requests */
1591 if (na->active_fds == 0)
1592 na->na_flags &= ~NAF_NETMAP_ON;
1594 for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
1595 struct netmap_kring *kring = &NMR(na, t)[i];
1597 if (nm_kring_pending_off(kring))
1598 kring->nr_mode = NKR_NETMAP_OFF;
1603 BDG_WUNLOCK(vpna->na_bdg);
1609 * Lookup function for a learning bridge.
1610 * Update the hash table with the source address,
1611 * and then returns the destination port index, and the
1612 * ring in *dst_ring (at the moment, always use ring 0)
/*
 * Lookup function for a learning bridge (listing excerpt; some lines
 * missing). Learns the source MAC into the hash table, then returns
 * the destination port index (NM_BDG_BROADCAST for unknown/multicast,
 * NM_BDG_NOPORT to drop). *dst_ring is currently always left at 0
 * per the original comment.
 */
1615 netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
1616 struct netmap_vp_adapter *na)
1618 uint8_t *buf = ft->ft_buf;
1619 u_int buf_len = ft->ft_len;
1620 struct nm_hash_ent *ht = na->na_bdg->ht;
1622 u_int dst, mysrc = na->bdg_port;
1623 uint64_t smac, dmac;
1626 /* safety check, unfortunately we have many cases */
1627 if (buf_len >= 14 + na->up.virt_hdr_len) {
1628 /* virthdr + mac_hdr in the same slot */
1629 buf += na->up.virt_hdr_len;
1630 buf_len -= na->up.virt_hdr_len;
1631 } else if (buf_len == na->up.virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
1632 /* only header in first fragment */
1635 buf_len = ft->ft_len;
1637 RD(5, "invalid buf format, length %d", buf_len);
1638 return NM_BDG_NOPORT;
/* indirect buffers come from userspace: copy them in before parsing */
1641 if (ft->ft_flags & NS_INDIRECT) {
1642 if (copyin(buf, indbuf, sizeof(indbuf))) {
1643 return NM_BDG_NOPORT;
/* extract dst (bytes 0-5) and src MAC from the ethernet header;
 * smac presumably gets shifted on a line missing from this extract */
1648 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
1649 smac = le64toh(*(uint64_t *)(buf + 4));
1653 * The hash is somewhat expensive, there might be some
1654 * worthwhile optimizations here.
1656 if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
1658 sh = nm_bridge_rthash(s); // XXX hash of source
1659 /* update source port forwarding entry */
1660 na->last_smac = ht[sh].mac = smac; /* XXX expire ? */
1661 ht[sh].ports = mysrc;
1663 D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
1664 s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
/* default to broadcast; refine to a learned port for unicast dst */
1666 dst = NM_BDG_BROADCAST;
1667 if ((buf[0] & 1) == 0) { /* unicast */
1668 dh = nm_bridge_rthash(buf); // XXX hash of dst
1669 if (ht[dh].mac == dmac) { /* found dst */
1672 /* XXX otherwise return NM_BDG_UNKNOWN ? */
1679 * Available space in the ring. Only used in VALE code
1680 * and only with is_rx = 1
/*
 * Available space in the ring (listing excerpt; some lines missing).
 * Only used by VALE with is_rx = 1: space is what lies between the
 * current lease pointer and nr_hwcur, modulo ring size, minus one.
 * The non-rx branch is marked unused in the original.
 */
static inline uint32_t
1683 nm_kr_space(struct netmap_kring *k, int is_rx)
/* busy = slots already leased but not yet released by consumers */
1688 int busy = k->nkr_hwlease - k->nr_hwcur;
1690 busy += k->nkr_num_slots;
1691 space = k->nkr_num_slots - 1 - busy;
1693 /* XXX never used in this branch */
1694 space = k->nr_hwtail - k->nkr_hwlease;
1696 space += k->nkr_num_slots;
/* sanity check of the kring indexes, debug aid only */
1700 if (k->nkr_hwlease >= k->nkr_num_slots ||
1701 k->nr_hwcur >= k->nkr_num_slots ||
1702 k->nr_tail >= k->nkr_num_slots ||
1704 busy >= k->nkr_num_slots) {
1705 D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1706 k->nkr_lease_idx, k->nkr_num_slots);
1715 /* make a lease on the kring for N positions. return the
1717 * XXX only used in VALE code and with is_rx = 1
/*
 * Make a lease of n slots on the kring (listing excerpt; some lines
 * missing). Marks the new lease entry NR_NOSLOT (pending), advances
 * the lease index and hwlease pointer by n (mod ring size), and —
 * per the original comment — returns the lease index to be used
 * later to report completion.
 */
static inline uint32_t
1720 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
1722 uint32_t lim = k->nkr_num_slots - 1;
1723 uint32_t lease_idx = k->nkr_lease_idx;
/* NR_NOSLOT marks this lease as not yet completed */
1725 k->nkr_leases[lease_idx] = NR_NOSLOT;
1726 k->nkr_lease_idx = nm_next(lease_idx, lim);
1728 if (n > nm_kr_space(k, is_rx)) {
1729 D("invalid request for %d slots", n);
1732 /* XXX verify that there are n slots */
1733 k->nkr_hwlease += n;
1734 if (k->nkr_hwlease > lim)
1735 k->nkr_hwlease -= lim + 1;
/* sanity check of the kring indexes, debug aid only */
1737 if (k->nkr_hwlease >= k->nkr_num_slots ||
1738 k->nr_hwcur >= k->nkr_num_slots ||
1739 k->nr_hwtail >= k->nkr_num_slots ||
1740 k->nkr_lease_idx >= k->nkr_num_slots) {
1741 D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
1743 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1744 k->nkr_lease_idx, k->nkr_num_slots);
1751 * This flush routine supports only unicast and broadcast but a large
1752 * number of ports, and lets us replace the learn and dispatch functions.
/*
 * Flush routine for the VALE switch (numbered listing excerpt; many
 * original lines are missing from this extract).
 * Two passes over the ft[] batch:
 *   pass 1 - classify each packet via the bridge lookup function and
 *            queue it on a per-(port,ring) list in the scratch pad;
 *   pass 2 - for each destination, lease slots on the RX kring, copy
 *            the packets in (or run the slower virtio-header mismatch
 *            datapath), then complete the lease and notify.
 * The lease protocol allows multiple concurrent senders into the same
 * RX ring; rewriting this code is deliberately avoided here.
 */
1755 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
1758 struct nm_bdg_q *dst_ents, *brddst;
1759 uint16_t num_dsts = 0, *dsts;
1760 struct nm_bridge *b = na->na_bdg;
1761 u_int i, me = na->bdg_port;
1764 * The work area (pointed by ft) is followed by an array of
1765 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS
1766 * queues per port plus one for the broadcast traffic.
1767 * Then we have an array of destination indexes.
1769 dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
1770 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
1772 /* first pass: find a destination for each packet in the batch */
1773 for (i = 0; likely(i < n); i += ft[i].ft_frags) {
1774 uint8_t dst_ring = ring_nr; /* default, same ring as origin */
1775 uint16_t dst_port, d_i;
1778 ND("slot %d frags %d", i, ft[i].ft_frags);
1779 /* Drop the packet if the virtio-net header is not into the first
1780 fragment nor at the very beginning of the second. */
1781 if (unlikely(na->up.virt_hdr_len > ft[i].ft_len))
1783 dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na);
1784 if (netmap_verbose > 255)
1785 RD(5, "slot %d port %d -> %d", i, me, dst_port);
1786 if (dst_port == NM_BDG_NOPORT)
1787 continue; /* this packet is identified to be dropped */
1788 else if (unlikely(dst_port > NM_BDG_MAXPORTS))
1790 else if (dst_port == NM_BDG_BROADCAST)
1791 dst_ring = 0; /* broadcasts always go to ring 0 */
1792 else if (unlikely(dst_port == me ||
1793 !b->bdg_ports[dst_port]))
1796 /* get a position in the scratch pad */
1797 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
1800 /* append the first fragment to the list */
1801 if (d->bq_head == NM_FT_NULL) { /* new destination */
1802 d->bq_head = d->bq_tail = i;
1803 /* remember this position to be scanned later */
1804 if (dst_port != NM_BDG_BROADCAST)
1805 dsts[num_dsts++] = d_i;
1807 ft[d->bq_tail].ft_next = i;
1810 d->bq_len += ft[i].ft_frags;
1814 * Broadcast traffic goes to ring 0 on all destinations.
1815 * So we need to add these rings to the list of ports to scan.
1816 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
1817 * expensive. We should keep a compact list of active destinations
1818 * so we could shorten this loop.
1820 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
1821 if (brddst->bq_head != NM_FT_NULL) {
1823 for (j = 0; likely(j < b->bdg_active_ports); j++) {
1825 i = b->bdg_port_index[j];
1826 if (unlikely(i == me))
1828 d_i = i * NM_BDG_MAXRINGS;
1829 if (dst_ents[d_i].bq_head == NM_FT_NULL)
1830 dsts[num_dsts++] = d_i;
1834 ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
1835 /* second pass: scan destinations */
1836 for (i = 0; i < num_dsts; i++) {
1837 struct netmap_vp_adapter *dst_na;
1838 struct netmap_kring *kring;
1839 struct netmap_ring *ring;
1840 u_int dst_nr, lim, j, d_i, next, brd_next;
1841 u_int needed, howmany;
1842 int retry = netmap_txsync_retry;
1844 uint32_t my_start = 0, lease_idx = 0;
1846 int virt_hdr_mismatch = 0;
1849 ND("second pass %d port %d", i, d_i);
1851 // XXX fix the division
1852 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
1853 /* protect from the lookup function returning an inactive
1856 if (unlikely(dst_na == NULL))
1858 if (dst_na->up.na_flags & NAF_SW_ONLY)
1861 * The interface may be in !netmap mode in two cases:
1862 * - when na is attached but not activated yet;
1863 * - when na is being deactivated but is still attached.
1865 if (unlikely(!nm_netmap_on(&dst_na->up))) {
1866 ND("not in netmap mode!");
1870 /* there is at least one either unicast or broadcast packet */
1871 brd_next = brddst->bq_head;
1873 /* we need to reserve this many slots. If fewer are
1874 * available, some packets will be dropped.
1875 * Packets may have multiple fragments, so
1876 * there is a chance that we may not use all of the slots
1877 * we have claimed, so we will need to handle the leftover
1878 * ones when we regain the lock.
1880 needed = d->bq_len + brddst->bq_len;
1882 if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
1883 if (netmap_verbose) {
1884 RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
1885 dst_na->up.virt_hdr_len);
1887 /* There is a virtio-net header/offloadings mismatch between
1888 * source and destination. The slower mismatch datapath will
1889 * be used to cope with all the mismatches.
1891 virt_hdr_mismatch = 1;
1892 if (dst_na->mfs < na->mfs) {
1893 /* We may need to do segmentation offloadings, and so
1894 * we may need a number of destination slots greater
1895 * than the number of input slots ('needed').
1896 * We look for the smallest integer 'x' which satisfies:
1897 * needed * na->mfs + x * H <= x * na->mfs
1898 * where 'H' is the length of the longest header that may
1899 * be replicated in the segmentation process (e.g. for
1900 * TCPv4 we must account for ethernet header, IP header
1901 * and TCPv4 header).
1903 needed = (needed * na->mfs) /
1904 (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
1905 ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
1909 ND(5, "pass 2 dst %d is %x %s",
1910 i, d_i, is_vp ? "virtual" : "nic/host");
/* map the destination index onto an actual RX ring of dst_na */
1911 dst_nr = d_i & (NM_BDG_MAXRINGS-1);
1912 nrings = dst_na->up.num_rx_rings;
1913 if (dst_nr >= nrings)
1914 dst_nr = dst_nr % nrings;
1915 kring = &dst_na->up.rx_rings[dst_nr];
1917 lim = kring->nkr_num_slots - 1;
1921 if (dst_na->retry && retry) {
1922 /* try to get some free slot from the previous run */
1923 kring->nm_notify(kring, 0);
1924 /* actually useful only for bwraps, since there
1925 * the notify will trigger a txsync on the hwna. VALE ports
1926 * have dst_na->retry == 0
1929 /* reserve the buffers in the queue and an entry
1930 * to report completion, and drop lock.
1931 * XXX this might become a helper function.
1933 mtx_lock(&kring->q_lock);
1934 if (kring->nkr_stopped) {
1935 mtx_unlock(&kring->q_lock);
1938 my_start = j = kring->nkr_hwlease;
1939 howmany = nm_kr_space(kring, 1);
1940 if (needed < howmany)
1942 lease_idx = nm_kr_lease(kring, howmany, 1);
1943 mtx_unlock(&kring->q_lock);
1945 /* only retry if we need more than available slots */
1946 if (retry && needed <= howmany)
1949 /* copy to the destination queue */
1950 while (howmany > 0) {
1951 struct netmap_slot *slot;
1952 struct nm_bdg_fwd *ft_p, *ft_end;
1955 /* find the queue from which we pick next packet.
1956 * NM_FT_NULL is always higher than valid indexes
1957 * so we never dereference it if the other list
1958 * has packets (and if both are empty we never
1961 if (next < brd_next) {
1963 next = ft_p->ft_next;
1964 } else { /* insert broadcast */
1965 ft_p = ft + brd_next;
1966 brd_next = ft_p->ft_next;
1968 cnt = ft_p->ft_frags; // cnt > 0
1969 if (unlikely(cnt > howmany))
1970 break; /* no more space */
1971 if (netmap_verbose && cnt > 1)
1972 RD(5, "rx %d frags to %d", cnt, j);
1973 ft_end = ft_p + cnt;
1974 if (unlikely(virt_hdr_mismatch)) {
1975 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
/* fast path: straight copy of each fragment into the RX slot */
1979 char *dst, *src = ft_p->ft_buf;
1980 size_t copy_len = ft_p->ft_len, dst_len = copy_len;
1982 slot = &ring->slot[j];
1983 dst = NMB(&dst_na->up, slot);
1985 ND("send [%d] %d(%d) bytes at %s:%d",
1986 i, (int)copy_len, (int)dst_len,
1987 NM_IFPNAME(dst_ifp), j);
1988 /* round to a multiple of 64 */
1989 copy_len = (copy_len + 63) & ~63;
1991 if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
1992 copy_len > NETMAP_BUF_SIZE(&na->up))) {
1993 RD(5, "invalid len %d, down to 64", (int)copy_len);
1994 copy_len = dst_len = 64; // XXX
1996 if (ft_p->ft_flags & NS_INDIRECT) {
1997 if (copyin(src, dst, copy_len)) {
1998 // invalid user pointer, pretend len is 0
2002 //memcpy(dst, src, copy_len);
2003 pkt_copy(src, dst, (int)copy_len);
2005 slot->len = dst_len;
2006 slot->flags = (cnt << 8)| NS_MOREFRAG;
2007 j = nm_next(j, lim);
2010 } while (ft_p != ft_end);
2011 slot->flags = (cnt << 8); /* clear flag on last entry */
2014 if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
2018 /* current position */
2019 uint32_t *p = kring->nkr_leases; /* shorthand */
2020 uint32_t update_pos;
2021 int still_locked = 1;
2023 mtx_lock(&kring->q_lock);
2024 if (unlikely(howmany > 0)) {
2025 /* not used all bufs. If i am the last one
2026 * i can recover the slots, otherwise must
2027 * fill them with 0 to mark empty packets.
2029 ND("leftover %d bufs", howmany);
2030 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
2031 /* yes i am the last one */
2032 ND("roll back nkr_hwlease to %d", j);
2033 kring->nkr_hwlease = j;
2035 while (howmany-- > 0) {
2036 ring->slot[j].len = 0;
2037 ring->slot[j].flags = 0;
2038 j = nm_next(j, lim);
2042 p[lease_idx] = j; /* report I am done */
2044 update_pos = kring->nr_hwtail;
2046 if (my_start == update_pos) {
2047 /* all slots before my_start have been reported,
2048 * so scan subsequent leases to see if other ranges
2049 * have been completed, and do a selwakeup or txsync.
2051 while (lease_idx != kring->nkr_lease_idx &&
2052 p[lease_idx] != NR_NOSLOT) {
2054 p[lease_idx] = NR_NOSLOT;
2055 lease_idx = nm_next(lease_idx, lim);
2057 /* j is the new 'write' position. j != my_start
2058 * means there are new buffers to report
2060 if (likely(j != my_start)) {
2061 kring->nr_hwtail = j;
2063 mtx_unlock(&kring->q_lock);
2064 kring->nm_notify(kring, 0);
2065 /* this is netmap_notify for VALE ports and
2066 * netmap_bwrap_notify for bwrap. The latter will
2067 * trigger a txsync on the underlying hwna
2069 if (dst_na->retry && retry--) {
2070 /* XXX this is going to call nm_notify again.
2071 * Only useful for bwrap in virtual machines
2078 mtx_unlock(&kring->q_lock);
2081 d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
2084 brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
2089 /* nm_txsync callback for VALE ports */
/*
 * nm_txsync callback for VALE ports (listing excerpt; some lines
 * missing). Pushes packets up to 'head' through nm_bdg_preflush()
 * and updates hwcur/hwtail accordingly; bridge_batch <= 0 bypasses
 * forwarding entirely (testing only).
 */
2091 netmap_vp_txsync(struct netmap_kring *kring, int flags)
2093 struct netmap_vp_adapter *na =
2094 (struct netmap_vp_adapter *)kring->na;
2096 u_int const lim = kring->nkr_num_slots - 1;
2097 u_int const head = kring->rhead;
2099 if (bridge_batch <= 0) { /* testing only */
2100 done = head; // used all
/* clamp the configurable batch size to the compiled-in maximum */
2107 if (bridge_batch > NM_BDG_BATCH)
2108 bridge_batch = NM_BDG_BATCH;
2110 done = nm_bdg_preflush(kring, head);
2113 D("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
2115 * packets between 'done' and 'cur' are left unsent.
2117 kring->nr_hwcur = done;
2118 kring->nr_hwtail = nm_prev(done, lim);
2120 D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
2125 /* rxsync code used by VALE ports nm_rxsync callback and also
2126 * internally by the brwap
/*
 * rxsync body used by the VALE port nm_rxsync callback and internally
 * by the bwrap (listing excerpt; some lines missing). Caller holds the
 * queue lock. Nothing to import (packets are already in the kring);
 * only advances nr_hwcur past slots released by userspace, with a
 * buffer-index consistency check along the way.
 */
2129 netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
2131 struct netmap_adapter *na = kring->na;
2132 struct netmap_ring *ring = kring->ring;
2133 u_int nm_i, lim = kring->nkr_num_slots - 1;
2134 u_int head = kring->rhead;
2138 D("ouch dangerous reset!!!");
2139 n = netmap_ring_reinit(kring);
2143 /* First part, import newly received packets. */
2144 /* actually nothing to do here, they are already in the kring */
2146 /* Second part, skip past packets that userspace has released. */
2147 nm_i = kring->nr_hwcur;
2149 /* consistency check, but nothing really important here */
2150 for (n = 0; likely(nm_i != head); n++) {
2151 struct netmap_slot *slot = &ring->slot[nm_i];
2152 void *addr = NMB(na, slot);
2154 if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
2155 D("bad buffer index %d, ignore ?",
2158 slot->flags &= ~NS_BUF_CHANGED;
2159 nm_i = nm_next(nm_i, lim);
2161 kring->nr_hwcur = head;
2170 * nm_rxsync callback for VALE ports
2171 * user process reading from a VALE switch.
2172 * Already protected against concurrent calls from userspace,
2173 * but we must acquire the queue's lock to protect against
2174 * writers on the same queue.
/*
 * nm_rxsync callback for VALE ports (listing excerpt).
 * Serializes against concurrent writers on the same queue by taking
 * q_lock around the locked rxsync body.
 */
2177 netmap_vp_rxsync(struct netmap_kring *kring, int flags)
2181 mtx_lock(&kring->q_lock);
2182 n = netmap_vp_rxsync_locked(kring, flags);
2183 mtx_unlock(&kring->q_lock);
2188 /* nm_bdg_attach callback for VALE ports
2189 * The na_vp port is this same netmap_adapter. There is no host port.
/*
 * nm_bdg_attach callback for VALE ports (listing excerpt).
 * The na_vp port is this same adapter; there is no host port, so
 * na_hostvp is cleared. Note: the original uses strncpy, which does
 * not guarantee NUL termination when 'name' fills the buffer.
 */
2192 netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
2194 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
2199 strncpy(na->name, name, sizeof(na->name));
2200 na->na_hostvp = NULL;
2204 /* create a netmap_vp_adapter that describes a VALE port.
2205 * Only persistent VALE ports have a non-null ifp.
/*
 * Create a netmap_vp_adapter describing a VALE port (listing excerpt;
 * some lines missing). Bounds the requested ring/slot/pipe/extra-buf
 * counts (writing the clamped values back into the request), installs
 * the VALE callbacks, obtains a memory allocator (the supplied nmd or
 * a private one), and finishes with netmap_attach_common(). On the
 * error path the allocator reference is dropped.
 * Only persistent VALE ports have a non-null ifp.
 */
2208 netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp,
2209 struct netmap_mem_d *nmd,
2210 struct netmap_vp_adapter **ret)
2212 struct netmap_vp_adapter *vpna;
2213 struct netmap_adapter *na;
2217 vpna = nm_os_malloc(sizeof(*vpna));
2224 strncpy(na->name, nmr->nr_name, sizeof(na->name));
2226 /* bound checking */
2227 na->num_tx_rings = nmr->nr_tx_rings;
2228 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
2229 nmr->nr_tx_rings = na->num_tx_rings; // write back
2230 na->num_rx_rings = nmr->nr_rx_rings;
2231 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
2232 nmr->nr_rx_rings = na->num_rx_rings; // write back
2233 nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
2234 1, NM_BDG_MAXSLOTS, NULL);
2235 na->num_tx_desc = nmr->nr_tx_slots;
2236 nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
2237 1, NM_BDG_MAXSLOTS, NULL);
2238 /* validate number of pipes. We want at least 1,
2239 * but probably can do with some more.
2240 * So let's use 2 as default (when 0 is supplied)
2242 npipes = nmr->nr_arg1;
2243 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
2244 nmr->nr_arg1 = npipes; /* write back */
2245 /* validate extra bufs */
2246 nm_bound_var(&nmr->nr_arg3, 0, 0,
2247 128*NM_BDG_MAXSLOTS, NULL);
2248 na->num_rx_desc = nmr->nr_rx_slots;
/* ~0 so the first learned source MAC always misses the cache */
2250 vpna->last_smac = ~0llu;
2251 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero??
2252 vpna->mfs = netmap_buf_size; */
2254 D("max frame size %u", vpna->mfs);
2256 na->na_flags |= NAF_BDG_MAYSLEEP;
2257 /* persistent VALE ports look like hw devices
2258 * with a native netmap adapter
2261 na->na_flags |= NAF_NATIVE;
/* install the VALE port callbacks */
2262 na->nm_txsync = netmap_vp_txsync;
2263 na->nm_rxsync = netmap_vp_rxsync;
2264 na->nm_register = netmap_vp_reg;
2265 na->nm_krings_create = netmap_vp_krings_create;
2266 na->nm_krings_delete = netmap_vp_krings_delete;
2267 na->nm_dtor = netmap_vp_dtor;
2268 D("nr_arg2 %d", nmr->nr_arg2);
/* use the caller-supplied allocator if any, else a private one */
2270 netmap_mem_get(nmd):
2271 netmap_mem_private_new(
2272 na->num_tx_rings, na->num_tx_desc,
2273 na->num_rx_rings, na->num_rx_desc,
2274 nmr->nr_arg3, npipes, &error);
2275 if (na->nm_mem == NULL)
2277 na->nm_bdg_attach = netmap_vp_bdg_attach;
2278 /* other nmd fields are set in the common routine */
2279 error = netmap_attach_common(na);
/* error path: release the allocator reference if we took one */
2286 if (na->nm_mem != NULL)
2287 netmap_mem_put(na->nm_mem);
2292 /* Bridge wrapper code (bwrap).
2293 * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
2295 * The main task is to swap the meaning of tx and rx rings to match the
2296 * expectations of the VALE switch code (see nm_bdg_flush).
2298 * The bwrap works by interposing a netmap_bwrap_adapter between the
2299 * rest of the system and the hwna. The netmap_bwrap_adapter looks like
2300 * a netmap_vp_adapter to the rest the system, but, internally, it
2301 * translates all callbacks to what the hwna expects.
2303 * Note that we have to intercept callbacks coming from two sides:
2305 * - callbacks coming from the netmap module are intercepted by
2306 * passing around the netmap_bwrap_adapter instead of the hwna
2308 * - callbacks coming from outside of the netmap module only know
2309 * about the hwna. This, however, only happens in interrupt
2310 * handlers, where only the hwna->nm_notify callback is called.
2311 * What the bwrap does is to overwrite the hwna->nm_notify callback
2312 * with its own netmap_bwrap_intr_notify.
2313 * XXX This assumes that the hwna->nm_notify callback was the
2314 * standard netmap_notify(), as it is the case for nic adapters.
2315 * Any additional action performed by hwna->nm_notify will not be
2316 * performed by netmap_bwrap_intr_notify.
2318 * Additionally, the bwrap can optionally attach the host rings pair
2319 * of the wrapped adapter to a different port of the switch.
/*
 * Destructor for a bwrap adapter (listing excerpt; some lines missing).
 * Drops the host-port allocator reference, detaches the bwrap (and the
 * host port, if attached) from the bridge, severs all back-pointers
 * from the wrapped hwna, clears NAF_BUSY and releases the hwna ref.
 */
2324 netmap_bwrap_dtor(struct netmap_adapter *na)
2326 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
2327 struct netmap_adapter *hwna = bna->hwna;
2328 struct nm_bridge *b = bna->up.na_bdg,
2329 *bh = bna->host.na_bdg;
2331 netmap_mem_put(bna->host.up.nm_mem);
2334 netmap_bdg_detach_common(b, bna->up.bdg_port,
2335 (bh ? bna->host.bdg_port : -1));
/* break the cross links so the hwna is usable again */
2340 bna->host.up.ifp = NULL;
2341 hwna->na_private = NULL;
2342 hwna->na_vp = hwna->na_hostvp = NULL;
2343 hwna->na_flags &= ~NAF_BUSY;
2344 netmap_adapter_put(hwna);
2350 * Intr callback for NICs connected to a bridge.
2351 * Simply ignore tx interrupts (maybe we could try to recover space ?)
2352 * and pass received packets from nic to the bridge.
2354 * XXX TODO check locking: this is called from the interrupt
2355 * handler so we should make sure that the interface is not
2356 * disconnected while passing down an interrupt.
2358 * Note, no user process can access this NIC or the host stack.
2359 * The only part of the ring that is significant are the slots,
2360 * and head/cur/tail are set from the kring as needed
2361 * (part as a receive ring, part as a transmit ring).
2363 * callback that overwrites the hwna notify callback.
2364 * Packets come from the outside or from the host stack and are put on an
2366 * The bridge wrapper then sends the packets through the bridge.
/*
 * Interrupt notify for NICs connected to a bridge (listing excerpt;
 * some lines missing). Overwrites the hwna notify: rxsync the hw ring,
 * push the new packets into the bridge via the bwrap TX kring, then
 * rxsync again to release the buffers. Returns NM_IRQ_RESCHED when
 * the second rxsync revealed more packets, else NM_IRQ_COMPLETED
 * (or the rxsync error).
 */
2369 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
2371 struct netmap_adapter *na = kring->na;
2372 struct netmap_bwrap_adapter *bna = na->na_private;
2373 struct netmap_kring *bkring;
2374 struct netmap_vp_adapter *vpna = &bna->up;
2375 u_int ring_nr = kring->ring_id;
2376 int ret = NM_IRQ_COMPLETED;
2380 D("%s %s 0x%x", na->name, kring->name, flags);
/* the bwrap TX kring paired with this hw RX kring */
2382 bkring = &vpna->up.tx_rings[ring_nr];
2384 /* make sure the ring is not disabled */
2385 if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) {
2390 D("%s head %d cur %d tail %d", na->name,
2391 kring->rhead, kring->rcur, kring->rtail);
2393 /* simulate a user wakeup on the rx ring
2394 * fetch packets that have arrived.
2396 error = kring->nm_sync(kring, 0);
2399 if (kring->nr_hwcur == kring->nr_hwtail) {
2401 D("how strange, interrupt with no packets on %s",
2406 /* new packets are kring->rcur to kring->nr_hwtail, and the bkring
2407 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
2408 * to push all packets out.
2410 bkring->rhead = bkring->rcur = kring->nr_hwtail;
2412 netmap_vp_txsync(bkring, flags);
2414 /* mark all buffers as released on this ring */
2415 kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
2416 /* another call to actually release the buffers */
2417 error = kring->nm_sync(kring, 0);
2419 /* The second rxsync may have further advanced hwtail. If this happens,
2420 * return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */
2421 if (kring->rcur != kring->nr_hwtail) {
2422 ret = NM_IRQ_RESCHED;
2427 return error ? error : ret;
2431 /* nm_register callback for bwrap */
/*
 * nm_register callback for the bwrap (listing excerpt; some lines
 * missing). On 'on': propagate the memory lut to hwna (and hostna),
 * cross-link the netmap rings (TX on one side = RX on the other),
 * forward pending ring modes, register the hwna, and hijack its RX
 * notify callbacks with netmap_bwrap_intr_notify. On 'off': undo the
 * flags and restore the saved notify callbacks.
 */
2433 netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
2435 struct netmap_bwrap_adapter *bna =
2436 (struct netmap_bwrap_adapter *)na;
2437 struct netmap_adapter *hwna = bna->hwna;
2438 struct netmap_vp_adapter *hostna = &bna->host;
2442 ND("%s %s", na->name, onoff ? "on" : "off");
2445 /* netmap_do_regif has been called on the bwrap na.
2446 * We need to pass the information about the
2447 * memory allocator down to the hwna before
2448 * putting it in netmap mode
2450 hwna->na_lut = na->na_lut;
2452 if (hostna->na_bdg) {
2453 /* if the host rings have been attached to switch,
2454 * we need to copy the memory allocator information
2455 * in the hostna also
2457 hostna->up.na_lut = na->na_lut;
2460 /* cross-link the netmap rings
2461 * The original number of rings comes from hwna,
2462 * rx rings on one side equals tx rings on the other.
2465 enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2466 for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
2467 NMR(hwna, r)[i].ring = NMR(na, t)[i].ring;
2471 if (na->na_flags & NAF_HOST_RINGS) {
2472 struct netmap_adapter *hna = &hostna->up;
2473 /* the hostna rings are the host rings of the bwrap.
2474 * The corresponding krings must point back to the
2477 hna->tx_rings = &na->tx_rings[na->num_tx_rings];
2478 hna->tx_rings[0].na = hna;
2479 hna->rx_rings = &na->rx_rings[na->num_rx_rings];
2480 hna->rx_rings[0].na = hna;
2484 /* pass down the pending ring state information */
2486 for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
2487 NMR(hwna, t)[i].nr_pending_mode =
2488 NMR(na, t)[i].nr_pending_mode;
2491 /* forward the request to the hwna */
2492 error = hwna->nm_register(hwna, onoff);
2496 /* copy up the current ring state information */
2498 for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
2499 NMR(na, t)[i].nr_mode =
2500 NMR(hwna, t)[i].nr_mode;
2503 /* impersonate a netmap_vp_adapter */
2504 netmap_vp_reg(na, onoff);
2506 netmap_vp_reg(&hostna->up, onoff);
2510 /* intercept the hwna nm_notify callback on the hw rings */
2511 for (i = 0; i < hwna->num_rx_rings; i++) {
2512 hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
2513 hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
2515 i = hwna->num_rx_rings; /* for safety */
2516 /* save the host ring notify unconditionally */
2517 hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
2518 if (hostna->na_bdg) {
2519 /* also intercept the host ring notify */
2520 hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
2522 if (na->active_fds == 0)
2523 na->na_flags |= NAF_NETMAP_ON;
2527 if (na->active_fds == 0)
2528 na->na_flags &= ~NAF_NETMAP_ON;
2530 /* reset all notify callbacks (including host ring) */
2531 for (i = 0; i <= hwna->num_rx_rings; i++) {
2532 hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify;
2533 hwna->rx_rings[i].save_notify = NULL;
/* detach the lut so the hwna no longer references our memory */
2535 hwna->na_lut.lut = NULL;
2536 hwna->na_lut.objtotal = 0;
2537 hwna->na_lut.objsize = 0;
2543 /* nm_config callback for bwrap */
/*
 * nm_config callback for the bwrap (numbered listing excerpt; some
 * original lines are missing from this extract).
 * Refreshes the wrapped hwna configuration and reports it with TX and
 * RX meanings swapped, because the bwrap mirrors the hwna: its TX side
 * is the hwna RX side and vice versa.
 */
2545 netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
2546 u_int *rxr, u_int *rxd)
2548 struct netmap_bwrap_adapter *bna =
2549 (struct netmap_bwrap_adapter *)na;
2550 struct netmap_adapter *hwna = bna->hwna;
2552 /* forward the request */
2553 netmap_update_config(hwna);
2554 /* swap the results */
2555 *txr = hwna->num_rx_rings;
2556 *txd = hwna->num_rx_desc;
2557 *rxr = hwna->num_tx_rings;
2558 *rxd = hwna->num_tx_desc; /* fix: was num_rx_desc — the swap must mirror the TX descriptor count */
2564 /* nm_krings_create callback for bwrap */
/*
 * nm_krings_create callback for the bwrap (listing excerpt; some lines
 * missing). Creates the VALE-port krings, then the hwna krings, and
 * copies each ring's slot count from the corresponding (swapped)
 * hwna ring; unwinds the VP krings on hwna failure.
 */
2566 netmap_bwrap_krings_create(struct netmap_adapter *na)
2568 struct netmap_bwrap_adapter *bna =
2569 (struct netmap_bwrap_adapter *)na;
2570 struct netmap_adapter *hwna = bna->hwna;
2576 /* impersonate a netmap_vp_adapter */
2577 error = netmap_vp_krings_create(na);
2581 /* also create the hwna krings */
2582 error = hwna->nm_krings_create(hwna);
2584 goto err_del_vp_rings;
2587 /* get each ring slot number from the corresponding hwna ring */
2589 enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2590 for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
2591 NMR(na, t)[i].nkr_num_slots = NMR(hwna, r)[i].nkr_num_slots;
/* error path: undo the VP kring creation */
2598 netmap_vp_krings_delete(na);
/*
 * nm_krings_delete callback for the bwrap (listing excerpt).
 * Deletes the hwna krings first, then the VALE-port krings —
 * the reverse of netmap_bwrap_krings_create().
 */
2605 netmap_bwrap_krings_delete(struct netmap_adapter *na)
2607 struct netmap_bwrap_adapter *bna =
2608 (struct netmap_bwrap_adapter *)na;
2609 struct netmap_adapter *hwna = bna->hwna;
2613 hwna->nm_krings_delete(hwna);
2614 netmap_vp_krings_delete(na);
2618 /* notify method for the bridge-->hwna direction */
/*
 * Notify method for the bridge-->hwna direction (numbered listing
 * excerpt; some original lines are missing from this extract).
 * Simulates a user cycle on the bwrap RX kring: rxsync to release
 * consumed buffers, txsync the paired hwna TX kring to transmit the
 * new packets, reclaim the hw-owned slots, then rxsync again.
 * Returns the sync error, or NM_IRQ_COMPLETED on success.
 */
2620 netmap_bwrap_notify(struct netmap_kring *kring, int flags)
2622 struct netmap_adapter *na = kring->na;
2623 struct netmap_bwrap_adapter *bna = na->na_private;
2624 struct netmap_adapter *hwna = bna->hwna;
2625 u_int ring_n = kring->ring_id;
2626 u_int lim = kring->nkr_num_slots - 1;
2627 struct netmap_kring *hw_kring;
2630 ND("%s: na %s hwna %s",
2631 (kring ? kring->name : "NULL!"),
2632 (na ? na->name : "NULL!"),
2633 (hwna ? hwna->name : "NULL!"));
2634 hw_kring = &hwna->tx_rings[ring_n];
2636 if (nm_kr_tryget(hw_kring, 0, NULL)) {
2640 /* first step: simulate a user wakeup on the rx ring */
2641 netmap_vp_rxsync(kring, flags);
2642 ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2644 kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2645 ring->head, ring->cur, ring->tail,
2646 hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail); /* fix: was hw_ring->rtail (undeclared identifier; the PST trace below uses hw_kring) */
2647 /* second step: the new packets are sent on the tx ring
2648 * (which is actually the same ring)
2650 hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
2651 error = hw_kring->nm_sync(hw_kring, flags);
2655 /* third step: now we are back the rx ring */
2656 /* claim ownership on all hw owned bufs */
2657 kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */
2659 /* fourth step: the user goes to sleep again, causing another rxsync */
2660 netmap_vp_rxsync(kring, flags);
2661 ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2663 kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2664 ring->head, ring->cur, ring->tail,
2665 hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2667 nm_kr_put(hw_kring);
2669 return error ? error : NM_IRQ_COMPLETED;
2673 /* nm_bdg_ctl callback for the bwrap.
2674 * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
2675 * On attach, it needs to provide a fake netmap_priv_d structure and
2676 * perform a netmap_do_regif() on the bwrap. This will put both the
2677 * bwrap and the hwna in netmap mode, with the netmap rings shared
2678 * and cross linked. Moroever, it will start intercepting interrupts
 *
 * (Extraction note: some intervening source lines are missing here.)
 */
2682 netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
2684 struct netmap_priv_d *npriv;
2685 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
/* --- attach path: refuse if someone already owns the adapter --- */
2689 if (NETMAP_OWNED_BY_ANY(na)) {
2692 if (bna->na_kpriv) {
/* allocate the kernel-owned priv that stands in for a user registration */
2696 npriv = netmap_priv_new();
2699 npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
2700 error = netmap_do_regif(npriv, na, 0, NR_REG_NIC_SW);
/* regif failed: drop the priv we just allocated */
2702 netmap_priv_delete(npriv);
2705 bna->na_kpriv = npriv;
2706 na->na_flags |= NAF_BUSY;
/* --- detach path: undo the fake registration --- */
2708 if (na->active_fds == 0) /* not registered */
2710 netmap_priv_delete(bna->na_kpriv);
2711 bna->na_kpriv = NULL;
2712 na->na_flags &= ~NAF_BUSY;
2718 /* attach a bridge wrapper to the 'real' device */
/*
 * Allocate and initialize a netmap_bwrap_adapter around 'hwna' so the
 * NIC can be plugged into a VALE bridge under the name 'nr_name'.
 * Fills the bwrap's ring geometry with tx/rx swapped relative to the
 * hardware, installs the bwrap callbacks, takes a reference on hwna,
 * and (when hwna has host rings) sets up a companion host adapter.
 * (Extraction note: some intervening source lines are missing here,
 * including error-return statements and loop headers.)
 */
2720 netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
2722 struct netmap_bwrap_adapter *bna;
2723 struct netmap_adapter *na = NULL;
2724 struct netmap_adapter *hostna = NULL;
2728 /* make sure the NIC is not already in use */
2729 if (NETMAP_OWNED_BY_ANY(hwna)) {
2730 D("NIC %s busy, cannot attach to bridge", hwna->name);
2734 bna = nm_os_malloc(sizeof(*bna));
2740 /* make bwrap ifp point to the real ifp */
2741 na->ifp = hwna->ifp;
2743 na->na_private = bna;
/* NOTE(review): strncpy does not guarantee NUL termination if nr_name
 * fills the buffer — presumably callers pass shorter names; confirm. */
2744 strncpy(na->name, nr_name, sizeof(na->name));
2745 /* fill the ring data for the bwrap adapter with rx/tx meanings
2746 * swapped. The real cross-linking will be done during register,
2747 * when all the krings will have been created.
2750 enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2751 nma_set_nrings(na, t, nma_get_nrings(hwna, r));
2752 nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
/* install the bwrap-specific adapter callbacks */
2754 na->nm_dtor = netmap_bwrap_dtor;
2755 na->nm_register = netmap_bwrap_reg;
2756 // na->nm_txsync = netmap_bwrap_txsync;
2757 // na->nm_rxsync = netmap_bwrap_rxsync;
2758 na->nm_config = netmap_bwrap_config;
2759 na->nm_krings_create = netmap_bwrap_krings_create;
2760 na->nm_krings_delete = netmap_bwrap_krings_delete;
2761 na->nm_notify = netmap_bwrap_notify;
2762 na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
2763 na->pdev = hwna->pdev;
2764 na->nm_mem = netmap_mem_get(hwna->nm_mem);
2765 na->virt_hdr_len = hwna->virt_hdr_len;
2766 bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
/* take a reference on hwna; released on the error path below */
2769 netmap_adapter_get(hwna);
2770 hwna->na_private = bna; /* weak reference */
2771 hwna->na_vp = &bna->up;
/* optional companion adapter for the host ("^") rings */
2773 if (hwna->na_flags & NAF_HOST_RINGS) {
2774 if (hwna->na_flags & NAF_SW_ONLY)
2775 na->na_flags |= NAF_SW_ONLY;
2776 na->na_flags |= NAF_HOST_RINGS;
2777 hostna = &bna->host.up;
2778 snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name);
2779 hostna->ifp = hwna->ifp;
/* host side: one ring per direction, descriptors mirrored (tx/rx swapped) */
2781 enum txrx r = nm_txrx_swap(t);
2782 nma_set_nrings(hostna, t, 1);
2783 nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
2785 // hostna->nm_txsync = netmap_bwrap_host_txsync;
2786 // hostna->nm_rxsync = netmap_bwrap_host_rxsync;
2787 hostna->nm_notify = netmap_bwrap_notify;
2788 hostna->nm_mem = netmap_mem_get(na->nm_mem);
2789 hostna->na_private = bna;
2790 hostna->na_vp = &bna->up;
2791 na->na_hostvp = hwna->na_hostvp =
2792 hostna->na_hostvp = &bna->host;
2793 hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
/* NOTE(review): "ifp" in the ND() below is not declared in the visible
 * scope (only na->ifp / hwna->ifp are); it only compiles because ND()
 * expands to nothing in non-debug builds — confirm. */
2796 ND("%s<->%s txr %d txd %d rxr %d rxd %d",
2797 na->name, ifp->if_xname,
2798 na->num_tx_rings, na->num_tx_desc,
2799 na->num_rx_rings, na->num_rx_desc);
2801 error = netmap_attach_common(na);
/* success: mark the hardware adapter as owned by the bwrap */
2805 hwna->na_flags |= NAF_BUSY;
/* error path: unlink the weak pointers and drop the hwna reference */
2809 hwna->na_vp = hwna->na_hostvp = NULL;
2810 netmap_adapter_put(hwna);
/*
 * Allocate and initialize an array of n nm_bridge descriptors.
 * Returns the array, or (presumably) NULL on allocation failure — the
 * failure-check line is not visible in this extraction.
 */
2817 netmap_init_bridges2(u_int n)
2820 struct nm_bridge *b;
2822 b = nm_os_malloc(sizeof(struct nm_bridge) * n);
/* per-bridge initialization loop (body not visible here) */
2825 for (i = 0; i < n; i++)
/*
 * Counterpart of netmap_init_bridges2(): destroy the per-bridge
 * rw-lock of each of the n bridges in array b, then (in lines not
 * visible here) release the array.
 */
2831 netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
2838 for (i = 0; i < n; i++)
2839 BDG_RWDESTROY(&b[i]);
/*
 * Module-init hook for the bridge subsystem.
 * With CONFIG_NET_NS (Linux network namespaces) bridge setup is
 * delegated to netmap_bns_register(); otherwise the static nm_bridges
 * array is allocated here (the #else/error-return lines are not
 * visible in this extraction).
 */
2844 netmap_init_bridges(void)
2846 #ifdef CONFIG_NET_NS
2847 return netmap_bns_register();
2849 nm_bridges = netmap_init_bridges2(NM_BRIDGES);
2850 if (nm_bridges == NULL)
/*
 * Module-exit hook: undo netmap_init_bridges(), choosing the netns or
 * static-array teardown to match whichever path init used.
 */
2857 netmap_uninit_bridges(void)
2859 #ifdef CONFIG_NET_NS
2860 netmap_bns_unregister();
2862 netmap_uninit_bridges2(nm_bridges, NM_BRIDGES);
2865 #endif /* WITH_VALE */