2 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * This module implements the VALE switch for netmap
32 NMG_LOCK() serializes all modifications to switches and ports.
33 A switch cannot be deleted until all ports are gone.
35 For each switch, an SX lock (RWlock on linux) protects
36 deletion of ports. When configuring or deleting a new port, the
37 lock is acquired in exclusive mode (after holding NMG_LOCK).
38 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
39 The lock is held throughout the entire forwarding cycle,
40 during which the thread may incur a page fault.
41 Hence it is important that sleepable shared locks are used.
43 On the rx ring, the per-port lock is grabbed initially to reserve
44 a number of slots in the ring, then the lock is released,
45 packets are copied from source to destination, and then
46 the lock is acquired again and the receive ring is updated.
47 (A similar thing is done on the tx ring for NIC and host stack
48 ports attached to the switch)
53 * OS-specific code that is used only within this file.
54 * Other OS-specific code that must be accessed by drivers
55 * is present in netmap_kern.h
58 #if defined(__FreeBSD__)
59 #include <sys/cdefs.h> /* prerequisite */
60 __FBSDID("$FreeBSD$");
62 #include <sys/types.h>
63 #include <sys/errno.h>
64 #include <sys/param.h> /* defines used in kernel.h */
65 #include <sys/kernel.h> /* types used in module initialization */
66 #include <sys/conf.h> /* cdevsw struct, UID, GID */
67 #include <sys/sockio.h>
68 #include <sys/socketvar.h> /* struct socket */
69 #include <sys/malloc.h>
71 #include <sys/rwlock.h>
72 #include <sys/socket.h> /* sockaddrs */
73 #include <sys/selinfo.h>
74 #include <sys/sysctl.h>
76 #include <net/if_var.h>
77 #include <net/bpf.h> /* BIOCIMMEDIATE */
78 #include <machine/bus.h> /* bus_dmamap_* */
79 #include <sys/endian.h>
80 #include <sys/refcount.h>
83 #define BDG_RWLOCK_T struct rwlock // struct rwlock
85 #define BDG_RWINIT(b) \
86 rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
87 #define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock)
88 #define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock)
89 #define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock)
90 #define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock)
91 #define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock)
92 #define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock)
99 #elif defined(__APPLE__)
101 #warning OSX support is only partial
102 #include "osx_glue.h"
106 #error Unsupported platform
108 #endif /* unsupported */
114 #include <net/netmap.h>
115 #include <dev/netmap/netmap_kern.h>
116 #include <dev/netmap/netmap_mem2.h>
121 * system parameters (most of them in netmap_kern.h)
122 * NM_NAME prefix for switch port names, default "vale"
123 * NM_BDG_MAXPORTS number of ports
124 * NM_BRIDGES max number of switches in the system.
125 * XXX should become a sysctl or tunable
127 * Switch ports are named valeX:Y where X is the switch name and Y
128 * is the port. If Y matches a physical interface name, the port is
129 * connected to a physical device.
131 * Unlike physical interfaces, switch ports use their own memory region
132 * for rings and buffers.
133 * The virtual interfaces use per-queue lock instead of core lock.
134 * In the tx loop, we aggregate traffic in batches to make all operations
135 * faster. The batch size is bridge_batch.
137 #define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */
138 #define NM_BDG_MAXSLOTS 4096 /* XXX same as above */
139 #define NM_BRIDGE_RINGSIZE 1024 /* in the device */
140 #define NM_BDG_HASH 1024 /* forwarding table entries */
141 #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */
142 #define NM_MULTISEG 64 /* max size of a chain of bufs */
143 /* actual size of the tables */
144 #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG)
145 /* NM_FT_NULL terminates a list of slots in the ft */
146 #define NM_FT_NULL NM_BDG_BATCH_MAX
147 #define NM_BRIDGES 8 /* number of bridges */
151 * bridge_batch is set via sysctl to the max batch size to be
152 * used in the bridge. The actual value may be larger as the
153 * last packet in the block may overflow the size.
155 int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
156 SYSCTL_DECL(_dev_netmap);
157 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , "");
160 static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp);
161 static int bdg_netmap_reg(struct netmap_adapter *na, int onoff);
162 static int netmap_bwrap_attach(struct ifnet *, struct ifnet *);
163 static int netmap_bwrap_register(struct netmap_adapter *, int onoff);
164 int kern_netmap_regif(struct nmreq *nmr);
167 * Each transmit queue accumulates a batch of packets into
168 * a structure before forwarding. Packets to the same
169 * destination are put in a list using ft_next as a link field.
170 * ft_frags and ft_next are valid only on the first fragment.
/*
 * NOTE(review): per the defines above, ft_next lists are terminated by
 * NM_FT_NULL (== NM_BDG_BATCH_MAX), not by 0 — confirm against the full file.
 */
172 struct nm_bdg_fwd { /* forwarding entry for a bridge */
173 void *ft_buf; /* netmap or indirect buffer */
174 uint8_t ft_frags; /* how many fragments (only on 1st frag) */
175 uint8_t _ft_port; /* dst port (unused) */
176 uint16_t ft_flags; /* flags, e.g. indirect */
177 uint16_t ft_len; /* src fragment len */
178 uint16_t ft_next; /* next packet to same destination */
182 * For each output interface, nm_bdg_q is used to construct a list.
183 * bq_len is the number of output buffers (we can have coalescing
189 uint32_t bq_len; /* number of buffers */
192 /* XXX revise this */
194 uint64_t mac; /* the top 2 bytes are the epoch */
199 * nm_bridge is a descriptor for a VALE switch.
200 * Interfaces for a bridge are all in bdg_ports[].
201 * The array has fixed size, an empty entry does not terminate
202 * the search, but lookups only occur on attach/detach so we
203 * don't mind if they are slow.
205 * The bridge is non blocking on the transmit ports: excess
206 * packets are dropped if there is no room on the output port.
208 * bdg_lock protects accesses to the bdg_ports array.
209 * This is a rw lock (or equivalent).
212 /* XXX what is the proper alignment/layout ? */
213 BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */
215 uint32_t bdg_active_ports; /* 0 means free */
216 char bdg_basename[IFNAMSIZ];
218 /* Indexes of active ports (up to active_ports)
219 * and all other remaining ports.
221 uint8_t bdg_port_index[NM_BDG_MAXPORTS];
223 struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];
227 * The function to decide the destination port.
228 * It returns either an index of the destination port,
229 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
230 * forward this packet. ring_nr is the source ring index, and the
231 * function may overwrite this value to forward this packet to a
232 * different ring index.
233 * This function must be set by netmap_bdgctl().
235 bdg_lookup_fn_t nm_bdg_lookup;
237 /* the forwarding table, MAC+ports.
238 * XXX should be changed to an argument to be passed to
239 * the lookup function, and allocated on attach
241 struct nm_hash_ent ht[NM_BDG_HASH];
246 * XXX in principle nm_bridges could be created dynamically
247 * Right now we have a static array and deletions are protected
248 * by an exclusive lock.
250 struct nm_bridge nm_bridges[NM_BRIDGES];
254 * this is a slightly optimized copy routine which rounds
255 * to multiple of 64 bytes and is often faster than dealing
256 * with other odd sizes. We assume there is enough room
257 * in the source and destination buffers.
259 * XXX only for multiples of 64 bytes, non overlapped.
/*
 * NOTE(review): the excerpt is missing lines here (embedded numbering jumps
 * 266 -> 270): the >= 1024 branch presumably falls back to a bulk copy, and
 * the 64-byte loop body is not visible — confirm against the full source.
 */
262 pkt_copy(void *_src, void *_dst, int l)
264 uint64_t *src = _src;
265 uint64_t *dst = _dst;
266 if (unlikely(l >= 1024)) {
270 for (; likely(l > 0); l-=64) {
284 * locate a bridge among the existing ones.
285 * MUST BE CALLED WITH NMG_LOCK()
287 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
288 * We assume that this is called with a name of at least NM_NAME chars.
/*
 * nm_find_bridge - locate a VALE switch by name, optionally creating it.
 * MUST BE CALLED WITH NMG_LOCK (see the header comment above).
 * The bridge name is the portion of 'name' up to the first ':' past the
 * NM_NAME prefix. Returns the bridge descriptor; presumably NULL when the
 * name is invalid or no slot is free — the returns are not visible in this
 * excerpt (embedded line numbers jump), so confirm against the full file.
 */
290 static struct nm_bridge *
291 nm_find_bridge(const char *name, int create)
294 struct nm_bridge *b = NULL;
298 namelen = strlen(NM_NAME); /* base length */
299 l = name ? strlen(name) : 0; /* actual length */
/* NOTE(review): passing a null pointer for %s is undefined behavior in the
 * kernel printf path; "NULL" as a string literal would be safer — verify. */
301 D("invalid bridge name %s", name ? name : NULL);
304 for (i = namelen + 1; i < l; i++) {
305 if (name[i] == ':') {
310 if (namelen >= IFNAMSIZ)
312 ND("--- prefix is '%.*s' ---", namelen, name);
314 /* lookup the name, remember empty slot if there is one */
315 for (i = 0; i < NM_BRIDGES; i++) {
316 struct nm_bridge *x = nm_bridges + i;
/* a bridge with no active ports is considered free (see struct comment) */
318 if (x->bdg_active_ports == 0) {
319 if (create && b == NULL)
320 b = x; /* record empty slot */
321 } else if (x->bdg_namelen != namelen) {
323 } else if (strncmp(name, x->bdg_basename, namelen) == 0) {
324 ND("found '%.*s' at %d", namelen, name, i);
329 if (i == NM_BRIDGES && b) { /* name not found, can create entry */
330 /* initialize the bridge */
331 strncpy(b->bdg_basename, name, namelen);
332 ND("create new bridge %s with ports %d", b->bdg_basename,
333 b->bdg_active_ports);
334 b->bdg_namelen = namelen;
335 b->bdg_active_ports = 0;
/* the port-index permutation starts as the identity */
336 for (i = 0; i < NM_BDG_MAXPORTS; i++)
337 b->bdg_port_index[i] = i;
338 /* set the default function */
339 b->nm_bdg_lookup = netmap_bdg_learning;
340 /* reset the MAC address table */
341 bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
348 * Free the forwarding tables for rings attached to switch ports.
/*
 * Walks the tx krings of 'na' and releases each per-ring forwarding table
 * (nkr_ft) allocated by nm_alloc_bdgfwd(). Idempotent: the pointer is
 * NULLed after free so a second call is harmless.
 */
351 nm_free_bdgfwd(struct netmap_adapter *na)
354 struct netmap_kring *kring;
357 nrings = na->num_tx_rings;
358 kring = na->tx_rings;
359 for (i = 0; i < nrings; i++) {
360 if (kring[i].nkr_ft) {
361 free(kring[i].nkr_ft, M_DEVBUF);
362 kring[i].nkr_ft = NULL; /* protect from freeing twice */
369 * Allocate the forwarding tables for the rings attached to the bridge ports.
/*
 * One contiguous M_NOWAIT|M_ZERO allocation per tx ring holds, in order:
 *   - NM_BDG_BATCH_MAX nm_bdg_fwd work entries,
 *   - one nm_bdg_q per destination queue (ports * rings + broadcast),
 *   - NM_BDG_BATCH_MAX uint16_t destination indexes.
 * The same layout is decoded by nm_bdg_flush(). Every destination queue is
 * initialized to the empty-list sentinel NM_FT_NULL.
 * NOTE(review): the malloc-failure branch is not visible in this excerpt.
 */
372 nm_alloc_bdgfwd(struct netmap_adapter *na)
374 int nrings, l, i, num_dstq;
375 struct netmap_kring *kring;
378 /* all port:rings + broadcast */
379 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
380 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
381 l += sizeof(struct nm_bdg_q) * num_dstq;
382 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
/* +1: the host-stack ring also gets a table */
384 nrings = na->num_tx_rings + 1;
385 kring = na->tx_rings;
386 for (i = 0; i < nrings; i++) {
387 struct nm_bdg_fwd *ft;
388 struct nm_bdg_q *dstq;
391 ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
396 dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
397 for (j = 0; j < num_dstq; j++) {
398 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
401 kring[i].nkr_ft = ft;
/*
 * Remove the hw and/or sw port (pass -1 to skip one) from bridge 'b'.
 * Strategy (per the pseudo-code below): update a private copy of the
 * active-port permutation outside the lock, then take BDG_WLOCK only for
 * the final copy-back, keeping the writer critical section tiny.
 */
408 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
410 int s_hw = hw, s_sw = sw;
411 int i, lim =b->bdg_active_ports;
412 uint8_t tmp[NM_BDG_MAXPORTS];
416 make a copy of bdg_port_index;
417 lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
418 in the array of bdg_port_index, replacing them with
419 entries from the bottom of the array;
420 decrement bdg_active_ports;
421 acquire BDG_WLOCK() and copy back the array.
424 D("detach %d and %d (lim %d)", hw, sw, lim);
425 /* make a copy of the list of active ports, update it,
426 * and then copy back within BDG_WLOCK().
428 memcpy(tmp, b->bdg_port_index, sizeof(tmp));
/* swap each matched port with the last active entry; loop index only
 * advances on a non-match (advance statements are outside this excerpt) */
429 for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
430 if (hw >= 0 && tmp[i] == hw) {
431 ND("detach hw %d at %d", hw, i);
432 lim--; /* point to last active port */
433 tmp[i] = tmp[lim]; /* swap with i */
434 tmp[lim] = hw; /* now this is inactive */
436 } else if (sw >= 0 && tmp[i] == sw) {
437 ND("detach sw %d at %d", sw, i);
/* hw/sw still >= 0 here means the port was never found in the array */
446 if (hw >= 0 || sw >= 0) {
447 D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
451 b->bdg_ports[s_hw] = NULL;
453 b->bdg_ports[s_sw] = NULL;
455 memcpy(b->bdg_port_index, tmp, sizeof(tmp));
456 b->bdg_active_ports = lim;
459 ND("now %d active ports", lim);
/* lim == 0 presumably: bridge becomes free, clear the lookup fn */
461 ND("marking bridge %s as free", b->bdg_basename);
462 b->nm_bdg_lookup = NULL;
/*
 * Destructor for a virtual-port adapter: detach the port from its bridge
 * and scrub the synthetic ifnet that netmap_get_bdg_na() allocated for it.
 * NOTE(review): the eventual free of ifp is outside this excerpt — the
 * bzero here presumably precedes it; confirm against the full file.
 */
468 netmap_adapter_vp_dtor(struct netmap_adapter *na)
470 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
471 struct nm_bridge *b = vpna->na_bdg;
472 struct ifnet *ifp = na->ifp;
474 ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount);
/* -1: a pure virtual port has no host-stack companion to detach */
477 netmap_bdg_detach_common(b, vpna->bdg_port, -1);
480 bzero(ifp, sizeof(*ifp));
486 /* Try to get a reference to a netmap adapter attached to a VALE switch.
487 * If the adapter is found (or is created), this function returns 0, a
488 * non NULL pointer is returned into *na, and the caller holds a
489 * reference to the adapter.
490 * If an adapter is not found, then no reference is grabbed and the
491 * function returns an error code, or 0 if there is just a VALE prefix
492 * mismatch. Therefore the caller holds a reference when
493 * (*na != NULL && return == 0).
494 *
495 * NOTE(review): called under NMG_LOCK (see nm_find_bridge contract);
496 * this excerpt is missing many lines, so error paths are incomplete here.
497 */
496 netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
498 const char *name = nmr->nr_name;
501 struct netmap_adapter *ret;
502 struct netmap_vp_adapter *vpna;
504 int i, j, cand = -1, cand2 = -1;
507 *na = NULL; /* default return value */
509 /* first try to see if this is a bridge port. */
511 if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
512 return 0; /* no error, but no VALE prefix */
515 b = nm_find_bridge(name, create);
517 D("no bridges available for '%s'", name);
518 return (create ? ENOMEM : ENXIO);
521 /* Now we are sure that name starts with the bridge's name,
522 * lookup the port in the bridge. We need to scan the entire
523 * list. It is not important to hold a WLOCK on the bridge
524 * during the search because NMG_LOCK already guarantees
525 * that there are no other possible writers.
528 /* lookup in the local list of ports */
529 for (j = 0; j < b->bdg_active_ports; j++) {
530 i = b->bdg_port_index[j];
531 vpna = b->bdg_ports[i];
532 // KASSERT(na != NULL);
534 /* XXX make sure the name only contains one : */
535 if (!strcmp(NM_IFPNAME(ifp), name)) {
536 netmap_adapter_get(&vpna->up);
537 ND("found existing if %s refs %d", name,
538 vpna->na_bdg_refcount);
539 *na = (struct netmap_adapter *)vpna;
543 /* not found, should we create it? */
546 /* yes we should, see if we have space to attach entries */
547 needed = 2; /* in some cases we only need 1 */
548 if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
549 D("bridge full %d, cannot create new port", b->bdg_active_ports);
552 /* record the next two ports available, but do not allocate yet */
553 cand = b->bdg_port_index[b->bdg_active_ports];
554 cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
555 ND("+++ bridge %s port %s used %d avail %d %d",
556 b->bdg_basename, name, b->bdg_active_ports, cand, cand2);
559 * try see if there is a matching NIC with this name
560 * (after the bridge's name)
562 ifp = ifunit_ref(name + b->bdg_namelen + 1);
563 if (!ifp) { /* this is a virtual port */
565 /* nr_cmd must be 0 for a virtual port */
569 /* create a struct ifnet for the new port.
570 * need M_NOWAIT as we are under nma_lock
572 ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
/* NOTE(review): unbounded strcpy into if_xname; name length is presumably
 * validated upstream (nm_find_bridge checks against IFNAMSIZ) — verify. */
576 strcpy(ifp->if_xname, name);
577 /* bdg_netmap_attach creates a struct netmap_adapter */
578 error = bdg_netmap_attach(nmr, ifp);
580 D("error %d", error);
585 cand2 = -1; /* only need one port */
586 } else { /* this is a NIC */
587 struct ifnet *fake_ifp;
589 error = netmap_get_hw_na(ifp, &ret);
590 if (error || ret == NULL)
593 /* make sure the NIC is not already in use */
594 if (NETMAP_OWNED_BY_ANY(ret)) {
595 D("NIC %s busy, cannot attach to bridge",
600 /* create a fake interface */
601 fake_ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
606 strcpy(fake_ifp->if_xname, name);
607 error = netmap_bwrap_attach(fake_ifp, ifp);
609 free(fake_ifp, M_DEVBUF);
/* NICs also get a host-stack companion port unless explicitly requested */
613 if (nmr->nr_arg1 != NETMAP_BDG_HOST)
614 cand2 = -1; /* only need one port */
617 vpna = (struct netmap_vp_adapter *)ret;
620 vpna->bdg_port = cand;
621 ND("NIC %p to bridge port %d", vpna, cand);
622 /* bind the port to the bridge (virtual ports are not active) */
623 b->bdg_ports[cand] = vpna;
625 b->bdg_active_ports++;
/* hostna sits immediately after vpna in the bwrap allocation — presumably;
 * confirm against the netmap_bwrap_adapter layout in netmap_kern.h */
627 struct netmap_vp_adapter *hostna = vpna + 1;
628 /* also bind the host stack to the bridge */
629 b->bdg_ports[cand2] = hostna;
630 hostna->bdg_port = cand2;
632 b->bdg_active_ports++;
633 ND("host %p to bridge port %d", hostna, cand2);
635 ND("if %s refs %d", name, vpna->up.na_refcount);
638 netmap_adapter_get(ret);
648 /* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
/*
 * Attach (and register in netmap mode) the port named in nmr to its VALE
 * switch, creating the port if needed. On success the allocated priv is
 * stashed in the bwrap adapter (na_kpriv); the tail lines here are the
 * error-unwind path. Missing lines in this excerpt hide the labels/returns.
 */
650 nm_bdg_attach(struct nmreq *nmr)
652 struct netmap_adapter *na;
653 struct netmap_if *nifp;
654 struct netmap_priv_d *npriv;
655 struct netmap_bwrap_adapter *bna;
658 npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
664 error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
665 if (error) /* no device, or another bridge or user owns the device */
668 if (na == NULL) { /* VALE prefix missing */
673 if (na->active_fds > 0) { /* already registered */
678 nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, &error);
683 bna = (struct netmap_bwrap_adapter*)na;
684 bna->na_kpriv = npriv;
686 ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp));
/* unwind: drop the reference taken by netmap_get_bdg_na, then scrub priv */
690 netmap_adapter_put(na);
693 bzero(npriv, sizeof(*npriv));
694 free(npriv, M_DEVBUF);
/*
 * Reverse of nm_bdg_attach: look up the port (without creating it),
 * unregister it and free the kernel-owned priv. Refuses to detach while
 * userspace mappings are still active (netmap_dtor_locked != last).
 */
700 nm_bdg_detach(struct nmreq *nmr)
702 struct netmap_adapter *na;
704 struct netmap_bwrap_adapter *bna;
708 error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
709 if (error) { /* no device, or another bridge or user owns the device */
713 if (na == NULL) { /* VALE prefix missing */
718 bna = (struct netmap_bwrap_adapter *)na;
720 if (na->active_fds == 0) { /* not registered */
725 last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */
726 if (!last_instance) {
727 D("--- error, trying to detach an entry with active mmaps");
730 struct netmap_priv_d *npriv = bna->na_kpriv;
732 bna->na_kpriv = NULL;
735 bzero(npriv, sizeof(*npriv));
736 free(npriv, M_DEVBUF);
/* drop the reference taken by netmap_get_bdg_na above */
740 netmap_adapter_put(na);
748 /* exported to kernel callers, e.g. OVS ?
750 * Called without NMG_LOCK.
751 *
752 * NOTE(review): dispatcher for the NETMAP_BDG_* sub-commands in nmr->nr_cmd;
753 * the switch statement itself and the break/locking lines are missing from
754 * this excerpt, so each case below is shown without its epilogue.
755 */
753 netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
756 struct netmap_adapter *na;
757 struct netmap_vp_adapter *vpna;
759 char *name = nmr->nr_name;
760 int cmd = nmr->nr_cmd, namelen = strlen(name);
764 case NETMAP_BDG_ATTACH:
765 error = nm_bdg_attach(nmr);
768 case NETMAP_BDG_DETACH:
769 error = nm_bdg_detach(nmr);
772 case NETMAP_BDG_LIST:
773 /* this is used to enumerate bridges and ports */
774 if (namelen) { /* look up indexes of bridge and port */
775 if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
780 b = nm_find_bridge(name, 0 /* don't create */);
788 for (j = 0; j < b->bdg_active_ports; j++) {
789 i = b->bdg_port_index[j];
790 vpna = b->bdg_ports[i];
/* a NULL port in the active list indicates corrupted bridge state */
792 D("---AAAAAAAAARGH-------");
796 /* the former and the latter identify a
797 * virtual port and a NIC, respectively
799 if (!strcmp(iter->if_xname, name)) {
/* report bridge and port indexes back to the caller via nr_arg1/2 */
801 nmr->nr_arg1 = b - nm_bridges;
802 nmr->nr_arg2 = i; /* port index */
809 /* return the first non-empty entry starting from
810 * bridge nr_arg1 and port nr_arg2.
812 * Users can detect the end of the same bridge by
813 * seeing the new and old value of nr_arg1, and can
814 * detect the end of all the bridge by error != 0
820 for (error = ENOENT; i < NM_BRIDGES; i++) {
822 if (j >= b->bdg_active_ports) {
823 j = 0; /* following bridges scan from 0 */
828 j = b->bdg_port_index[j];
829 vpna = b->bdg_ports[j];
831 strncpy(name, iter->if_xname, (size_t)IFNAMSIZ);
839 case NETMAP_BDG_LOOKUP_REG:
840 /* register a lookup function to the given bridge.
841 * nmr->nr_name may be just bridge's name (including ':'
842 * if it is not just NM_NAME).
849 b = nm_find_bridge(name, 0 /* don't create */);
853 b->nm_bdg_lookup = func;
858 case NETMAP_BDG_OFFSET:
860 error = netmap_get_bdg_na(nmr, &na, 0);
862 vpna = (struct netmap_vp_adapter *)na;
/* clamp the requested payload offset to the supported maximum */
863 if (nmr->nr_arg1 > NETMAP_BDG_MAX_OFFSET)
864 nmr->nr_arg1 = NETMAP_BDG_MAX_OFFSET;
865 vpna->offset = nmr->nr_arg1;
866 D("Using offset %d for %p", vpna->offset, vpna);
867 netmap_adapter_put(na);
873 D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
/*
 * Create the krings for a virtual port. Extra tailroom holds the RX lease
 * arrays (one uint32_t per descriptor per RX ring), which nm_kr_lease()
 * uses during forwarding. On bdgfwd-allocation failure the krings are torn
 * down again (the intervening error checks are missing from this excerpt).
 */
882 netmap_vp_krings_create(struct netmap_adapter *na)
884 u_int ntx, nrx, tailroom;
888 /* XXX vps do not need host rings,
889 * but we crash if we don't have one
891 ntx = na->num_tx_rings + 1;
892 nrx = na->num_rx_rings + 1;
895 * Leases are attached to RX rings on vale ports
897 tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
899 error = netmap_krings_create(na, ntx, nrx, tailroom);
/* carve the per-ring lease arrays out of the tailroom area */
903 leases = na->tailroom;
905 for (i = 0; i < nrx; i++) { /* Receive rings */
906 na->rx_rings[i].nkr_leases = leases;
907 leases += na->num_rx_desc;
910 error = nm_alloc_bdgfwd(na);
912 netmap_krings_delete(na);
/*
 * Tear down the krings of a virtual port. NOTE(review): the embedded line
 * numbering jumps 921 -> 924, so a preceding call (presumably
 * nm_free_bdgfwd(na)) is missing from this excerpt — confirm.
 */
921 netmap_vp_krings_delete(struct netmap_adapter *na)
924 netmap_krings_delete(na);
929 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
930 struct netmap_vp_adapter *na, u_int ring_nr);
934 * Grab packets from a kring, move them into the ft structure
935 * associated to the tx (input) port. Max one instance per port,
936 * filtered on input (ioctl, poll or XXX).
937 * Returns the next position in the ring.
/*
 * Scans slots [nr_hwcur, end) of the source tx ring, builds nm_bdg_fwd
 * entries (tracking multi-fragment packets via NS_MOREFRAG), and calls
 * nm_bdg_flush() whenever the batch reaches bridge_batch, plus once for
 * the final partial batch. Runs under the bridge's shared lock.
 */
940 nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr,
941 struct netmap_kring *kring, u_int end)
943 struct netmap_ring *ring = kring->ring;
944 struct nm_bdg_fwd *ft;
945 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
946 u_int ft_i = 0; /* start from 0 */
947 u_int frags = 1; /* how many frags ? */
948 struct nm_bridge *b = na->na_bdg;
950 /* To protect against modifications to the bridge we acquire a
951 * shared lock, waiting if we can sleep (if the source port is
952 * attached to a user process) or with a trylock otherwise (NICs).
954 ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
955 if (na->up.na_flags & NAF_BDG_MAYSLEEP)
957 else if (!BDG_RTRYLOCK(b))
959 ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
962 for (; likely(j != end); j = nm_next(j, lim)) {
963 struct netmap_slot *slot = &ring->slot[j];
966 ft[ft_i].ft_len = slot->len;
967 ft[ft_i].ft_flags = slot->flags;
969 ND("flags is 0x%x", slot->flags);
970 /* this slot goes into a list so initialize the link field */
971 ft[ft_i].ft_next = NM_FT_NULL;
/* NS_INDIRECT means the slot points at a user buffer, not a netmap one */
972 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
973 (void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot);
974 __builtin_prefetch(buf);
976 if (slot->flags & NS_MOREFRAG) {
980 if (unlikely(netmap_verbose && frags > 1))
981 RD(5, "%d frags at %d", frags, ft_i - frags);
/* record the fragment count on the first fragment of the packet */
982 ft[ft_i - frags].ft_frags = frags;
984 if (unlikely((int)ft_i >= bridge_batch))
985 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
/* ring wrapped mid-packet: drop the trailing incomplete fragment chain */
988 D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
989 // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG
990 ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG;
991 ft[ft_i - frags].ft_frags = frags - 1;
994 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1000 /* ----- FreeBSD if_bridge hash function ------- */
1003 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
1004 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
1006 * http://www.burtleburtle.net/bob/hash/spooky.html
/* Jenkins lookup2-style mixing step; the opening "do {" line is missing
 * from this excerpt. */
1008 #define mix(a, b, c) \
1010 a -= b; a -= c; a ^= (c >> 13); \
1011 b -= c; b -= a; b ^= (a << 8); \
1012 c -= a; c -= b; c ^= (b >> 13); \
1013 a -= b; a -= c; a ^= (c >> 12); \
1014 b -= c; b -= a; b ^= (a << 16); \
1015 c -= a; c -= b; c ^= (b >> 5); \
1016 a -= b; a -= c; a ^= (c >> 3); \
1017 b -= c; b -= a; b ^= (a << 10); \
1018 c -= a; c -= b; c ^= (b >> 15); \
1019 } while (/*CONSTCOND*/0)
/*
 * Hash a 6-byte MAC address into an index for the forwarding table ht[].
 * The byte-loading lines between 1025 and 1035 are missing here.
 */
1022 static __inline uint32_t
1023 nm_bridge_rthash(const uint8_t *addr)
1025 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key (golden ratio)
1035 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1)
1036 return (c & BRIDGE_RTHASH_MASK);
/*
 * nm_register callback for virtual ports: the port stays attached to the
 * bridge for its whole life, so enabling/disabling netmap mode is just a
 * matter of toggling IFCAP_NETMAP under the bridge write lock (the onoff
 * if/else lines are partially missing from this excerpt).
 */
1043 bdg_netmap_reg(struct netmap_adapter *na, int onoff)
1045 struct netmap_vp_adapter *vpna =
1046 (struct netmap_vp_adapter*)na;
1047 struct ifnet *ifp = na->ifp;
1049 /* the interface is already attached to the bridge,
1050 * so we only need to toggle IFCAP_NETMAP.
1052 BDG_WLOCK(vpna->na_bdg);
1054 ifp->if_capenable |= IFCAP_NETMAP;
1056 ifp->if_capenable &= ~IFCAP_NETMAP;
1058 BDG_WUNLOCK(vpna->na_bdg);
1064 * Lookup function for a learning bridge.
1065 * Update the hash table with the source address,
1066 * and then returns the destination port index, and the
1067 * ring in *dst_ring (at the moment, always use ring 0)
/*
 * Returns NM_BDG_NOPORT for runt frames, the learned port for a known
 * unicast destination, or NM_BDG_BROADCAST otherwise. Source learning is
 * skipped for multicast sources (low bit of byte 6 of the frame set).
 */
1070 netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
1071 struct netmap_vp_adapter *na)
1073 struct nm_hash_ent *ht = na->na_bdg->ht;
1075 u_int dst, mysrc = na->bdg_port;
1076 uint64_t smac, dmac;
1079 D("invalid buf length %d", buf_len);
1080 return NM_BDG_NOPORT;
/* dst MAC is bytes 0..5; src MAC read is offset so bytes 6..11 land in the
 * top of the 64-bit load — NOTE(review): these wide loads assume an
 * alignment/endianness regime; confirm against the full source. */
1082 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
1083 smac = le64toh(*(uint64_t *)(buf + 4));
1087 * The hash is somewhat expensive, there might be some
1088 * worthwhile optimizations here.
1090 if ((buf[6] & 1) == 0) { /* valid src */
1092 sh = nm_bridge_rthash(s); // XXX hash of source
1093 /* update source port forwarding entry */
1094 ht[sh].mac = smac; /* XXX expire ? */
1095 ht[sh].ports = mysrc;
1097 D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
1098 s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
1100 dst = NM_BDG_BROADCAST;
1101 if ((buf[0] & 1) == 0) { /* unicast */
1102 dh = nm_bridge_rthash(buf); // XXX hash of dst
1103 if (ht[dh].mac == dmac) { /* found dst */
1106 /* XXX otherwise return NM_BDG_UNKNOWN ? */
1114 * Available space in the ring. Only used in VALE code
1115 * and only with is_rx = 1
/*
 * For is_rx: slots available for new leases, i.e. ring size minus one
 * minus the span already leased but not yet reclaimed (hwlease - hwcur,
 * wrapped). The non-rx branch is explicitly unused. The sanity check at
 * the bottom only logs; the surrounding braces/returns are not visible.
 */
1117 static inline uint32_t
1118 nm_kr_space(struct netmap_kring *k, int is_rx)
1123 int busy = k->nkr_hwlease - k->nr_hwcur;
1125 busy += k->nkr_num_slots;
1126 space = k->nkr_num_slots - 1 - busy;
1128 /* XXX never used in this branch */
1129 space = k->nr_hwtail - k->nkr_hwlease;
1131 space += k->nkr_num_slots;
1135 if (k->nkr_hwlease >= k->nkr_num_slots ||
1136 k->nr_hwcur >= k->nkr_num_slots ||
1137 k->nr_tail >= k->nkr_num_slots ||
1139 busy >= k->nkr_num_slots) {
1140 D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1141 k->nkr_lease_idx, k->nkr_num_slots);
1150 /* make a lease on the kring for N positions. return the
1151 * lease index (the return statement is outside this excerpt).
1152 * XXX only used in VALE code and with is_rx = 1
1153 *
1154 * NOTE(review): caller is expected to have sized n via nm_kr_space();
1155 * the over-request branch only logs, it does not appear to clamp n here.
1156 */
1154 static inline uint32_t
1155 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
1157 uint32_t lim = k->nkr_num_slots - 1;
1158 uint32_t lease_idx = k->nkr_lease_idx;
/* mark this lease slot pending; the lessee fills it on completion */
1160 k->nkr_leases[lease_idx] = NR_NOSLOT;
1161 k->nkr_lease_idx = nm_next(lease_idx, lim);
1163 if (n > nm_kr_space(k, is_rx)) {
1164 D("invalid request for %d slots", n);
1167 /* XXX verify that there are n slots */
1168 k->nkr_hwlease += n;
1169 if (k->nkr_hwlease > lim)
1170 k->nkr_hwlease -= lim + 1;
/* consistency check mirrors nm_kr_space(); logs only */
1172 if (k->nkr_hwlease >= k->nkr_num_slots ||
1173 k->nr_hwcur >= k->nkr_num_slots ||
1174 k->nr_hwtail >= k->nkr_num_slots ||
1175 k->nkr_lease_idx >= k->nkr_num_slots) {
1176 D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
1177 k->na->ifp->if_xname,
1178 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1179 k->nkr_lease_idx, k->nkr_num_slots);
1185 * This flush routine supports only unicast and broadcast but a large
1186 * number of ports, and lets us replace the learn and dispatch functions.
1189 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
1192 struct nm_bdg_q *dst_ents, *brddst;
1193 uint16_t num_dsts = 0, *dsts;
1194 struct nm_bridge *b = na->na_bdg;
1195 u_int i, j, me = na->bdg_port;
1198 * The work area (pointed by ft) is followed by an array of
1199 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS
1200 * queues per port plus one for the broadcast traffic.
1201 * Then we have an array of destination indexes.
1203 dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
1204 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
1206 /* first pass: find a destination for each packet in the batch */
1207 for (i = 0; likely(i < n); i += ft[i].ft_frags) {
1208 uint8_t dst_ring = ring_nr; /* default, same ring as origin */
1209 uint16_t dst_port, d_i;
1211 uint8_t *buf = ft[i].ft_buf;
1212 u_int len = ft[i].ft_len;
1214 ND("slot %d frags %d", i, ft[i].ft_frags);
1215 /* Drop the packet if the offset is not into the first
1216 fragment nor at the very beginning of the second. */
1217 if (unlikely(na->offset > len))
1219 if (len == na->offset) {
1220 buf = ft[i+1].ft_buf;
1221 len = ft[i+1].ft_len;
1226 dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na);
1227 if (netmap_verbose > 255)
1228 RD(5, "slot %d port %d -> %d", i, me, dst_port);
1229 if (dst_port == NM_BDG_NOPORT)
1230 continue; /* this packet is identified to be dropped */
1231 else if (unlikely(dst_port > NM_BDG_MAXPORTS))
1233 else if (dst_port == NM_BDG_BROADCAST)
1234 dst_ring = 0; /* broadcasts always go to ring 0 */
1235 else if (unlikely(dst_port == me ||
1236 !b->bdg_ports[dst_port]))
1239 /* get a position in the scratch pad */
1240 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
1243 /* append the first fragment to the list */
1244 if (d->bq_head == NM_FT_NULL) { /* new destination */
1245 d->bq_head = d->bq_tail = i;
1246 /* remember this position to be scanned later */
1247 if (dst_port != NM_BDG_BROADCAST)
1248 dsts[num_dsts++] = d_i;
1250 ft[d->bq_tail].ft_next = i;
1253 d->bq_len += ft[i].ft_frags;
1257 * Broadcast traffic goes to ring 0 on all destinations.
1258 * So we need to add these rings to the list of ports to scan.
1259 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
1260 * expensive. We should keep a compact list of active destinations
1261 * so we could shorten this loop.
1263 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
1264 if (brddst->bq_head != NM_FT_NULL) {
1265 for (j = 0; likely(j < b->bdg_active_ports); j++) {
1267 i = b->bdg_port_index[j];
1268 if (unlikely(i == me))
1270 d_i = i * NM_BDG_MAXRINGS;
1271 if (dst_ents[d_i].bq_head == NM_FT_NULL)
1272 dsts[num_dsts++] = d_i;
1276 ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
1277 /* second pass: scan destinations (XXX will be modular somehow) */
1278 for (i = 0; i < num_dsts; i++) {
1279 struct ifnet *dst_ifp;
1280 struct netmap_vp_adapter *dst_na;
1281 struct netmap_kring *kring;
1282 struct netmap_ring *ring;
1283 u_int dst_nr, lim, j, sent = 0, d_i, next, brd_next;
1284 u_int needed, howmany;
1285 int retry = netmap_txsync_retry;
1287 uint32_t my_start = 0, lease_idx = 0;
1289 int offset_mismatch;
1292 ND("second pass %d port %d", i, d_i);
1294 // XXX fix the division
1295 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
1296 /* protect from the lookup function returning an inactive
1299 if (unlikely(dst_na == NULL))
1301 if (dst_na->up.na_flags & NAF_SW_ONLY)
1303 dst_ifp = dst_na->up.ifp;
1305 * The interface may be in !netmap mode in two cases:
1306 * - when na is attached but not activated yet;
1307 * - when na is being deactivated but is still attached.
1309 if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
1310 ND("not in netmap mode!");
1314 offset_mismatch = (dst_na->offset != na->offset);
1316 /* there is at least one either unicast or broadcast packet */
1317 brd_next = brddst->bq_head;
1319 /* we need to reserve this many slots. If fewer are
1320 * available, some packets will be dropped.
1321 * Packets may have multiple fragments, so
1322 * there is a chance that we may not use all of the slots
1323 * we have claimed, so we will need to handle the leftover
1324 * ones when we regain the lock.
1326 needed = d->bq_len + brddst->bq_len;
1328 ND(5, "pass 2 dst %d is %x %s",
1329 i, d_i, is_vp ? "virtual" : "nic/host");
1330 dst_nr = d_i & (NM_BDG_MAXRINGS-1);
1331 nrings = dst_na->up.num_rx_rings;
1332 if (dst_nr >= nrings)
1333 dst_nr = dst_nr % nrings;
1334 kring = &dst_na->up.rx_rings[dst_nr];
1336 lim = kring->nkr_num_slots - 1;
1340 /* reserve the buffers in the queue and an entry
1341 * to report completion, and drop lock.
1342 * XXX this might become a helper function.
1344 mtx_lock(&kring->q_lock);
1345 if (kring->nkr_stopped) {
1346 mtx_unlock(&kring->q_lock);
1349 if (dst_na->retry) {
1350 dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
1352 my_start = j = kring->nkr_hwlease;
1353 howmany = nm_kr_space(kring, 1);
1354 if (needed < howmany)
1356 lease_idx = nm_kr_lease(kring, howmany, 1);
1357 mtx_unlock(&kring->q_lock);
1359 /* only retry if we need more than available slots */
1360 if (retry && needed <= howmany)
1363 /* copy to the destination queue */
1364 while (howmany > 0) {
1365 struct netmap_slot *slot;
1366 struct nm_bdg_fwd *ft_p, *ft_end;
1368 int fix_mismatch = offset_mismatch;
1370 /* find the queue from which we pick next packet.
1371 * NM_FT_NULL is always higher than valid indexes
1372 * so we never dereference it if the other list
1373 * has packets (and if both are empty we never
1376 if (next < brd_next) {
1378 next = ft_p->ft_next;
1379 } else { /* insert broadcast */
1380 ft_p = ft + brd_next;
1381 brd_next = ft_p->ft_next;
1383 cnt = ft_p->ft_frags; // cnt > 0
1384 if (unlikely(cnt > howmany))
1385 break; /* no more space */
1387 if (netmap_verbose && cnt > 1)
1388 RD(5, "rx %d frags to %d", cnt, j);
1389 ft_end = ft_p + cnt;
1391 char *dst, *src = ft_p->ft_buf;
1392 size_t copy_len = ft_p->ft_len, dst_len = copy_len;
1394 slot = &ring->slot[j];
1395 dst = BDG_NMB(&dst_na->up, slot);
1397 if (unlikely(fix_mismatch)) {
1398 /* We are processing the first fragment
1399 * and there is a mismatch between source
1400 * and destination offsets. Create a zeroed
1401 * header for the destination, independently
1402 * of the source header length and content.
1405 copy_len -= na->offset;
1406 bzero(dst, dst_na->offset);
1407 dst += dst_na->offset;
1408 dst_len = dst_na->offset + copy_len;
1409 /* fix the first fragment only */
1411 /* Here it could be copy_len == dst_len == 0,
1412 * and so a zero length fragment is passed.
1416 ND("send [%d] %d(%d) bytes at %s:%d",
1417 i, (int)copy_len, (int)dst_len,
1418 NM_IFPNAME(dst_ifp), j);
1419 /* round to a multiple of 64 */
1420 copy_len = (copy_len + 63) & ~63;
1422 if (ft_p->ft_flags & NS_INDIRECT) {
1423 if (copyin(src, dst, copy_len)) {
1424 // invalid user pointer, pretend len is 0
1428 //memcpy(dst, src, copy_len);
1429 pkt_copy(src, dst, (int)copy_len);
1431 slot->len = dst_len;
1432 slot->flags = (cnt << 8)| NS_MOREFRAG;
1433 j = nm_next(j, lim);
1436 } while (ft_p != ft_end);
1437 slot->flags = (cnt << 8); /* clear flag on last entry */
1439 if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
1443 /* current position */
1444 uint32_t *p = kring->nkr_leases; /* shorthand */
1445 uint32_t update_pos;
1446 int still_locked = 1;
1448 mtx_lock(&kring->q_lock);
1449 if (unlikely(howmany > 0)) {
1450 /* not used all bufs. If i am the last one
1451 * i can recover the slots, otherwise must
1452 * fill them with 0 to mark empty packets.
1454 ND("leftover %d bufs", howmany);
1455 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
1456 /* yes i am the last one */
1457 ND("roll back nkr_hwlease to %d", j);
1458 kring->nkr_hwlease = j;
1460 while (howmany-- > 0) {
1461 ring->slot[j].len = 0;
1462 ring->slot[j].flags = 0;
1463 j = nm_next(j, lim);
1467 p[lease_idx] = j; /* report I am done */
1469 update_pos = kring->nr_hwtail;
1471 if (my_start == update_pos) {
1472 /* all slots before my_start have been reported,
1473 * so scan subsequent leases to see if other ranges
1474 * have been completed, and do a selwakeup or txsync.
1476 while (lease_idx != kring->nkr_lease_idx &&
1477 p[lease_idx] != NR_NOSLOT) {
1479 p[lease_idx] = NR_NOSLOT;
1480 lease_idx = nm_next(lease_idx, lim);
1482 /* j is the new 'write' position. j != my_start
1483 * means there are new buffers to report
1485 if (likely(j != my_start)) {
1486 kring->nr_hwtail = j;
1487 dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
1489 mtx_unlock(&kring->q_lock);
1490 if (dst_na->retry && retry--)
1495 mtx_unlock(&kring->q_lock);
1498 d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
1501 brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
/*
 * Txsync handler for a VALE virtual port: push the packets the user
 * queued on tx ring 'ring_nr' into the switch via nm_bdg_preflush().
 * Runs without NMG_LOCK (see the locking notes at the top of the file).
 */
1508 netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags)
1510 struct netmap_kring *kring = &na->up.tx_rings[ring_nr];
1512 u_int const lim = kring->nkr_num_slots - 1;
1513 u_int const cur = kring->rcur;
1515 if (bridge_batch <= 0) { /* testing only */
1516 done = cur; // used all
/* clamp the (sysctl-tunable, presumably) batch size to the compile-time max */
1519 if (bridge_batch > NM_BDG_BATCH)
1520 bridge_batch = NM_BDG_BATCH;
1522 done = nm_bdg_preflush(na, ring_nr, kring, cur);
/* done != cur means preflush stopped early;
1525 D("early break at %d/ %d, tail %d", done, cur, kring->nr_hwtail);
1527 * packets between 'done' and 'cur' are left unsent.
 */
1529 kring->nr_hwcur = done;
/* tx slots are recycled immediately: tail trails hwcur by one slot */
1530 kring->nr_hwtail = nm_prev(done, lim);
1531 nm_txsync_finalize(kring);
1533 D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags);
1539 * main dispatch routine for the bridge.
1540 * We already know that only one thread is running this.
1541 * we must run nm_bdg_preflush without lock.
/* Thin nm_txsync wrapper: downcast the generic adapter to the VALE
 * virtual-port type and delegate to netmap_vp_txsync(). */
1544 bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
1546 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
1547 return netmap_vp_txsync(vpna, ring_nr, flags);
/*
 * Rxsync handler for a VALE virtual port. Senders (nm_bdg_flush)
 * deposit packets directly into the kring, so the "import" phase is
 * a no-op; we only release the slots userspace has consumed,
 * clearing NS_BUF_CHANGED along the way.
 */
1551 netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
1553 struct netmap_kring *kring = &na->rx_rings[ring_nr];
1554 struct netmap_ring *ring = kring->ring;
1555 u_int nm_i, lim = kring->nkr_num_slots - 1;
1556 u_int head = nm_rxsync_prologue(kring);
/* prologue found an inconsistent ring: reinit it */
1560 D("ouch dangerous reset!!!");
1561 n = netmap_ring_reinit(kring);
1565 /* First part, import newly received packets. */
1566 /* actually nothing to do here, they are already in the kring */
1568 /* Second part, skip past packets that userspace has released. */
1569 nm_i = kring->nr_hwcur;
1571 /* consistency check, but nothing really important here */
1572 for (n = 0; likely(nm_i != head); n++) {
1573 struct netmap_slot *slot = &ring->slot[nm_i];
1574 void *addr = BDG_NMB(na, slot);
/* BDG_NMB() appears to return netmap_buffer_base for an out-of-range
 * buffer index — TODO confirm against its definition */
1576 if (addr == netmap_buffer_base) { /* bad buf */
1577 D("bad buffer index %d, ignore ?",
1580 slot->flags &= ~NS_BUF_CHANGED;
1581 nm_i = nm_next(nm_i, lim);
1583 kring->nr_hwcur = head;
1586 /* tell userspace that there are new packets */
1587 nm_rxsync_finalize(kring);
1594 * user process reading from a VALE switch.
1595 * Already protected against concurrent calls from userspace,
1596 * but we must acquire the queue's lock to protect against
1597 * writers on the same queue.
1600 bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
1602 struct netmap_kring *kring = &na->rx_rings[ring_nr];
/* serialize against senders (nm_bdg_flush) writing this rx queue */
1605 mtx_lock(&kring->q_lock);
1606 n = netmap_vp_rxsync(na, ring_nr, flags);
1607 mtx_unlock(&kring->q_lock);
/*
 * Create and initialize the netmap adapter for a VALE virtual port.
 * Ring and slot counts requested in *nmr are clamped to the allowed
 * ranges and the actual values are written back so userspace sees
 * what it got.
 */
1613 bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
1615 struct netmap_vp_adapter *vpna;
1616 struct netmap_adapter *na;
1619 vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO);
1627 /* bound checking */
1628 na->num_tx_rings = nmr->nr_tx_rings;
1629 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1630 nmr->nr_tx_rings = na->num_tx_rings; // write back
1631 na->num_rx_rings = nmr->nr_rx_rings;
1632 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1633 nmr->nr_rx_rings = na->num_rx_rings; // write back
1634 nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
1635 1, NM_BDG_MAXSLOTS, NULL);
1636 na->num_tx_desc = nmr->nr_tx_slots;
1637 nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
1638 1, NM_BDG_MAXSLOTS, NULL);
1639 na->num_rx_desc = nmr->nr_rx_slots;
/* virtual ports may sleep while forwarding and own their allocator */
1642 na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
1643 na->nm_txsync = bdg_netmap_txsync;
1644 na->nm_rxsync = bdg_netmap_rxsync;
1645 na->nm_register = bdg_netmap_reg;
1646 na->nm_dtor = netmap_adapter_vp_dtor;
1647 na->nm_krings_create = netmap_vp_krings_create;
1648 na->nm_krings_delete = netmap_vp_krings_delete;
/* each virtual port gets a private memory allocator */
1649 na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp),
1650 na->num_tx_rings, na->num_tx_desc,
1651 na->num_rx_rings, na->num_rx_desc);
1652 /* other nmd fields are set in the common routine */
1653 error = netmap_attach_common(na);
/* on failure, release the adapter allocated above */
1655 free(vpna, M_DEVBUF);
/*
 * Destructor for a bridge wrapper adapter: detach the NIC port and,
 * if present, the host-stack port from the bridge, drop the
 * reference on the hardware adapter, and free the fake ifnet that
 * netmap_bwrap_attach() allocated.
 */
1663 netmap_bwrap_dtor(struct netmap_adapter *na)
1665 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
1666 struct netmap_adapter *hwna = bna->hwna;
1667 struct nm_bridge *b = bna->up.na_bdg,
1668 *bh = bna->host.na_bdg;
1669 struct ifnet *ifp = na->ifp;
/* -1 means "no host port attached" */
1674 netmap_bdg_detach_common(b, bna->up.bdg_port,
1675 (bh ? bna->host.bdg_port : -1));
/* break the weak back-reference before releasing hwna */
1678 hwna->na_private = NULL;
1679 netmap_adapter_put(hwna);
/* the fake ifp is owned by the wrapper; scrub and free it */
1681 bzero(ifp, sizeof(*ifp));
1682 free(ifp, M_DEVBUF);
1689 * Intr callback for NICs connected to a bridge.
1690 * Simply ignore tx interrupts (maybe we could try to recover space ?)
1691 * and pass received packets from nic to the bridge.
1693 * XXX TODO check locking: this is called from the interrupt
1694 * handler so we should make sure that the interface is not
1695 * disconnected while passing down an interrupt.
1697 * Note, no user process can access this NIC or the host stack.
1698 * The only part of the ring that is significant are the slots,
1699 * and head/cur/tail are set from the kring as needed
1700 * (part as a receive ring, part as a transmit ring).
1702 * callback that overwrites the hwna notify callback.
1703 * Packets come from the outside or from the host stack and are put on an hwna rx ring.
1704 * The bridge wrapper then sends the packets through the bridge.
1707 netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags)
1709 struct ifnet *ifp = na->ifp;
1710 struct netmap_bwrap_adapter *bna = na->na_private;
1711 struct netmap_vp_adapter *hostna = &bna->host;
1712 struct netmap_kring *kring, *bkring;
1713 struct netmap_ring *ring;
/* ring index num_rx_rings is reserved for the host stack */
1714 int is_host_ring = ring_nr == na->num_rx_rings;
1715 struct netmap_vp_adapter *vpna = &bna->up;
1719 D("%s %s%d 0x%x", NM_IFPNAME(ifp),
1720 (tx == NR_TX ? "TX" : "RX"), ring_nr, flags);
/* propagate enable/disable of the hwna ring to the paired bridge ring
 * (note the tx/rx swap between the two sides) */
1722 if (flags & NAF_DISABLE_NOTIFY) {
1723 kring = tx == NR_TX ? na->tx_rings : na->rx_rings;
1724 bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings;
1725 if (kring[ring_nr].nkr_stopped)
1726 netmap_disable_ring(&bkring[ring_nr]);
1728 bkring[ring_nr].nkr_stopped = 0;
/* nothing to do if the NIC left netmap mode meanwhile */
1732 if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP))
1735 /* we only care about receive interrupts */
1739 kring = &na->rx_rings[ring_nr];
1742 /* make sure the ring is not disabled */
1743 if (nm_kr_tryget(kring))
/* host ring with no bridge attached: use the NIC's original notify */
1746 if (is_host_ring && hostna->na_bdg == NULL) {
1747 error = bna->save_notify(na, ring_nr, tx, flags);
1751 /* Here we expect ring->head = ring->cur = ring->tail
1752 * because everything has been released from the previous round.
1753 * However the ring is shared and we might have info from
1754 * the wrong side (the tx ring). Hence we overwrite with
1755 * the info from the rx kring.
1758 D("%s head %d cur %d tail %d (kring %d %d %d)", NM_IFPNAME(ifp),
1759 ring->head, ring->cur, ring->tail,
1760 kring->rhead, kring->rcur, kring->rtail);
1762 ring->head = kring->rhead;
1763 ring->cur = kring->rcur;
1764 ring->tail = kring->rtail;
1766 /* simulate a user wakeup on the rx ring */
1768 netmap_rxsync_from_host(na, NULL, NULL);
1772 /* fetch packets that have arrived.
1773 * XXX maybe do this in a loop ?
1775 error = na->nm_rxsync(na, ring_nr, 0);
1779 if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) {
1780 D("how strange, interrupt with no packets on %s",
1785 /* new packets are ring->cur to ring->tail, and the bkring
1786 * had hwcur == ring->cur. So advance ring->cur to ring->tail
1787 * to push all packets out.
1789 ring->head = ring->cur = ring->tail;
1791 /* also set tail to what the bwrap expects */
1792 bkring = &vpna->up.tx_rings[ring_nr];
1793 ring->tail = bkring->nr_hwtail; // rtail too ?
1795 /* pass packets to the switch */
1796 nm_txsync_prologue(bkring); // XXX error checking ?
1797 netmap_vp_txsync(vpna, ring_nr, flags);
1799 /* mark all buffers as released on this ring */
1800 ring->head = ring->cur = kring->nr_hwtail;
1801 ring->tail = kring->rtail;
1802 /* another call to actually release the buffers */
1803 if (!is_host_ring) {
1804 error = na->nm_rxsync(na, ring_nr, 0);
1806 /* mark all packets as released, as in the
1807 * second part of netmap_rxsync_from_host()
1809 kring->nr_hwcur = kring->nr_hwtail;
1810 nm_rxsync_finalize(kring);
/*
 * nm_register callback for a bridge wrapper: forward the register
 * to the hardware adapter, share the buffer lookup table with it,
 * and cross-link the netmap rings so that each side's rx rings are
 * the other side's tx rings.
 */
1820 netmap_bwrap_register(struct netmap_adapter *na, int onoff)
1822 struct netmap_bwrap_adapter *bna =
1823 (struct netmap_bwrap_adapter *)na;
1824 struct netmap_adapter *hwna = bna->hwna;
1825 struct netmap_vp_adapter *hostna = &bna->host;
1828 ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off");
/* hwna uses the wrapper's buffer lookup table */
1833 hwna->na_lut = na->na_lut;
1834 hwna->na_lut_objtotal = na->na_lut_objtotal;
/* ... and so does the host-stack port, if attached to a bridge */
1836 if (hostna->na_bdg) {
1837 hostna->up.na_lut = na->na_lut;
1838 hostna->up.na_lut_objtotal = na->na_lut_objtotal;
1841 /* cross-link the netmap rings
1842 * The original number of rings comes from hwna,
1843 * rx rings on one side equals tx rings on the other.
 * (<= so the extra host ring is linked too)
1845 for (i = 0; i <= na->num_rx_rings; i++) {
1846 hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots;
1847 hwna->tx_rings[i].ring = na->rx_rings[i].ring;
1849 for (i = 0; i <= na->num_tx_rings; i++) {
1850 hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots;
1851 hwna->rx_rings[i].ring = na->tx_rings[i].ring;
1856 error = hwna->nm_register(hwna, onoff);
1861 bdg_netmap_reg(na, onoff);
/* intercept the hwna notify callback while registered */
1864 bna->save_notify = hwna->nm_notify;
1865 hwna->nm_notify = netmap_bwrap_intr_notify;
/* on unregister: restore the callback, stop sharing the lut */
1867 hwna->nm_notify = bna->save_notify;
1868 hwna->na_lut = NULL;
1869 hwna->na_lut_objtotal = 0;
/*
 * nm_config callback for a bridge wrapper: refresh the hardware
 * adapter's configuration, then report it with tx and rx swapped,
 * since the wrapper's tx side maps onto the NIC's rx side and
 * vice versa.
 *
 * txr/txd/rxr/rxd: out parameters for ring counts and slot counts.
 */
1877 netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
1878 u_int *rxr, u_int *rxd)
1880 struct netmap_bwrap_adapter *bna =
1881 (struct netmap_bwrap_adapter *)na;
1882 struct netmap_adapter *hwna = bna->hwna;
1884 /* forward the request */
1885 netmap_update_config(hwna);
1886 /* swap the results */
1887 *txr = hwna->num_rx_rings;
1888 *txd = hwna->num_rx_desc;
1889 *rxr = hwna->num_tx_rings;
/* fix: was num_rx_desc, which broke the tx/rx swap — the wrapper's
 * rx descriptors must mirror the NIC's tx descriptors, matching the
 * swap done in netmap_bwrap_attach() */
1890 *rxd = hwna->num_tx_desc;
/*
 * Create krings for the VALE side of the wrapper and for the
 * hardware adapter; the host-stack rings reuse the slots past the
 * end of the wrapper's data rings.
 */
1897 netmap_bwrap_krings_create(struct netmap_adapter *na)
1899 struct netmap_bwrap_adapter *bna =
1900 (struct netmap_bwrap_adapter *)na;
1901 struct netmap_adapter *hwna = bna->hwna;
1902 struct netmap_adapter *hostna = &bna->host.up;
1905 ND("%s", NM_IFPNAME(na->ifp));
1907 error = netmap_vp_krings_create(na);
1911 error = hwna->nm_krings_create(hwna);
/* undo the VALE-side creation if the hwna side failed */
1913 netmap_vp_krings_delete(na);
/* host rings live just past the data rings in the same arrays */
1917 hostna->tx_rings = na->tx_rings + na->num_tx_rings;
1918 hostna->rx_rings = na->rx_rings + na->num_rx_rings;
/*
 * Delete the krings of the hardware adapter and of the VALE side of
 * the wrapper (reverse of netmap_bwrap_krings_create).
 */
1925 netmap_bwrap_krings_delete(struct netmap_adapter *na)
1927 struct netmap_bwrap_adapter *bna =
1928 (struct netmap_bwrap_adapter *)na;
1929 struct netmap_adapter *hwna = bna->hwna;
1931 ND("%s", NM_IFPNAME(na->ifp));
1933 hwna->nm_krings_delete(hwna);
1934 netmap_vp_krings_delete(na);
1938 /* notify method for the bridge-->hwna direction */
/*
 * Drain packets that the bridge deposited on the wrapper's rx ring
 * into the NIC's tx ring with the same index (the two rings share
 * the same netmap_ring). The sequence simulates a user process:
 * rxsync to collect, consume, txsync to transmit, rxsync to release.
 */
1940 netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
1942 struct netmap_bwrap_adapter *bna =
1943 (struct netmap_bwrap_adapter *)na;
1944 struct netmap_adapter *hwna = bna->hwna;
1945 struct netmap_kring *kring, *hw_kring;
1946 struct netmap_ring *ring;
/* the wrapper's rx ring is cross-linked with hwna's tx ring */
1953 kring = &na->rx_rings[ring_n];
1954 hw_kring = &hwna->tx_rings[ring_n];
1956 lim = kring->nkr_num_slots - 1;
1958 if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP))
1960 /* first step: simulate a user wakeup on the rx ring */
1961 netmap_vp_rxsync(na, ring_n, flags);
1962 ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1963 NM_IFPNAME(na->ifp), ring_n,
1964 kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1965 ring->head, ring->cur, ring->tail,
/* fix: was 'hw_ring->rtail' — no such variable is declared; the
 * matching PST trace below correctly uses hw_kring */
1966 hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1967 /* second step: the simulated user consumes all new packets */
1968 ring->head = ring->cur = ring->tail;
1970 /* third step: the new packets are sent on the tx ring
1971 * (which is actually the same ring)
1973 /* set tail to what the hw expects */
1974 ring->tail = hw_kring->rtail;
/* the last ring index is the host-stack ring */
1975 if (ring_n == na->num_rx_rings) {
1976 netmap_txsync_to_host(hwna);
1978 nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ?
1979 error = hwna->nm_txsync(hwna, ring_n, flags);
1982 /* fourth step: now we are back the rx ring */
1983 /* claim ownership on all hw owned bufs */
1984 ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */
1985 ring->tail = kring->rtail; /* restore saved value of tail, for safety */
1987 /* fifth step: the user goes to sleep again, causing another rxsync */
1988 netmap_vp_rxsync(na, ring_n, flags);
1989 ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1990 NM_IFPNAME(na->ifp), ring_n,
1991 kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1992 ring->head, ring->cur, ring->tail,
1993 hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
/*
 * Notify callback for the host-stack side of a bridge wrapper.
 * Host traffic only flows through rx ring 0; redirect the event to
 * the wrapper's host ring, whose index is num_rx_rings.
 */
2000 netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
2002 struct netmap_bwrap_adapter *bna = na->na_private;
2003 struct netmap_adapter *port_na = &bna->up.up;
/* ignore tx notifications and any ring other than 0 */
2004 if (tx == NR_TX || ring_n != 0)
2006 return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags);
2010 /* attach a bridge wrapper to the 'real' device */
/*
 * Allocate a bridge wrapper adapter for NIC 'real', exposing 'fake'
 * as the ifnet seen by the bridge. Ring counts and slot counts are
 * taken from the hardware adapter with tx/rx swapped; the actual
 * kring cross-linking happens later in netmap_bwrap_register().
 */
2012 netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
2014 struct netmap_bwrap_adapter *bna;
2015 struct netmap_adapter *na;
2016 struct netmap_adapter *hwna = NA(real);
2017 struct netmap_adapter *hostna;
2021 bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
2027 /* fill the ring data for the bwrap adapter with rx/tx meanings
2028 * swapped. The real cross-linking will be done during register,
2029 * when all the krings will have been created.
2031 na->num_rx_rings = hwna->num_tx_rings;
2032 na->num_tx_rings = hwna->num_rx_rings;
2033 na->num_tx_desc = hwna->num_rx_desc;
2034 na->num_rx_desc = hwna->num_tx_desc;
2035 na->nm_dtor = netmap_bwrap_dtor;
2036 na->nm_register = netmap_bwrap_register;
2037 // na->nm_txsync = netmap_bwrap_txsync;
2038 // na->nm_rxsync = netmap_bwrap_rxsync;
2039 na->nm_config = netmap_bwrap_config;
2040 na->nm_krings_create = netmap_bwrap_krings_create;
2041 na->nm_krings_delete = netmap_bwrap_krings_delete;
2042 na->nm_notify = netmap_bwrap_notify;
/* share the memory allocator with the wrapped adapter */
2043 na->nm_mem = hwna->nm_mem;
2044 na->na_private = na; /* prevent NIOCREGIF */
2045 bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
/* take a reference on hwna; released in netmap_bwrap_dtor() */
2048 netmap_adapter_get(hwna);
2049 hwna->na_private = bna; /* weak reference */
/* host-stack side: a single ring pair, sizes again swapped */
2051 hostna = &bna->host.up;
2052 hostna->ifp = hwna->ifp;
2053 hostna->num_tx_rings = 1;
2054 hostna->num_tx_desc = hwna->num_rx_desc;
2055 hostna->num_rx_rings = 1;
2056 hostna->num_rx_desc = hwna->num_tx_desc;
2057 // hostna->nm_txsync = netmap_bwrap_host_txsync;
2058 // hostna->nm_rxsync = netmap_bwrap_host_rxsync;
2059 hostna->nm_notify = netmap_bwrap_host_notify;
2060 hostna->nm_mem = na->nm_mem;
2061 hostna->na_private = bna;
2063 ND("%s<->%s txr %d txd %d rxr %d rxd %d",
2064 fake->if_xname, real->if_xname,
2065 na->num_tx_rings, na->num_tx_desc,
2066 na->num_rx_rings, na->num_rx_desc);
2068 error = netmap_attach_common(na);
/* on failure, drop the hwna reference and free the wrapper */
2070 netmap_adapter_put(hwna);
2071 free(bna, M_DEVBUF);
/*
 * One-time initialization of the static bridge array: zero it and
 * initialize each bridge's rw lock (see BDG_RWINIT).
 */
2079 netmap_init_bridges(void)
2082 bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
2083 for (i = 0; i < NM_BRIDGES; i++)
2084 BDG_RWINIT(&nm_bridges[i]);
2086 #endif /* WITH_VALE */