2 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * This module implements the VALE switch for netmap
32 NMG_LOCK() serializes all modifications to switches and ports.
33 A switch cannot be deleted until all ports are gone.
35 For each switch, an SX lock (RWlock on linux) protects
36 deletion of ports. When configuring or deleting a new port, the
37 lock is acquired in exclusive mode (after holding NMG_LOCK).
38 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
39 The lock is held throughout the entire forwarding cycle,
40 during which the thread may incur a page fault.
41 Hence it is important that sleepable shared locks are used.
43 On the rx ring, the per-port lock is grabbed initially to reserve
44 a number of slots in the ring, then the lock is released,
45 packets are copied from source to destination, and then
46 the lock is acquired again and the receive ring is updated.
47 (A similar thing is done on the tx ring for NIC and host stack
48 ports attached to the switch)
53 * OS-specific code that is used only within this file.
54 * Other OS-specific code that must be accessed by drivers
55 * is present in netmap_kern.h
58 #if defined(__FreeBSD__)
59 #include <sys/cdefs.h> /* prerequisite */
60 __FBSDID("$FreeBSD$");
62 #include <sys/types.h>
63 #include <sys/errno.h>
64 #include <sys/param.h> /* defines used in kernel.h */
65 #include <sys/kernel.h> /* types used in module initialization */
66 #include <sys/conf.h> /* cdevsw struct, UID, GID */
67 #include <sys/sockio.h>
68 #include <sys/socketvar.h> /* struct socket */
69 #include <sys/malloc.h>
71 #include <sys/rwlock.h>
72 #include <sys/socket.h> /* sockaddrs */
73 #include <sys/selinfo.h>
74 #include <sys/sysctl.h>
76 #include <net/if_var.h>
77 #include <net/bpf.h> /* BIOCIMMEDIATE */
78 #include <machine/bus.h> /* bus_dmamap_* */
79 #include <sys/endian.h>
80 #include <sys/refcount.h>
83 #define BDG_RWLOCK_T struct rwlock // struct rwlock
85 #define BDG_RWINIT(b) \
86 rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
87 #define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock)
88 #define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock)
89 #define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock)
90 #define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock)
91 #define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock)
92 #define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock)
99 #elif defined(__APPLE__)
101 #warning OSX support is only partial
102 #include "osx_glue.h"
106 #error Unsupported platform
108 #endif /* unsupported */
114 #include <net/netmap.h>
115 #include <dev/netmap/netmap_kern.h>
116 #include <dev/netmap/netmap_mem2.h>
121 * system parameters (most of them in netmap_kern.h)
122 * NM_NAME prefix for switch port names, default "vale"
123 * NM_BDG_MAXPORTS number of ports
124 * NM_BRIDGES max number of switches in the system.
125 * XXX should become a sysctl or tunable
127 * Switch ports are named valeX:Y where X is the switch name and Y
128 * is the port. If Y matches a physical interface name, the port is
129 * connected to a physical device.
131 * Unlike physical interfaces, switch ports use their own memory region
132 * for rings and buffers.
133 * The virtual interfaces use per-queue lock instead of core lock.
134 * In the tx loop, we aggregate traffic in batches to make all operations
135 * faster. The batch size is bridge_batch.
137 #define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */
138 #define NM_BDG_MAXSLOTS 4096 /* XXX same as above */
139 #define NM_BRIDGE_RINGSIZE 1024 /* in the device */
140 #define NM_BDG_HASH 1024 /* forwarding table entries */
141 #define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */
142 #define NM_MULTISEG 64 /* max size of a chain of bufs */
143 /* actual size of the tables */
144 #define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG)
145 /* NM_FT_NULL terminates a list of slots in the ft */
146 #define NM_FT_NULL NM_BDG_BATCH_MAX
147 #define NM_BRIDGES 8 /* number of bridges */
151 * bridge_batch is set via sysctl to the max batch size to be
152 * used in the bridge. The actual value may be larger as the
153 * last packet in the block may overflow the size.
155 int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
156 SYSCTL_DECL(_dev_netmap);
157 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , "");
160 static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp);
161 static int bdg_netmap_reg(struct netmap_adapter *na, int onoff);
162 static int netmap_bwrap_attach(struct ifnet *, struct ifnet *);
163 static int netmap_bwrap_register(struct netmap_adapter *, int onoff);
164 int kern_netmap_regif(struct nmreq *nmr);
167 * For each output interface, nm_bdg_q is used to construct a list.
168 * bq_len is the number of output buffers (we can have coalescing
174 uint32_t bq_len; /* number of buffers */
177 /* XXX revise this */
179 uint64_t mac; /* the top 2 bytes are the epoch */
184 * nm_bridge is a descriptor for a VALE switch.
185 * Interfaces for a bridge are all in bdg_ports[].
186 * The array has fixed size, an empty entry does not terminate
187 * the search, but lookups only occur on attach/detach so we
188 * don't mind if they are slow.
190 * The bridge is non blocking on the transmit ports: excess
191 * packets are dropped if there is no room on the output port.
193 * bdg_lock protects accesses to the bdg_ports array.
194 * This is a rw lock (or equivalent).
197 /* XXX what is the proper alignment/layout ? */
198 BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */
200 uint32_t bdg_active_ports; /* 0 means free */
201 char bdg_basename[IFNAMSIZ];
203 /* Indexes of active ports (up to active_ports)
204 * and all other remaining ports.
206 uint8_t bdg_port_index[NM_BDG_MAXPORTS];
208 struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];
212 * The function to decide the destination port.
213 * It returns either the index of the destination port,
214 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
215 * forward this packet. ring_nr is the source ring index, and the
216 * function may overwrite this value to forward this packet to a
217 * different ring index.
218 * This function must be set by netmap_bdgctl().
220 bdg_lookup_fn_t nm_bdg_lookup;
222 /* the forwarding table, MAC+ports.
223 * XXX should be changed to an argument to be passed to
224 * the lookup function, and allocated on attach
226 struct nm_hash_ent ht[NM_BDG_HASH];
231 * XXX in principle nm_bridges could be created dynamically
232 * Right now we have a static array and deletions are protected
233 * by an exclusive lock.
235 struct nm_bridge nm_bridges[NM_BRIDGES];
239 * this is a slightly optimized copy routine which rounds
240 * to multiple of 64 bytes and is often faster than dealing
241 * with other odd sizes. We assume there is enough room
242 * in the source and destination buffers.
244 * XXX only for multiples of 64 bytes, non overlapped.
247 pkt_copy(void *_src, void *_dst, int l)
249 uint64_t *src = _src;
250 uint64_t *dst = _dst;
251 if (unlikely(l >= 1024)) {
255 for (; likely(l > 0); l-=64) {
269 * locate a bridge among the existing ones.
270 * MUST BE CALLED WITH NMG_LOCK()
272 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
273 * We assume that this is called with a name of at least NM_NAME chars.
275 static struct nm_bridge *
276 nm_find_bridge(const char *name, int create)
279 struct nm_bridge *b = NULL;
283 namelen = strlen(NM_NAME); /* base length */
284 l = name ? strlen(name) : 0; /* actual length */
286 D("invalid bridge name %s", name ? name : NULL);
289 for (i = namelen + 1; i < l; i++) {
290 if (name[i] == ':') {
295 if (namelen >= IFNAMSIZ)
297 ND("--- prefix is '%.*s' ---", namelen, name);
299 /* lookup the name, remember empty slot if there is one */
300 for (i = 0; i < NM_BRIDGES; i++) {
301 struct nm_bridge *x = nm_bridges + i;
303 if (x->bdg_active_ports == 0) {
304 if (create && b == NULL)
305 b = x; /* record empty slot */
306 } else if (x->bdg_namelen != namelen) {
308 } else if (strncmp(name, x->bdg_basename, namelen) == 0) {
309 ND("found '%.*s' at %d", namelen, name, i);
314 if (i == NM_BRIDGES && b) { /* name not found, can create entry */
315 /* initialize the bridge */
316 strncpy(b->bdg_basename, name, namelen);
317 ND("create new bridge %s with ports %d", b->bdg_basename,
318 b->bdg_active_ports);
319 b->bdg_namelen = namelen;
320 b->bdg_active_ports = 0;
321 for (i = 0; i < NM_BDG_MAXPORTS; i++)
322 b->bdg_port_index[i] = i;
323 /* set the default function */
324 b->nm_bdg_lookup = netmap_bdg_learning;
325 /* reset the MAC address table */
326 bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
333 * Free the forwarding tables for rings attached to switch ports.
336 nm_free_bdgfwd(struct netmap_adapter *na)
339 struct netmap_kring *kring;
342 nrings = na->num_tx_rings;
343 kring = na->tx_rings;
344 for (i = 0; i < nrings; i++) {
345 if (kring[i].nkr_ft) {
346 free(kring[i].nkr_ft, M_DEVBUF);
347 kring[i].nkr_ft = NULL; /* protect from freeing twice */
354 * Allocate the forwarding tables for the rings attached to the bridge ports.
357 nm_alloc_bdgfwd(struct netmap_adapter *na)
359 int nrings, l, i, num_dstq;
360 struct netmap_kring *kring;
363 /* all port:rings + broadcast */
364 num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
365 l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
366 l += sizeof(struct nm_bdg_q) * num_dstq;
367 l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
369 nrings = netmap_real_tx_rings(na);
370 kring = na->tx_rings;
371 for (i = 0; i < nrings; i++) {
372 struct nm_bdg_fwd *ft;
373 struct nm_bdg_q *dstq;
376 ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
381 dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
382 for (j = 0; j < num_dstq; j++) {
383 dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
386 kring[i].nkr_ft = ft;
393 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
395 int s_hw = hw, s_sw = sw;
396 int i, lim =b->bdg_active_ports;
397 uint8_t tmp[NM_BDG_MAXPORTS];
401 make a copy of bdg_port_index;
402 lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
403 in the array of bdg_port_index, replacing them with
404 entries from the bottom of the array;
405 decrement bdg_active_ports;
406 acquire BDG_WLOCK() and copy back the array.
410 D("detach %d and %d (lim %d)", hw, sw, lim);
411 /* make a copy of the list of active ports, update it,
412 * and then copy back within BDG_WLOCK().
414 memcpy(tmp, b->bdg_port_index, sizeof(tmp));
415 for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
416 if (hw >= 0 && tmp[i] == hw) {
417 ND("detach hw %d at %d", hw, i);
418 lim--; /* point to last active port */
419 tmp[i] = tmp[lim]; /* swap with i */
420 tmp[lim] = hw; /* now this is inactive */
422 } else if (sw >= 0 && tmp[i] == sw) {
423 ND("detach sw %d at %d", sw, i);
432 if (hw >= 0 || sw >= 0) {
433 D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
437 b->bdg_ports[s_hw] = NULL;
439 b->bdg_ports[s_sw] = NULL;
441 memcpy(b->bdg_port_index, tmp, sizeof(tmp));
442 b->bdg_active_ports = lim;
445 ND("now %d active ports", lim);
447 ND("marking bridge %s as free", b->bdg_basename);
448 b->nm_bdg_lookup = NULL;
454 netmap_adapter_vp_dtor(struct netmap_adapter *na)
456 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
457 struct nm_bridge *b = vpna->na_bdg;
458 struct ifnet *ifp = na->ifp;
460 ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount);
463 netmap_bdg_detach_common(b, vpna->bdg_port, -1);
466 bzero(ifp, sizeof(*ifp));
472 /* Try to get a reference to a netmap adapter attached to a VALE switch.
473 * If the adapter is found (or is created), this function returns 0, a
474 * non NULL pointer is returned into *na, and the caller holds a
475 * reference to the adapter.
476 * If an adapter is not found, then no reference is grabbed and the
477 * function returns an error code, or 0 if there is just a VALE prefix
478 * mismatch. Therefore the caller holds a reference when
479 * (*na != NULL && return == 0).
482 netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
484 const char *name = nmr->nr_name;
487 struct netmap_adapter *ret;
488 struct netmap_vp_adapter *vpna;
490 int i, j, cand = -1, cand2 = -1;
493 *na = NULL; /* default return value */
495 /* first try to see if this is a bridge port. */
497 if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
498 return 0; /* no error, but no VALE prefix */
501 b = nm_find_bridge(name, create);
503 D("no bridges available for '%s'", name);
504 return (create ? ENOMEM : ENXIO);
507 /* Now we are sure that name starts with the bridge's name,
508 * lookup the port in the bridge. We need to scan the entire
509 * list. It is not important to hold a WLOCK on the bridge
510 * during the search because NMG_LOCK already guarantees
511 * that there are no other possible writers.
514 /* lookup in the local list of ports */
515 for (j = 0; j < b->bdg_active_ports; j++) {
516 i = b->bdg_port_index[j];
517 vpna = b->bdg_ports[i];
518 // KASSERT(na != NULL);
520 /* XXX make sure the name only contains one : */
521 if (!strcmp(NM_IFPNAME(ifp), name)) {
522 netmap_adapter_get(&vpna->up);
523 ND("found existing if %s refs %d", name,
524 vpna->na_bdg_refcount);
525 *na = (struct netmap_adapter *)vpna;
529 /* not found, should we create it? */
532 /* yes we should, see if we have space to attach entries */
533 needed = 2; /* in some cases we only need 1 */
534 if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
535 D("bridge full %d, cannot create new port", b->bdg_active_ports);
538 /* record the next two ports available, but do not allocate yet */
539 cand = b->bdg_port_index[b->bdg_active_ports];
540 cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
541 ND("+++ bridge %s port %s used %d avail %d %d",
542 b->bdg_basename, name, b->bdg_active_ports, cand, cand2);
545 * try to see if there is a matching NIC with this name
546 * (after the bridge's name)
548 ifp = ifunit_ref(name + b->bdg_namelen + 1);
549 if (!ifp) { /* this is a virtual port */
551 /* nr_cmd must be 0 for a virtual port */
555 /* create a struct ifnet for the new port.
556 * need M_NOWAIT as we are under nma_lock
558 ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
562 strcpy(ifp->if_xname, name);
563 /* bdg_netmap_attach creates a struct netmap_adapter */
564 error = bdg_netmap_attach(nmr, ifp);
566 D("error %d", error);
571 cand2 = -1; /* only need one port */
572 } else { /* this is a NIC */
573 struct ifnet *fake_ifp;
575 error = netmap_get_hw_na(ifp, &ret);
576 if (error || ret == NULL)
579 /* make sure the NIC is not already in use */
580 if (NETMAP_OWNED_BY_ANY(ret)) {
581 D("NIC %s busy, cannot attach to bridge",
586 /* create a fake interface */
587 fake_ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
592 strcpy(fake_ifp->if_xname, name);
593 error = netmap_bwrap_attach(fake_ifp, ifp);
595 free(fake_ifp, M_DEVBUF);
599 if (nmr->nr_arg1 != NETMAP_BDG_HOST)
600 cand2 = -1; /* only need one port */
603 vpna = (struct netmap_vp_adapter *)ret;
606 vpna->bdg_port = cand;
607 ND("NIC %p to bridge port %d", vpna, cand);
608 /* bind the port to the bridge (virtual ports are not active) */
609 b->bdg_ports[cand] = vpna;
611 b->bdg_active_ports++;
613 struct netmap_vp_adapter *hostna = vpna + 1;
614 /* also bind the host stack to the bridge */
615 b->bdg_ports[cand2] = hostna;
616 hostna->bdg_port = cand2;
618 b->bdg_active_ports++;
619 ND("host %p to bridge port %d", hostna, cand2);
621 ND("if %s refs %d", name, vpna->up.na_refcount);
624 netmap_adapter_get(ret);
634 /* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
636 nm_bdg_attach(struct nmreq *nmr)
638 struct netmap_adapter *na;
639 struct netmap_if *nifp;
640 struct netmap_priv_d *npriv;
641 struct netmap_bwrap_adapter *bna;
644 npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
650 error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
651 if (error) /* no device, or another bridge or user owns the device */
654 if (na == NULL) { /* VALE prefix missing */
659 if (na->active_fds > 0) { /* already registered */
664 nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error);
669 bna = (struct netmap_bwrap_adapter*)na;
670 bna->na_kpriv = npriv;
672 ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp));
676 netmap_adapter_put(na);
679 bzero(npriv, sizeof(*npriv));
680 free(npriv, M_DEVBUF);
686 nm_bdg_detach(struct nmreq *nmr)
688 struct netmap_adapter *na;
690 struct netmap_bwrap_adapter *bna;
694 error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
695 if (error) { /* no device, or another bridge or user owns the device */
699 if (na == NULL) { /* VALE prefix missing */
704 bna = (struct netmap_bwrap_adapter *)na;
706 if (na->active_fds == 0) { /* not registered */
711 last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */
712 if (!last_instance) {
713 D("--- error, trying to detach an entry with active mmaps");
716 struct netmap_priv_d *npriv = bna->na_kpriv;
718 bna->na_kpriv = NULL;
721 bzero(npriv, sizeof(*npriv));
722 free(npriv, M_DEVBUF);
726 netmap_adapter_put(na);
734 /* exported to kernel callers, e.g. OVS ?
736 * Called without NMG_LOCK.
739 netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
742 struct netmap_adapter *na;
743 struct netmap_vp_adapter *vpna;
745 char *name = nmr->nr_name;
746 int cmd = nmr->nr_cmd, namelen = strlen(name);
750 case NETMAP_BDG_ATTACH:
751 error = nm_bdg_attach(nmr);
754 case NETMAP_BDG_DETACH:
755 error = nm_bdg_detach(nmr);
758 case NETMAP_BDG_LIST:
759 /* this is used to enumerate bridges and ports */
760 if (namelen) { /* look up indexes of bridge and port */
761 if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
766 b = nm_find_bridge(name, 0 /* don't create */);
774 for (j = 0; j < b->bdg_active_ports; j++) {
775 i = b->bdg_port_index[j];
776 vpna = b->bdg_ports[i];
778 D("---AAAAAAAAARGH-------");
782 /* the former and the latter identify a
783 * virtual port and a NIC, respectively
785 if (!strcmp(iter->if_xname, name)) {
787 nmr->nr_arg1 = b - nm_bridges;
788 nmr->nr_arg2 = i; /* port index */
795 /* return the first non-empty entry starting from
796 * bridge nr_arg1 and port nr_arg2.
798 * Users can detect the end of the same bridge by
799 * seeing the new and old value of nr_arg1, and can
800 * detect the end of all the bridges by error != 0
806 for (error = ENOENT; i < NM_BRIDGES; i++) {
808 if (j >= b->bdg_active_ports) {
809 j = 0; /* following bridges scan from 0 */
814 j = b->bdg_port_index[j];
815 vpna = b->bdg_ports[j];
817 strncpy(name, iter->if_xname, (size_t)IFNAMSIZ);
825 case NETMAP_BDG_LOOKUP_REG:
826 /* register a lookup function to the given bridge.
827 * nmr->nr_name may be just bridge's name (including ':'
828 * if it is not just NM_NAME).
835 b = nm_find_bridge(name, 0 /* don't create */);
839 b->nm_bdg_lookup = func;
844 case NETMAP_BDG_VNET_HDR:
845 /* Valid lengths for the virtio-net header are 0 (no header),
847 if (nmr->nr_arg1 != 0 &&
848 nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
849 nmr->nr_arg1 != 12) {
854 error = netmap_get_bdg_na(nmr, &na, 0);
856 vpna = (struct netmap_vp_adapter *)na;
857 vpna->virt_hdr_len = nmr->nr_arg1;
858 if (vpna->virt_hdr_len)
859 vpna->mfs = NETMAP_BDG_BUF_SIZE(na->nm_mem);
860 D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna);
861 netmap_adapter_put(na);
867 D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
875 netmap_vp_krings_create(struct netmap_adapter *na)
880 u_int nrx = netmap_real_rx_rings(na);
883 * Leases are attached to RX rings on vale ports
885 tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
887 error = netmap_krings_create(na, tailroom);
891 leases = na->tailroom;
893 for (i = 0; i < nrx; i++) { /* Receive rings */
894 na->rx_rings[i].nkr_leases = leases;
895 leases += na->num_rx_desc;
898 error = nm_alloc_bdgfwd(na);
900 netmap_krings_delete(na);
909 netmap_vp_krings_delete(struct netmap_adapter *na)
912 netmap_krings_delete(na);
917 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
918 struct netmap_vp_adapter *na, u_int ring_nr);
922 * Grab packets from a kring, move them into the ft structure
923 * associated to the tx (input) port. Max one instance per port,
924 * filtered on input (ioctl, poll or XXX).
925 * Returns the next position in the ring.
928 nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr,
929 struct netmap_kring *kring, u_int end)
931 struct netmap_ring *ring = kring->ring;
932 struct nm_bdg_fwd *ft;
933 u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
934 u_int ft_i = 0; /* start from 0 */
935 u_int frags = 1; /* how many frags ? */
936 struct nm_bridge *b = na->na_bdg;
938 /* To protect against modifications to the bridge we acquire a
939 * shared lock, waiting if we can sleep (if the source port is
940 * attached to a user process) or with a trylock otherwise (NICs).
942 ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
943 if (na->up.na_flags & NAF_BDG_MAYSLEEP)
945 else if (!BDG_RTRYLOCK(b))
947 ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
950 for (; likely(j != end); j = nm_next(j, lim)) {
951 struct netmap_slot *slot = &ring->slot[j];
954 ft[ft_i].ft_len = slot->len;
955 ft[ft_i].ft_flags = slot->flags;
957 ND("flags is 0x%x", slot->flags);
958 /* this slot goes into a list so initialize the link field */
959 ft[ft_i].ft_next = NM_FT_NULL;
960 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
961 (void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot);
962 __builtin_prefetch(buf);
964 if (slot->flags & NS_MOREFRAG) {
968 if (unlikely(netmap_verbose && frags > 1))
969 RD(5, "%d frags at %d", frags, ft_i - frags);
970 ft[ft_i - frags].ft_frags = frags;
972 if (unlikely((int)ft_i >= bridge_batch))
973 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
976 D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
977 // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG
978 ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG;
979 ft[ft_i - frags].ft_frags = frags - 1;
982 ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
988 /* ----- FreeBSD if_bridge hash function ------- */
991 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
992 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
994 * http://www.burtleburtle.net/bob/hash/spooky.html
996 #define mix(a, b, c) \
998 a -= b; a -= c; a ^= (c >> 13); \
999 b -= c; b -= a; b ^= (a << 8); \
1000 c -= a; c -= b; c ^= (b >> 13); \
1001 a -= b; a -= c; a ^= (c >> 12); \
1002 b -= c; b -= a; b ^= (a << 16); \
1003 c -= a; c -= b; c ^= (b >> 5); \
1004 a -= b; a -= c; a ^= (c >> 3); \
1005 b -= c; b -= a; b ^= (a << 10); \
1006 c -= a; c -= b; c ^= (b >> 15); \
1007 } while (/*CONSTCOND*/0)
1010 static __inline uint32_t
1011 nm_bridge_rthash(const uint8_t *addr)
1013 uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key
1023 #define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1)
1024 return (c & BRIDGE_RTHASH_MASK);
1031 bdg_netmap_reg(struct netmap_adapter *na, int onoff)
1033 struct netmap_vp_adapter *vpna =
1034 (struct netmap_vp_adapter*)na;
1035 struct ifnet *ifp = na->ifp;
1037 /* the interface is already attached to the bridge,
1038 * so we only need to toggle IFCAP_NETMAP.
1040 BDG_WLOCK(vpna->na_bdg);
1042 ifp->if_capenable |= IFCAP_NETMAP;
1044 ifp->if_capenable &= ~IFCAP_NETMAP;
1046 BDG_WUNLOCK(vpna->na_bdg);
1052 * Lookup function for a learning bridge.
1053 * Update the hash table with the source address,
1054 * and then returns the destination port index, and the
1055 * ring in *dst_ring (at the moment, always use ring 0)
1058 netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
1059 struct netmap_vp_adapter *na)
1061 struct nm_hash_ent *ht = na->na_bdg->ht;
1063 u_int dst, mysrc = na->bdg_port;
1064 uint64_t smac, dmac;
1067 D("invalid buf length %d", buf_len);
1068 return NM_BDG_NOPORT;
1070 dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
1071 smac = le64toh(*(uint64_t *)(buf + 4));
1075 * The hash is somewhat expensive, there might be some
1076 * worthwhile optimizations here.
1078 if ((buf[6] & 1) == 0) { /* valid src */
1080 sh = nm_bridge_rthash(s); // XXX hash of source
1081 /* update source port forwarding entry */
1082 ht[sh].mac = smac; /* XXX expire ? */
1083 ht[sh].ports = mysrc;
1085 D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
1086 s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
1088 dst = NM_BDG_BROADCAST;
1089 if ((buf[0] & 1) == 0) { /* unicast */
1090 dh = nm_bridge_rthash(buf); // XXX hash of dst
1091 if (ht[dh].mac == dmac) { /* found dst */
1094 /* XXX otherwise return NM_BDG_UNKNOWN ? */
1102 * Available space in the ring. Only used in VALE code
1103 * and only with is_rx = 1
1105 static inline uint32_t
1106 nm_kr_space(struct netmap_kring *k, int is_rx)
1111 int busy = k->nkr_hwlease - k->nr_hwcur;
1113 busy += k->nkr_num_slots;
1114 space = k->nkr_num_slots - 1 - busy;
1116 /* XXX never used in this branch */
1117 space = k->nr_hwtail - k->nkr_hwlease;
1119 space += k->nkr_num_slots;
1123 if (k->nkr_hwlease >= k->nkr_num_slots ||
1124 k->nr_hwcur >= k->nkr_num_slots ||
1125 k->nr_tail >= k->nkr_num_slots ||
1127 busy >= k->nkr_num_slots) {
1128 D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1129 k->nkr_lease_idx, k->nkr_num_slots);
1138 /* make a lease on the kring for N positions. return the
1140 * XXX only used in VALE code and with is_rx = 1
1142 static inline uint32_t
1143 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
1145 uint32_t lim = k->nkr_num_slots - 1;
1146 uint32_t lease_idx = k->nkr_lease_idx;
1148 k->nkr_leases[lease_idx] = NR_NOSLOT;
1149 k->nkr_lease_idx = nm_next(lease_idx, lim);
1151 if (n > nm_kr_space(k, is_rx)) {
1152 D("invalid request for %d slots", n);
1155 /* XXX verify that there are n slots */
1156 k->nkr_hwlease += n;
1157 if (k->nkr_hwlease > lim)
1158 k->nkr_hwlease -= lim + 1;
1160 if (k->nkr_hwlease >= k->nkr_num_slots ||
1161 k->nr_hwcur >= k->nkr_num_slots ||
1162 k->nr_hwtail >= k->nkr_num_slots ||
1163 k->nkr_lease_idx >= k->nkr_num_slots) {
1164 D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
1165 k->na->ifp->if_xname,
1166 k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1167 k->nkr_lease_idx, k->nkr_num_slots);
1173 * This flush routine supports only unicast and broadcast but a large
1174 * number of ports, and lets us replace the learn and dispatch functions.
1177 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
1180 struct nm_bdg_q *dst_ents, *brddst;
1181 uint16_t num_dsts = 0, *dsts;
1182 struct nm_bridge *b = na->na_bdg;
1183 u_int i, j, me = na->bdg_port;
1186 * The work area (pointed by ft) is followed by an array of
1187 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS
1188 * queues per port plus one for the broadcast traffic.
1189 * Then we have an array of destination indexes.
1191 dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
1192 dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
1194 /* first pass: find a destination for each packet in the batch */
1195 for (i = 0; likely(i < n); i += ft[i].ft_frags) {
1196 uint8_t dst_ring = ring_nr; /* default, same ring as origin */
1197 uint16_t dst_port, d_i;
1199 uint8_t *buf = ft[i].ft_buf;
1200 u_int len = ft[i].ft_len;
1202 ND("slot %d frags %d", i, ft[i].ft_frags);
1203 /* Drop the packet if the virtio-net header is not in the first
1204 fragment nor at the very beginning of the second. */
1205 if (unlikely(na->virt_hdr_len > len))
1207 if (len == na->virt_hdr_len) {
1208 buf = ft[i+1].ft_buf;
1209 len = ft[i+1].ft_len;
1211 buf += na->virt_hdr_len;
1212 len -= na->virt_hdr_len;
1214 dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na);
1215 if (netmap_verbose > 255)
1216 RD(5, "slot %d port %d -> %d", i, me, dst_port);
1217 if (dst_port == NM_BDG_NOPORT)
1218 continue; /* this packet is identified to be dropped */
1219 else if (unlikely(dst_port > NM_BDG_MAXPORTS))
1221 else if (dst_port == NM_BDG_BROADCAST)
1222 dst_ring = 0; /* broadcasts always go to ring 0 */
1223 else if (unlikely(dst_port == me ||
1224 !b->bdg_ports[dst_port]))
1227 /* get a position in the scratch pad */
1228 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
1231 /* append the first fragment to the list */
1232 if (d->bq_head == NM_FT_NULL) { /* new destination */
1233 d->bq_head = d->bq_tail = i;
1234 /* remember this position to be scanned later */
1235 if (dst_port != NM_BDG_BROADCAST)
1236 dsts[num_dsts++] = d_i;
1238 ft[d->bq_tail].ft_next = i;
1241 d->bq_len += ft[i].ft_frags;
1245 * Broadcast traffic goes to ring 0 on all destinations.
1246 * So we need to add these rings to the list of ports to scan.
1247 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
1248 * expensive. We should keep a compact list of active destinations
1249 * so we could shorten this loop.
1251 brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
1252 if (brddst->bq_head != NM_FT_NULL) {
1253 for (j = 0; likely(j < b->bdg_active_ports); j++) {
1255 i = b->bdg_port_index[j];
1256 if (unlikely(i == me))
1258 d_i = i * NM_BDG_MAXRINGS;
1259 if (dst_ents[d_i].bq_head == NM_FT_NULL)
1260 dsts[num_dsts++] = d_i;
1264 ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
1265 /* second pass: scan destinations (XXX will be modular somehow) */
1266 for (i = 0; i < num_dsts; i++) {
1267 struct ifnet *dst_ifp;
1268 struct netmap_vp_adapter *dst_na;
1269 struct netmap_kring *kring;
1270 struct netmap_ring *ring;
1271 u_int dst_nr, lim, j, d_i, next, brd_next;
1272 u_int needed, howmany;
1273 int retry = netmap_txsync_retry;
1275 uint32_t my_start = 0, lease_idx = 0;
1277 int virt_hdr_mismatch = 0;
1280 ND("second pass %d port %d", i, d_i);
1282 // XXX fix the division
1283 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
1284 /* protect from the lookup function returning an inactive
1287 if (unlikely(dst_na == NULL))
1289 if (dst_na->up.na_flags & NAF_SW_ONLY)
1291 dst_ifp = dst_na->up.ifp;
1293 * The interface may be in !netmap mode in two cases:
1294 * - when na is attached but not activated yet;
1295 * - when na is being deactivated but is still attached.
1297 if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
1298 ND("not in netmap mode!");
1302 /* there is at least one either unicast or broadcast packet */
1303 brd_next = brddst->bq_head;
1305 /* we need to reserve this many slots. If fewer are
1306 * available, some packets will be dropped.
1307 * Packets may have multiple fragments, so
1308 * there is a chance that we may not use all of the slots
1309 * we have claimed, so we will need to handle the leftover
1310 * ones when we regain the lock.
1312 needed = d->bq_len + brddst->bq_len;
1314 if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) {
1315 /* There is a virtio-net header/offloadings mismatch between
1316 * source and destination. The slower mismatch datapath will
1317 * be used to cope with all the mismatches.
1319 virt_hdr_mismatch = 1;
1320 if (dst_na->mfs < na->mfs) {
1321 /* We may need to do segmentation offloadings, and so
1322 * we may need a number of destination slots greater
1323 * than the number of input slots ('needed').
1324 * We look for the smallest integer 'x' which satisfies:
1325 * needed * na->mfs + x * H <= x * na->mfs
1326 * where 'H' is the length of the longest header that may
1327 * be replicated in the segmentation process (e.g. for
1328 * TCPv4 we must account for ethernet header, IP header
1329 * and TCPv4 header).
1331 needed = (needed * na->mfs) /
1332 (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
1333 ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
1337 ND(5, "pass 2 dst %d is %x %s",
1338 i, d_i, is_vp ? "virtual" : "nic/host");
1339 dst_nr = d_i & (NM_BDG_MAXRINGS-1);
1340 nrings = dst_na->up.num_rx_rings;
1341 if (dst_nr >= nrings)
1342 dst_nr = dst_nr % nrings;
1343 kring = &dst_na->up.rx_rings[dst_nr];
1345 lim = kring->nkr_num_slots - 1;
1349 if (dst_na->retry && retry) {
1350 /* try to get some free slot from the previous run */
1351 dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
1353 /* reserve the buffers in the queue and an entry
1354 * to report completion, and drop lock.
1355 * XXX this might become a helper function.
1357 mtx_lock(&kring->q_lock);
1358 if (kring->nkr_stopped) {
1359 mtx_unlock(&kring->q_lock);
1362 my_start = j = kring->nkr_hwlease;
1363 howmany = nm_kr_space(kring, 1);
1364 if (needed < howmany)
1366 lease_idx = nm_kr_lease(kring, howmany, 1);
1367 mtx_unlock(&kring->q_lock);
1369 /* only retry if we need more than available slots */
1370 if (retry && needed <= howmany)
1373 /* copy to the destination queue */
1374 while (howmany > 0) {
1375 struct netmap_slot *slot;
1376 struct nm_bdg_fwd *ft_p, *ft_end;
1379 /* find the queue from which we pick next packet.
1380 * NM_FT_NULL is always higher than valid indexes
1381 * so we never dereference it if the other list
1382 * has packets (and if both are empty we never
1385 if (next < brd_next) {
1387 next = ft_p->ft_next;
1388 } else { /* insert broadcast */
1389 ft_p = ft + brd_next;
1390 brd_next = ft_p->ft_next;
1392 cnt = ft_p->ft_frags; // cnt > 0
1393 if (unlikely(cnt > howmany))
1394 break; /* no more space */
1395 if (netmap_verbose && cnt > 1)
1396 RD(5, "rx %d frags to %d", cnt, j);
1397 ft_end = ft_p + cnt;
1398 if (unlikely(virt_hdr_mismatch)) {
1399 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
1403 char *dst, *src = ft_p->ft_buf;
1404 size_t copy_len = ft_p->ft_len, dst_len = copy_len;
1406 slot = &ring->slot[j];
1407 dst = BDG_NMB(&dst_na->up, slot);
1409 ND("send [%d] %d(%d) bytes at %s:%d",
1410 i, (int)copy_len, (int)dst_len,
1411 NM_IFPNAME(dst_ifp), j);
1412 /* round to a multiple of 64 */
1413 copy_len = (copy_len + 63) & ~63;
1415 if (ft_p->ft_flags & NS_INDIRECT) {
1416 if (copyin(src, dst, copy_len)) {
1417 // invalid user pointer, pretend len is 0
1421 //memcpy(dst, src, copy_len);
1422 pkt_copy(src, dst, (int)copy_len);
1424 slot->len = dst_len;
1425 slot->flags = (cnt << 8)| NS_MOREFRAG;
1426 j = nm_next(j, lim);
1429 } while (ft_p != ft_end);
1430 slot->flags = (cnt << 8); /* clear flag on last entry */
1433 if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
1437 /* current position */
1438 uint32_t *p = kring->nkr_leases; /* shorthand */
1439 uint32_t update_pos;
1440 int still_locked = 1;
1442 mtx_lock(&kring->q_lock);
1443 if (unlikely(howmany > 0)) {
1444 /* not used all bufs. If i am the last one
1445 * i can recover the slots, otherwise must
1446 * fill them with 0 to mark empty packets.
1448 ND("leftover %d bufs", howmany);
1449 if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
1450 /* yes i am the last one */
1451 ND("roll back nkr_hwlease to %d", j);
1452 kring->nkr_hwlease = j;
1454 while (howmany-- > 0) {
1455 ring->slot[j].len = 0;
1456 ring->slot[j].flags = 0;
1457 j = nm_next(j, lim);
1461 p[lease_idx] = j; /* report I am done */
1463 update_pos = kring->nr_hwtail;
1465 if (my_start == update_pos) {
1466 /* all slots before my_start have been reported,
1467 * so scan subsequent leases to see if other ranges
1468 * have been completed, and do a selwakeup or txsync.
1470 while (lease_idx != kring->nkr_lease_idx &&
1471 p[lease_idx] != NR_NOSLOT) {
1473 p[lease_idx] = NR_NOSLOT;
1474 lease_idx = nm_next(lease_idx, lim);
1476 /* j is the new 'write' position. j != my_start
1477 * means there are new buffers to report
1479 if (likely(j != my_start)) {
1480 kring->nr_hwtail = j;
1482 mtx_unlock(&kring->q_lock);
1483 dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
1484 if (dst_na->retry && retry--)
1489 mtx_unlock(&kring->q_lock);
1492 d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
1495 brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
// Transmit-side sync for VALE port na, tx ring ring_nr.
// Slots up to kring->rcur are handed to nm_bdg_preflush(); the value it
// returns (done) becomes the new nr_hwcur of the kring.
//   na      - virtual port adapter owning the ring
//   ring_nr - index into na->up.tx_rings
//   flags   - only used by the trace message at the end of the visible code
1502 netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags)
1504 struct netmap_kring *kring = &na->up.tx_rings[ring_nr];
1506 u_int const lim = kring->nkr_num_slots - 1;
1507 u_int const cur = kring->rcur;
// A non-positive bridge_batch skips forwarding altogether and reports
// every slot up to cur as consumed.
1509 if (bridge_batch <= 0) { /* testing only */
1510 done = cur; // used all
// Cap the batch size at NM_BDG_BATCH. bridge_batch is not declared in
// this function (presumably a file-level tunable - verify), so the
// clamped value would persist after the call.
1513 if (bridge_batch > NM_BDG_BATCH)
1514 bridge_batch = NM_BDG_BATCH;
1516 done = nm_bdg_preflush(na, ring_nr, kring, cur);
// NOTE(review): the condition guarding this message is not visible in
// this excerpt; the text suggests it fires when done stops short of cur.
1519 D("early break at %d/ %d, tail %d", done, cur, kring->nr_hwtail);
1521 * packets between 'done' and 'cur' are left unsent.
// Publish progress: hwcur moves to done and hwtail is placed one slot
// behind it (nm_prev), then the generic finalize step runs.
1523 kring->nr_hwcur = done;
1524 kring->nr_hwtail = nm_prev(done, lim);
1525 nm_txsync_finalize(kring);
1527 D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags);
// bdg_netmap_txsync: the nm_txsync callback that bdg_netmap_attach()
// installs on VALE ports. It views the generic adapter as a
// netmap_vp_adapter and returns whatever netmap_vp_txsync() returns.
// The original header comment follows.
1533 * main dispatch routine for the bridge.
1534 * We already know that only one thread is running this.
1535 * we must run nm_bdg_preflush without lock.
1538 bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
// The cast assumes na is the leading member of a netmap_vp_adapter
// (the field is reached as vpna->up elsewhere in this file).
1540 struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
1541 return netmap_vp_txsync(vpna, ring_nr, flags);
// Receive-side sync for rx ring ring_nr of adapter na.
// Both callers visible in this file (bdg_netmap_rxsync and
// netmap_bwrap_notify) take kring->q_lock around the call.
// Nothing is imported here: the function only walks the slots between
// nr_hwcur and the head returned by nm_rxsync_prologue(), then finalizes.
1545 netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
1547 struct netmap_kring *kring = &na->rx_rings[ring_nr];
1548 struct netmap_ring *ring = kring->ring;
1549 u_int nm_i, lim = kring->nkr_num_slots - 1;
1550 u_int head = nm_rxsync_prologue(kring);
// NOTE(review): the test that leads to this reset is not visible in
// this excerpt; presumably it checks the value returned by the
// prologue - confirm against the full source.
1554 D("ouch dangerous reset!!!");
1555 n = netmap_ring_reinit(kring);
1559 /* First part, import newly received packets. */
1560 /* actually nothing to do here, they are already in the kring */
1562 /* Second part, skip past packets that userspace has released. */
1563 nm_i = kring->nr_hwcur;
1565 /* consistency check, but nothing really important here */
// n counts the slots visited; each one has NS_BUF_CHANGED cleared.
1566 for (n = 0; likely(nm_i != head); n++) {
1567 struct netmap_slot *slot = &ring->slot[nm_i];
1568 void *addr = BDG_NMB(na, slot);
// A lookup that yields netmap_buffer_base is treated as a bad buffer
// index and only logged.
1570 if (addr == netmap_buffer_base) { /* bad buf */
1571 D("bad buffer index %d, ignore ?",
1574 slot->flags &= ~NS_BUF_CHANGED;
1575 nm_i = nm_next(nm_i, lim);
// Everything up to head is now considered released.
1577 kring->nr_hwcur = head;
1580 /* tell userspace that there are new packets */
1581 nm_rxsync_finalize(kring);
// bdg_netmap_rxsync: the nm_rxsync callback that bdg_netmap_attach()
// installs on VALE ports. It runs netmap_vp_rxsync() on rx ring ring_nr
// while holding the q_lock mutex of that kring, and stores the result
// in n. The original header comment follows.
1588 * user process reading from a VALE switch.
1589 * Already protected against concurrent calls from userspace,
1590 * but we must acquire the queue's lock to protect against
1591 * writers on the same queue.
1594 bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
1596 struct netmap_kring *kring = &na->rx_rings[ring_nr];
1599 mtx_lock(&kring->q_lock);
1600 n = netmap_vp_rxsync(na, ring_nr, flags);
1601 mtx_unlock(&kring->q_lock);
// Create the netmap_vp_adapter for a VALE port.
// Steps visible here: allocate the adapter, sanitize the ring, slot,
// pipe and extra-buffer counts requested in nmr (writing the accepted
// values back into nmr), install the port callbacks, create a private
// memory allocator and finish with netmap_attach_common().
//   nmr - request carrying the desired geometry; updated in place
//   ifp - interface for the port (NOTE(review): where it is stored in
//         the adapter is not visible in this excerpt)
1607 bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
1609 struct netmap_vp_adapter *vpna;
1610 struct netmap_adapter *na;
// M_NOWAIT | M_ZERO: non-sleeping allocation of zeroed memory.
// NOTE(review): the check for a NULL result is not visible here.
1614 vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO);
1622 /* bound checking */
// Ring counts are bounded with nm_bound_var(..., 1, 1, NM_BDG_MAXRINGS)
// and the resulting values are written back to the request.
1623 na->num_tx_rings = nmr->nr_tx_rings;
1624 nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1625 nmr->nr_tx_rings = na->num_tx_rings; // write back
1626 na->num_rx_rings = nmr->nr_rx_rings;
1627 nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1628 nmr->nr_rx_rings = na->num_rx_rings; // write back
// Slot counts are bounded directly inside the request, then copied.
1629 nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
1630 1, NM_BDG_MAXSLOTS, NULL);
1631 na->num_tx_desc = nmr->nr_tx_slots;
1632 nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
1633 1, NM_BDG_MAXSLOTS, NULL);
1634 /* validate number of pipes. We want at least 1,
1635 * but probably can do with some more.
1636 * So let's use 2 as default (when 0 is supplied)
1638 npipes = nmr->nr_arg1;
1639 nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
1640 nmr->nr_arg1 = npipes; /* write back */
1641 /* validate extra bufs */
1642 nm_bound_var(&nmr->nr_arg3, 0, 0,
1643 128*NM_BDG_MAXSLOTS, NULL);
1644 na->num_rx_desc = nmr->nr_rx_slots;
1645 vpna->virt_hdr_len = 0;
1647 /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero??
1648 vpna->mfs = netmap_buf_size; */
1650 D("max frame size %u", vpna->mfs);
// Callbacks and flags that make this adapter a VALE port.
1652 na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
1653 na->nm_txsync = bdg_netmap_txsync;
1654 na->nm_rxsync = bdg_netmap_rxsync;
1655 na->nm_register = bdg_netmap_reg;
1656 na->nm_dtor = netmap_adapter_vp_dtor;
1657 na->nm_krings_create = netmap_vp_krings_create;
1658 na->nm_krings_delete = netmap_vp_krings_delete;
// The private allocator is sized from the geometry accepted above,
// plus the extra buffers (nr_arg3) and the pipe count.
1659 na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp),
1660 na->num_tx_rings, na->num_tx_desc,
1661 na->num_rx_rings, na->num_rx_desc,
1662 nmr->nr_arg3, npipes, &error);
1663 if (na->nm_mem == NULL)
1665 /* other nmd fields are set in the common routine */
1666 error = netmap_attach_common(na);
// Cleanup: release the allocator if one was created, then the adapter.
// NOTE(review): presumably reached only on failure; the label and the
// branches leading here are not visible in this excerpt.
1672 if (na->nm_mem != NULL)
1673 netmap_mem_private_delete(na->nm_mem);
1674 free(vpna, M_DEVBUF);
// Destructor of a bridge wrapper; netmap_bwrap_attach() installs it as
// nm_dtor. It detaches the wrapper port (and the host port, if any) from
// the bridge, breaks the link with the wrapped hardware adapter and
// frees the ifnet found in na->ifp.
1680 netmap_bwrap_dtor(struct netmap_adapter *na)
1682 struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
1683 struct netmap_adapter *hwna = bna->hwna;
1684 struct nm_bridge *b = bna->up.na_bdg,
1685 *bh = bna->host.na_bdg;
1686 struct ifnet *ifp = na->ifp;
// The second port index is -1 when the host side is not attached to a
// bridge (bh is NULL).
// NOTE(review): any guard around this call is not visible here.
1691 netmap_bdg_detach_common(b, bna->up.bdg_port,
1692 (bh ? bna->host.bdg_port : -1));
// Mirror of netmap_bwrap_attach(): clear the back-pointer stored in
// hwna->na_private and drop the reference taken with
// netmap_adapter_get().
1695 hwna->na_private = NULL;
1696 netmap_adapter_put(hwna);
// The ifnet is wiped before being released.
1698 bzero(ifp, sizeof(*ifp));
1699 free(ifp, M_DEVBUF);
// netmap_bwrap_intr_notify: replacement for the nm_notify callback of
// the wrapped hardware adapter. netmap_bwrap_register() installs it and
// keeps the previous callback in bna->save_notify.
//   na      - the hardware adapter (its na_private points to the wrapper)
//   ring_nr - ring index; a value equal to na->num_rx_rings designates
//             the host ring
//   tx      - direction of the notification
//   flags   - NAF_DISABLE_NOTIFY selects the ring enable/disable path
// The original header comment follows.
1706 * Intr callback for NICs connected to a bridge.
1707 * Simply ignore tx interrupts (maybe we could try to recover space ?)
1708 * and pass received packets from nic to the bridge.
1710 * XXX TODO check locking: this is called from the interrupt
1711 * handler so we should make sure that the interface is not
1712 * disconnected while passing down an interrupt.
1714 * Note, no user process can access this NIC or the host stack.
1715 * The only part of the ring that is significant are the slots,
1716 * and head/cur/tail are set from the kring as needed
1717 * (part as a receive ring, part as a transmit ring).
1719 * callback that overwrites the hwna notify callback.
1720 * Packets come from the outside or from the host stack and are put on an hwna rx ring.
1721 * The bridge wrapper then sends the packets through the bridge.
1724 netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags)
1726 struct ifnet *ifp = na->ifp;
1727 struct netmap_bwrap_adapter *bna = na->na_private;
1728 struct netmap_vp_adapter *hostna = &bna->host;
1729 struct netmap_kring *kring, *bkring;
1730 struct netmap_ring *ring;
1731 int is_host_ring = ring_nr == na->num_rx_rings;
1732 struct netmap_vp_adapter *vpna = &bna->up;
1736 D("%s %s%d 0x%x", NM_IFPNAME(ifp),
1737 (tx == NR_TX ? "TX" : "RX"), ring_nr, flags);
// Enable/disable path: the stopped state of the hardware kring is
// mirrored onto the wrapper kring of the opposite direction (hardware
// tx pairs with wrapper rx and vice versa).
// NOTE(review): the else keyword between the two statements below is
// not visible in this excerpt.
1739 if (flags & NAF_DISABLE_NOTIFY) {
1740 kring = tx == NR_TX ? na->tx_rings : na->rx_rings;
1741 bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings;
1742 if (kring[ring_nr].nkr_stopped)
1743 netmap_disable_ring(&bkring[ring_nr]);
1745 bkring[ring_nr].nkr_stopped = 0;
// Interface missing or not in netmap mode.
// NOTE(review): the statement executed in that case is not visible.
1749 if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP))
1752 /* we only care about receive interrupts */
1756 kring = &na->rx_rings[ring_nr];
1759 /* make sure the ring is not disabled */
1760 if (nm_kr_tryget(kring))
// Host ring while the host port is not attached to any bridge: hand the
// notification to the callback saved by netmap_bwrap_register().
1763 if (is_host_ring && hostna->na_bdg == NULL) {
1764 error = bna->save_notify(na, ring_nr, tx, flags);
1768 /* Here we expect ring->head = ring->cur = ring->tail
1769 * because everything has been released from the previous round.
1770 * However the ring is shared and we might have info from
1771 * the wrong side (the tx ring). Hence we overwrite with
1772 * the info from the rx kring.
1775 D("%s head %d cur %d tail %d (kring %d %d %d)", NM_IFPNAME(ifp),
1776 ring->head, ring->cur, ring->tail,
1777 kring->rhead, kring->rcur, kring->rtail);
1779 ring->head = kring->rhead;
1780 ring->cur = kring->rcur;
1781 ring->tail = kring->rtail;
1787 /* simulate a user wakeup on the rx ring */
1788 /* fetch packets that have arrived.
1789 * XXX maybe do this in a loop ?
1791 error = kring->nm_sync(kring, 0);
1794 if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) {
1795 D("how strange, interrupt with no packets on %s",
1800 /* new packets are ring->cur to ring->tail, and the bkring
1801 * had hwcur == ring->cur. So advance ring->cur to ring->tail
1802 * to push all packets out.
1804 ring->head = ring->cur = ring->tail;
1806 /* also set tail to what the bwrap expects */
1807 bkring = &vpna->up.tx_rings[ring_nr];
1808 ring->tail = bkring->nr_hwtail; // rtail too ?
1810 /* pass packets to the switch */
1811 nm_txsync_prologue(bkring); // XXX error checking ?
1812 netmap_vp_txsync(vpna, ring_nr, flags);
1814 /* mark all buffers as released on this ring */
1815 ring->head = ring->cur = kring->nr_hwtail;
1816 ring->tail = kring->rtail;
1817 /* another call to actually release the buffers */
// Hardware rings go through their own sync routine a second time; the
// statements after it update nr_hwcur directly (presumably the else
// branch for the host ring - the keyword is not visible here).
1818 if (!is_host_ring) {
1819 error = kring->nm_sync(kring, 0);
1821 /* mark all packets as released, as in the
1822 * second part of netmap_rxsync_from_host()
1824 kring->nr_hwcur = kring->nr_hwtail;
1825 nm_rxsync_finalize(kring);
// nm_register callback of the bridge wrapper (set in
// netmap_bwrap_attach).
//   na    - the wrapper adapter
//   onoff - forwarded unchanged to hwna->nm_register and bdg_netmap_reg
// Visible work: share the buffer lookup table with the hardware (and
// host) adapter, cross-link the rings so that the rx rings of one side
// are the tx rings of the other, register both adapters, and swap the
// hardware notify callback in or out.
// NOTE(review): the tests on onoff and on error that separate these
// phases are not visible in this excerpt.
1835 netmap_bwrap_register(struct netmap_adapter *na, int onoff)
1837 struct netmap_bwrap_adapter *bna =
1838 (struct netmap_bwrap_adapter *)na;
1839 struct netmap_adapter *hwna = bna->hwna;
1840 struct netmap_vp_adapter *hostna = &bna->host;
1843 ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off");
1848 hwna->na_lut = na->na_lut;
1849 hwna->na_lut_objtotal = na->na_lut_objtotal;
1851 if (hostna->na_bdg) {
1852 hostna->up.na_lut = na->na_lut;
1853 hostna->up.na_lut_objtotal = na->na_lut_objtotal;
1856 /* cross-link the netmap rings
1857 * The original number of rings comes from hwna,
1858 * rx rings on one side equals tx rings on the other.
// Both loops run one index past the regular ring count; elsewhere in
// this file that extra index is used for the host ring.
1860 for (i = 0; i < na->num_rx_rings + 1; i++) {
1861 hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots;
1862 hwna->tx_rings[i].ring = na->rx_rings[i].ring;
1864 for (i = 0; i < na->num_tx_rings + 1; i++) {
1865 hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots;
1866 hwna->rx_rings[i].ring = na->tx_rings[i].ring;
1871 error = hwna->nm_register(hwna, onoff);
1876 bdg_netmap_reg(na, onoff);
// Intercept: remember the hardware notify callback and install
// netmap_bwrap_intr_notify in its place.
1879 bna->save_notify = hwna->nm_notify;
1880 hwna->nm_notify = netmap_bwrap_intr_notify;
// Release: put the saved callback back and detach the hardware adapter
// from the lookup table of the wrapper.
1882 hwna->nm_notify = bna->save_notify;
1883 hwna->na_lut = NULL;
1884 hwna->na_lut_objtotal = 0;
// nm_config callback of the bridge wrapper (set in netmap_bwrap_attach).
// Refreshes the configuration of the wrapped hardware adapter and
// reports it with tx and rx exchanged, matching the way
// netmap_bwrap_attach() derives the wrapper geometry (wrapper tx comes
// from hardware rx and wrapper rx comes from hardware tx).
//   txr, txd - out: tx ring count and descriptors per tx ring
//   rxr, rxd - out: rx ring count and descriptors per rx ring
1892 netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
1893 u_int *rxr, u_int *rxd)
1895 struct netmap_bwrap_adapter *bna =
1896 (struct netmap_bwrap_adapter *)na;
1897 struct netmap_adapter *hwna = bna->hwna;
1899 /* forward the request */
1900 netmap_update_config(hwna);
1901 /* swap the results */
1902 *txr = hwna->num_rx_rings;
1903 *txd = hwna->num_rx_desc;
1904 *rxr = hwna->num_tx_rings;
// The rx descriptor count of the wrapper is the tx descriptor count of
// the hardware adapter; reading num_rx_desc here would leave the swap
// incomplete and report the wrong rx ring size.
1905 *rxd = hwna->num_tx_desc;
// nm_krings_create callback of the bridge wrapper (set in
// netmap_bwrap_attach). Creates the krings of the wrapper port with
// netmap_vp_krings_create(), then those of the hardware adapter through
// its own nm_krings_create callback.
1912 netmap_bwrap_krings_create(struct netmap_adapter *na)
1914 struct netmap_bwrap_adapter *bna =
1915 (struct netmap_bwrap_adapter *)na;
1916 struct netmap_adapter *hwna = bna->hwna;
1917 struct netmap_adapter *hostna = &bna->host.up;
1920 ND("%s", NM_IFPNAME(na->ifp));
1922 error = netmap_vp_krings_create(na);
1926 error = hwna->nm_krings_create(hwna);
// Undo the first step.
// NOTE(review): presumably only when the hardware-side creation failed;
// the guard is not visible in this excerpt.
1928 netmap_vp_krings_delete(na);
// With host rings, the ring pointers of the host adapter alias the
// wrapper kring located right after the regular tx / rx krings.
1932 if (na->na_flags & NAF_HOST_RINGS) {
1933 hostna->tx_rings = na->tx_rings + na->num_tx_rings;
1934 hostna->rx_rings = na->rx_rings + na->num_rx_rings;
// nm_krings_delete callback of the bridge wrapper (set in
// netmap_bwrap_attach). Tears down in the opposite order of
// netmap_bwrap_krings_create(): first the krings of the hardware
// adapter, then those of the wrapper port.
1942 netmap_bwrap_krings_delete(struct netmap_adapter *na)
1944 struct netmap_bwrap_adapter *bna =
1945 (struct netmap_bwrap_adapter *)na;
1946 struct netmap_adapter *hwna = bna->hwna;
1948 ND("%s", NM_IFPNAME(na->ifp));
1950 hwna->nm_krings_delete(hwna);
1951 netmap_vp_krings_delete(na);
1955 /* notify method for the bridge-->hwna direction */
// Installed as nm_notify of the wrapper by netmap_bwrap_attach().
// Packets the switch placed on rx ring ring_n of the wrapper are pushed
// out through tx ring ring_n of the hardware adapter; both krings use
// the same netmap ring (see netmap_bwrap_register). The whole sequence
// runs with kring->q_lock held.
//   na     - the wrapper adapter
//   ring_n - ring index, used on both the wrapper rx and hardware tx side
//   tx     - direction of the notification (NOTE(review): how it is
//            checked is not visible in this excerpt)
//   flags  - passed to the rxsync and hardware sync calls
1957 netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
1959 struct netmap_bwrap_adapter *bna =
1960 (struct netmap_bwrap_adapter *)na;
1961 struct netmap_adapter *hwna = bna->hwna;
1962 struct netmap_kring *kring, *hw_kring;
1963 struct netmap_ring *ring;
1970 kring = &na->rx_rings[ring_n];
1971 hw_kring = &hwna->tx_rings[ring_n];
1973 lim = kring->nkr_num_slots - 1;
// Hardware interface missing or not in netmap mode.
// NOTE(review): the statement executed in that case is not visible.
1975 if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP))
1977 mtx_lock(&kring->q_lock);
1978 /* first step: simulate a user wakeup on the rx ring */
1979 netmap_vp_rxsync(na, ring_n, flags);
// The last argument of this trace reads hw_kring->rtail, the same field
// printed by the matching PST trace below; hw_ring is not a variable of
// this function and only compiled while ND() expanded to nothing.
1980 ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1981 NM_IFPNAME(na->ifp), ring_n,
1982 kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1983 ring->head, ring->cur, ring->tail,
1984 hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1985 /* second step: the simulated user consumes all new packets */
1986 ring->head = ring->cur = ring->tail;
1988 /* third step: the new packets are sent on the tx ring
1989 * (which is actually the same ring)
1991 /* set tail to what the hw expects */
1992 ring->tail = hw_kring->rtail;
1993 nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ?
1994 error = hw_kring->nm_sync(hw_kring, flags);
1996 /* fourth step: now we are back the rx ring */
1997 /* claim ownership on all hw owned bufs */
1998 ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */
1999 ring->tail = kring->rtail; /* restore saved value of tail, for safety */
2001 /* fifth step: the user goes to sleep again, causing another rxsync */
2002 netmap_vp_rxsync(na, ring_n, flags);
2003 ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2004 NM_IFPNAME(na->ifp), ring_n,
2005 kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2006 ring->head, ring->cur, ring->tail,
2007 hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2008 mtx_unlock(&kring->q_lock);
// nm_notify callback of the host-side adapter (set in
// netmap_bwrap_attach, which also stores the wrapper in na_private).
// Forwards the notification to netmap_bwrap_notify() on the wrapper
// port, addressing the ring at index num_rx_rings and always using
// NR_RX as direction.
2014 netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
2016 struct netmap_bwrap_adapter *bna = na->na_private;
2017 struct netmap_adapter *port_na = &bna->up.up;
// Only rx notifications on ring 0 are forwarded.
// NOTE(review): the statement taken for the rejected case is not
// visible in this excerpt.
2018 if (tx == NR_TX || ring_n != 0)
2020 return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags);
2024 /* attach a bridge wrapper to the 'real' device */
// Allocates a netmap_bwrap_adapter around the netmap adapter of real
// (obtained with NA(real)), fills it with the geometry of the hardware
// adapter with tx and rx exchanged, installs the wrapper callbacks and
// completes with netmap_attach_common().
//   fake - interface standing for the wrapper port (NOTE(review): only
//          its name is used in the visible lines; where it is stored is
//          not shown in this excerpt)
//   real - the device being wrapped
2026 netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
2028 struct netmap_bwrap_adapter *bna;
2029 struct netmap_adapter *na;
2030 struct netmap_adapter *hwna = NA(real);
2031 struct netmap_adapter *hostna;
// Non-sleeping allocation of zeroed memory.
// NOTE(review): the check for a NULL result is not visible here.
2035 bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
2041 /* fill the ring data for the bwrap adapter with rx/tx meanings
2042 * swapped. The real cross-linking will be done during register,
2043 * when all the krings will have been created.
2045 na->num_rx_rings = hwna->num_tx_rings;
2046 na->num_tx_rings = hwna->num_rx_rings;
2047 na->num_tx_desc = hwna->num_rx_desc;
2048 na->num_rx_desc = hwna->num_tx_desc;
2049 na->nm_dtor = netmap_bwrap_dtor;
2050 na->nm_register = netmap_bwrap_register;
2051 // na->nm_txsync = netmap_bwrap_txsync;
2052 // na->nm_rxsync = netmap_bwrap_rxsync;
2053 na->nm_config = netmap_bwrap_config;
2054 na->nm_krings_create = netmap_bwrap_krings_create;
2055 na->nm_krings_delete = netmap_bwrap_krings_delete;
2056 na->nm_notify = netmap_bwrap_notify;
2057 na->nm_mem = hwna->nm_mem;
2058 na->na_private = na; /* prevent NIOCREGIF */
2059 bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
// Take a reference on the hardware adapter; netmap_bwrap_dtor() and the
// failure path at the end of this function release it with
// netmap_adapter_put().
2062 netmap_adapter_get(hwna);
2063 hwna->na_private = bna; /* weak reference */
// When the hardware adapter has host rings, the embedded host adapter
// is set up too: one tx and one rx ring, descriptor counts exchanged in
// the same way as above, and the same memory allocator as the wrapper.
2065 if (hwna->na_flags & NAF_HOST_RINGS) {
2066 na->na_flags |= NAF_HOST_RINGS;
2067 hostna = &bna->host.up;
2068 hostna->ifp = hwna->ifp;
2069 hostna->num_tx_rings = 1;
2070 hostna->num_tx_desc = hwna->num_rx_desc;
2071 hostna->num_rx_rings = 1;
2072 hostna->num_rx_desc = hwna->num_tx_desc;
2073 // hostna->nm_txsync = netmap_bwrap_host_txsync;
2074 // hostna->nm_rxsync = netmap_bwrap_host_rxsync;
2075 hostna->nm_notify = netmap_bwrap_host_notify;
2076 hostna->nm_mem = na->nm_mem;
2077 hostna->na_private = bna;
2080 ND("%s<->%s txr %d txd %d rxr %d rxd %d",
2081 fake->if_xname, real->if_xname,
2082 na->num_tx_rings, na->num_tx_desc,
2083 na->num_rx_rings, na->num_rx_desc);
2085 error = netmap_attach_common(na);
// Failure path: give back the hardware reference and free the wrapper.
// NOTE(review): presumably guarded by a test on error; the guard is not
// visible in this excerpt.
2087 netmap_adapter_put(hwna);
2088 free(bna, M_DEVBUF);
// Prepare the nm_bridges table: zero all NM_BRIDGES entries, then run
// BDG_RWINIT on each one (presumably initialising the per-switch lock
// described in the file header - confirm against the macro definition).
2096 netmap_init_bridges(void)
2099 bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
2100 for (i = 0; i < NM_BRIDGES; i++)
2101 BDG_RWINIT(&nm_bridges[i]);
2103 #endif /* WITH_VALE */