/*
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (C) 2011-2014 Matteo Landi
 * Copyright (C) 2011-2016 Luigi Rizzo
 * Copyright (C) 2011-2016 Giuseppe Lettieri
 * Copyright (C) 2011-2016 Vincenzo Maffione
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This module supports memory mapped access to network devices,
 * see netmap(4).
 *
 * The module uses a large memory pool allocated by the kernel
 * and accessible as mmapped memory by multiple userspace threads/processes.
 * The memory pool contains packet buffers and "netmap rings",
 * i.e. user-accessible copies of the interface's queues.
 *
 * Access to the network card works like this:
 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    a select()able file descriptor on which events are reported.
 * 2. on each descriptor, the process issues an ioctl() to identify
 *    the interface that should report events to the file descriptor.
 * 3. on each descriptor, the process issues an mmap() request to
 *    map the shared memory region within the process' address space.
 *    The list of interesting queues is indicated by a location in
 *    the shared memory region.
 * 4. using the functions in the netmap(4) userspace API, a process
 *    can look up the occupation state of a queue, access memory buffers,
 *    and retrieve received packets or enqueue packets to transmit.
 * 5. using some ioctl()s the process can synchronize the userspace view
 *    of the queue with the actual status in the kernel. This includes both
 *    receiving the notification of new packets, and transmitting new
 *    packets on the output interface.
 * 6. select() or poll() can be used to wait for events on individual
 *    transmit or receive queues (or all queues for a given interface).
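 *
 * A minimal userspace sketch of steps 1-6 (hedged: it uses the legacy
 * nmreq/NIOCREGIF interface described above; error handling is omitted
 * and the interface name "em0" is just an example):
 *
 *	struct nmreq req = { .nr_version = NETMAP_API };
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name) - 1);
 *	int fd = open("/dev/netmap", O_RDWR);		// step 1
 *	ioctl(fd, NIOCREGIF, &req);			// step 2
 *	void *mem = mmap(0, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);				// step 3
 *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 *	struct netmap_ring *txr = NETMAP_TXRING(nifp, 0);  // step 4
 *	// ... fill slots between txr->head and txr->tail ...
 *	ioctl(fd, NIOCTXSYNC, NULL);			// step 5
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	poll(&pfd, 1, -1);				// step 6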

		SYNCHRONIZATION (USER)

The netmap rings and data structures may be shared among multiple
user threads or even independent processes.
Any synchronization among those threads/processes is delegated
to the threads themselves. Only one thread at a time can be in
a system call on the same netmap ring. The OS does not enforce
this and only guarantees against system crashes in case of
invalid usage.
Within the kernel, access to the netmap rings is protected as follows:

- a spinlock on each ring, to handle producer/consumer races on
  RX rings attached to the host stack (against multiple host
  threads writing from the host stack to the same ring),
  and on 'destination' rings attached to a VALE switch
  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
  protecting multiple active senders for the same destination.

- an atomic variable to guarantee that there is at most one
  instance of *_*xsync() on the ring at any time.
  For rings connected to user file
  descriptors, an atomic_test_and_set() protects this, and the
  lock on the ring is not actually used.
  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
  is also used to prevent multiple executions (the driver might indeed
  already guarantee this).
  For NIC TX rings connected to a VALE switch, the lock arbitrates
  access to the queue (both when allocating buffers and when pushing
  them out).
- *xsync() should be protected against initializations of the card.
  On FreeBSD most devices have the reset routine protected by
  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
  the RING protection on rx_reset(), this should be added.

  On linux there is an external lock on the tx path, which probably
  also arbitrates access to the reset routine. XXX to be revised

- a per-interface core_lock protecting access from the host stack
  while interfaces may be detached from netmap mode.
  XXX there should be no need for this lock if we detach the interfaces
  only while they are down.

		SYNCHRONIZATION (VALE)

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring or deleting a new port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch.)

 */

/* --- internals ----
 *
 * Roadmap to the code that implements the above.
 *
 * > 1. a process/thread issues one or more open() on /dev/netmap, to create
 * >    a select()able file descriptor on which events are reported.
 *
 *  	Internally, we allocate a netmap_priv_d structure, that will be
 *  	initialized on ioctl(NIOCREGIF). There is one netmap_priv_d
 *  	structure for each open().
 *
 * 	os-specific:
 * 	    FreeBSD: see netmap_open() (netmap_freebsd.c)
 * 	    linux:   see linux_netmap_open() (netmap_linux.c)
 *
 * > 2. on each descriptor, the process issues an ioctl() to identify
 * >    the interface that should report events to the file descriptor.
 *
 * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
 * 	Most important things happen in netmap_get_na() and
 * 	netmap_do_regif(), called from there. Additional details can be
 * 	found in the comments above those functions.
 *
 * 	In all cases, this action creates/takes-a-reference-to a
 * 	netmap_*_adapter describing the port, and allocates a netmap_if
 * 	and all necessary netmap rings, filling them with netmap buffers.
 *
 * 	In this phase, the sync callbacks for each ring are set (these are used
 * 	in steps 5 and 6 below). The callbacks depend on the type of adapter.
 * 	The adapter creation/initialization code puts them in the
 * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync). Then, they
 * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
 * 	the nm_krings_create() callback. All the nm_krings_create callbacks
 * 	actually call netmap_krings_create() to perform this and the other
 * 	common stuff. netmap_krings_create() also takes care of the host rings,
 * 	if needed, by setting their sync callbacks appropriately.
 *
 * 	Additional actions depend on the kind of netmap_adapter that has been
 * 	registered:
 *
 * 	- netmap_hw_adapter:         [netmap.c]
 * 	     This is a system netdev/ifp with native netmap support.
 * 	     The ifp is detached from the host stack by redirecting:
 * 	       - transmissions (from the network stack) to netmap_transmit()
 * 	       - receive notifications to the nm_notify() callback for
 * 	         this adapter. The callback is normally netmap_notify(), unless
 * 	         the ifp is attached to a bridge using bwrap, in which case it
 * 	         is netmap_bwrap_intr_notify().
 *
 * 	- netmap_generic_adapter:    [netmap_generic.c]
 * 	     A system netdev/ifp without native netmap support.
 *
 * 	(the decision about native/non native support is taken in
 * 	 netmap_get_hw_na(), called by netmap_get_na())
 *
 * 	- netmap_vp_adapter          [netmap_vale.c]
 * 	     Returned by netmap_get_bdg_na().
 * 	     This is a persistent or ephemeral VALE port. Ephemeral ports
 * 	     are created on the fly if they don't already exist, and are
 * 	     always attached to a bridge.
 * 	     Persistent VALE ports must be created separately, and are
 * 	     then attached like normal NICs. The NIOCREGIF we are examining
 * 	     will find them only if they had previously been created and
 * 	     attached (see VALE_CTL below).
 *
 * 	- netmap_pipe_adapter        [netmap_pipe.c]
 * 	     Returned by netmap_get_pipe_na().
 * 	     Both pipe ends are created, if they didn't already exist.
 *
 * 	- netmap_monitor_adapter     [netmap_monitor.c]
 * 	     Returned by netmap_get_monitor_na().
 * 	     If successful, the nm_sync callbacks of the monitored adapter
 * 	     will be intercepted by the returned monitor.
 *
 * 	- netmap_bwrap_adapter       [netmap_vale.c]
 * 	     Cannot be obtained in this way, see VALE_CTL below
 *
 * 	os-specific:
 * 	    linux: we first go through linux_netmap_ioctl() to
 * 	           adapt the FreeBSD interface to the linux one.
 *
 * > 3. on each descriptor, the process issues an mmap() request to
 * >    map the shared memory region within the process' address space.
 * >    The list of interesting queues is indicated by a location in
 * >    the shared memory region.
 *
 * 	os-specific:
 * 	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
 * 	    linux:   linux_netmap_mmap (netmap_linux.c).
 *
 * > 4. using the functions in the netmap(4) userspace API, a process
 * >    can look up the occupation state of a queue, access memory buffers,
 * >    and retrieve received packets or enqueue packets to transmit.
 *
 * 	these actions do not involve the kernel.
 *
 * > 5. using some ioctl()s the process can synchronize the userspace view
 * >    of the queue with the actual status in the kernel. This includes both
 * >    receiving the notification of new packets, and transmitting new
 * >    packets on the output interface.
 *
 * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
 * 	cases. They invoke the nm_sync callbacks on the netmap_kring
 * 	structures, as initialized in step 2 and maybe later modified
 * 	by a monitor. Monitors, however, will always call the original
 * 	callback before doing anything else.
 *
 * > 6. select() or poll() can be used to wait for events on individual
 * >    transmit or receive queues (or all queues for a given interface).
 *
 * 	Implemented in netmap_poll(). This will call the same nm_sync()
 * 	callbacks as in step 5 above.
 *
 * 	os-specific:
 * 	    linux: we first go through linux_netmap_poll() to adapt
 * 	           the FreeBSD interface to the linux one.
 *
 * 	 ---- VALE_CTL -----
 *
 * 	VALE switches are controlled by issuing a NIOCREGIF with a non-null
 * 	nr_cmd in the nmreq structure. These subcommands are handled by
 * 	netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
 * 	and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
 * 	subcommands, respectively.
 *
 * 	Any network interface known to the system (including a persistent VALE
 * 	port) can be attached to a VALE switch by issuing the
 * 	NETMAP_REQ_VALE_ATTACH command. After the attachment, persistent VALE ports
 * 	look exactly like ephemeral VALE ports (as created in step 2 above). The
 * 	attachment of other interfaces, instead, requires the creation of a
 * 	netmap_bwrap_adapter. Moreover, the attached interface must be put in
 * 	netmap mode. This may require the creation of a netmap_generic_adapter if
 * 	we have no native support for the interface, or if generic adapters have
 * 	been forced by sysctl.
 *
 * 	Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
 * 	called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
 * 	callback. In the case of the bwrap, the callback creates the
 * 	netmap_bwrap_adapter. The initialization of the bwrap is then
 * 	completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
 * 	callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
 * 	A generic adapter for the wrapped ifp will be created if needed, when
 * 	netmap_get_bdg_na() calls netmap_get_hw_na().
 *
 * 	 ---- DATAPATHS -----
 *
 *               -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
 *
 *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
 *
 *    - tx from netmap userspace:
 *       concurrently:
 *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
 *                kring->nm_sync() == DEVICE_netmap_txsync()
 *           2) device interrupt handler
 *                na->nm_notify()  == netmap_notify()
 *    - rx from netmap userspace:
 *       concurrently:
 *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
 *                kring->nm_sync() == DEVICE_netmap_rxsync()
 *           2) device interrupt handler
 *                na->nm_notify()  == netmap_notify()
 *    - rx from host stack
 *       concurrently:
 *           1) host stack
 *                netmap_transmit()
 *                  na->nm_notify  == netmap_notify()
 *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
 *                kring->nm_sync() == netmap_rxsync_from_host
 *                  netmap_rxsync_from_host(na, NULL, NULL)
 *    - tx to host stack
 *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
 *             kring->nm_sync() == netmap_txsync_to_host
 *               netmap_txsync_to_host(na)
 *                 nm_os_send_up()
 *                   FreeBSD: na->if_input() == ether_input()
 *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
 *
 *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
 *
 *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
 *
 *    - tx from netmap userspace:
 *       concurrently:
 *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
 *               kring->nm_sync() == generic_netmap_txsync()
 *                   nm_os_generic_xmit_frame()
 *                       linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
 *                           ifp->ndo_start_xmit == generic_ndo_start_xmit()
 *                               gna->save_start_xmit == orig. dev. start_xmit
 *                       FreeBSD: na->if_transmit() == orig. dev if_transmit
 *           2) generic_mbuf_destructor()
 *                   na->nm_notify() == netmap_notify()
 *    - rx from netmap userspace:
 *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
 *               kring->nm_sync() == generic_netmap_rxsync()
 *                   mbq_safe_dequeue()
 *           2) device driver
 *               generic_rx_handler()
 *                   mbq_safe_enqueue()
 *                   na->nm_notify() == netmap_notify()
 *    - rx from host stack
 *        FreeBSD: same as native
 *        Linux: same as native except:
 *           1) host stack
 *               dev_queue_xmit() without NM_MAGIC_PRIORITY_TX
 *                   ifp->ndo_start_xmit == generic_ndo_start_xmit()
 *                       netmap_transmit()
 *                           na->nm_notify() == netmap_notify()
 *    - tx to host stack (same as native):
 *
 *
 *                           -= VALE =-
 *
 *   INCOMING:
 *
 *      - VALE ports:
 *            ioctl(NIOCTXSYNC)/netmap_poll() in process context
 *                kring->nm_sync() == netmap_vp_txsync()
 *
 *      - system device with native support:
 *         from cable:
 *             interrupt
 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
 *                     kring->nm_sync() == DEVICE_netmap_rxsync()
 *                     netmap_vp_txsync()
 *                     kring->nm_sync() == DEVICE_netmap_rxsync()
 *         from host stack:
 *             netmap_transmit()
 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
 *                     kring->nm_sync() == netmap_rxsync_from_host()
 *                     netmap_vp_txsync()
 *
 *      - system device with generic support:
 *         from device driver:
 *            generic_rx_handler()
 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
 *                     kring->nm_sync() == generic_netmap_rxsync()
 *                     netmap_vp_txsync()
 *                     kring->nm_sync() == generic_netmap_rxsync()
 *         from host stack:
 *            netmap_transmit()
 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
 *                     kring->nm_sync() == netmap_rxsync_from_host()
 *                     netmap_vp_txsync()
 *
 *   (all cases) --> nm_bdg_flush()
 *                      dest_na->nm_notify() == (see below)
 *
 *   OUTGOING:
 *
 *      - VALE ports:
 *         concurrently:
 *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
 *                    kring->nm_sync() == netmap_vp_rxsync()
 *             2) from nm_bdg_flush()
 *                    na->nm_notify() == netmap_notify()
 *
 *      - system device with native support:
 *          to cable:
 *             na->nm_notify() == netmap_bwrap_notify()
 *                 netmap_vp_rxsync()
 *                 kring->nm_sync() == DEVICE_netmap_txsync()
 *                 netmap_vp_rxsync()
 *          to host stack:
 *                 netmap_vp_rxsync()
 *                 kring->nm_sync() == netmap_txsync_to_host
 *                 netmap_vp_rxsync_locked()
 *
 *      - system device with generic adapter:
 *          to device driver:
 *             na->nm_notify() == netmap_bwrap_notify()
 *                 netmap_vp_rxsync()
 *                 kring->nm_sync() == generic_netmap_txsync()
 *                 netmap_vp_rxsync()
 *          to host stack:
 *                 netmap_vp_rxsync()
 *                 kring->nm_sync() == netmap_txsync_to_host
 *                 netmap_vp_rxsync_locked()
 */

/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h>		/* prerequisite */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>		/* defines used in kernel.h */
#include <sys/kernel.h>		/* types used in module initialization */
#include <sys/conf.h>		/* cdevsw struct, UID, GID */
#include <sys/filio.h>		/* FIONBIO */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/socket.h>		/* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <sys/jail.h>
#include <sys/epoch.h>
#include <net/vnet.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>
#include <net/ethernet.h>	/* ETHER_BPF_MTAP */

#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#elif defined (_WIN32)

#include "win_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */
#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>

/* user-controlled variables */
int netmap_verbose;
#ifdef CONFIG_NETMAP_DEBUG
int netmap_debug;
#endif /* CONFIG_NETMAP_DEBUG */

static int netmap_no_timestamp; /* don't timestamp on rxsync */
int netmap_no_pendintr = 1;
int netmap_txsync_retry = 2;
static int netmap_fwd = 0;	/* force transparent forwarding */

/*
 * netmap_admode selects the netmap mode to use.
 * Invalid values are reset to NETMAP_ADMODE_BEST
 */
enum {	NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
	NETMAP_ADMODE_NATIVE,	/* either native or none */
	NETMAP_ADMODE_GENERIC,	/* force generic */
	NETMAP_ADMODE_LAST };
static int netmap_admode = NETMAP_ADMODE_BEST;

/* netmap_generic_mit controls mitigation of RX notifications for
 * the generic netmap adapter. The value is a time interval in
 * nanoseconds. */
int netmap_generic_mit = 100*1000;

/* We use by default netmap-aware qdiscs with generic netmap adapters,
 * even if there can be a little performance hit with hardware NICs.
 * However, using the qdisc is the safer approach, for two reasons:
 * 1) it prevents non-fifo qdiscs from breaking the TX notification
 *    scheme, which is based on mbuf destructors when txqdisc is
 *    used.
 * 2) it makes it possible to transmit over software devices that
 *    change skb->dev, like bridge, veth, ...
 *
 * Anyway users looking for the best performance should
 * use native adapters.
 */
int netmap_generic_txqdisc = 1;

/* Default number of slots and queues for generic adapters. */
int netmap_generic_ringsize = 1024;
int netmap_generic_rings = 1;

/* Non-zero to enable checksum offloading in NIC drivers */
int netmap_generic_hwcsum = 0;

/* Non-zero if ptnet devices are allowed to use virtio-net headers. */
int ptnet_vnet_hdr = 1;

/*
 * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated
 * in some other operating systems
 */
SYSBEGIN(main_init);

SYSCTL_DECL(_dev_netmap);
SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Netmap args");
SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
    CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
#ifdef CONFIG_NETMAP_DEBUG
SYSCTL_INT(_dev_netmap, OID_AUTO, debug,
    CTLFLAG_RW, &netmap_debug, 0, "Debug messages");
#endif /* CONFIG_NETMAP_DEBUG */
SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
    CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr,
    0, "Always look for new received packets.");
SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
    &netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");

SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0,
    "Force NR_FORWARD mode");
SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0,
    "Adapter mode. 0 selects the best option available, "
    "1 forces native adapter, 2 forces emulated adapter");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_hwcsum, CTLFLAG_RW, &netmap_generic_hwcsum,
    0, "Hardware checksums. 0 to disable checksum generation by the NIC (default), "
    "1 to enable checksum generation by the NIC");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit,
    0, "RX notification interval in nanoseconds");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW,
    &netmap_generic_ringsize, 0,
    "Number of per-ring slots for emulated netmap mode");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW,
    &netmap_generic_rings, 0,
    "Number of TX/RX queues for emulated netmap adapters");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW,
    &netmap_generic_txqdisc, 0, "Use qdisc for generic adapters");
SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr,
    0, "Allow ptnet devices to use virtio-net headers");

SYSEND;

NMG_LOCK_T	netmap_global_lock;

/*
 * mark the ring as stopped, and run through the locks
 * to make sure other users get to see it.
 * stopped must be either NM_KR_STOPPED (for unbounded stop)
 * or NM_KR_LOCKED (brief stop for mutual exclusion purposes)
 */
static void
netmap_disable_ring(struct netmap_kring *kr, int stopped)
{
	nm_kr_stop(kr, stopped);
	// XXX check if nm_kr_stop is sufficient
	mtx_lock(&kr->q_lock);
	mtx_unlock(&kr->q_lock);
	nm_kr_put(kr);
}

/* stop or enable a single ring */
void
netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
{
	if (stopped)
		netmap_disable_ring(NMR(na, t)[ring_id], stopped);
	else
		NMR(na, t)[ring_id]->nkr_stopped = 0;
}

/* stop or enable all the rings of na */
void
netmap_set_all_rings(struct netmap_adapter *na, int stopped)
{
	int i;
	enum txrx t;

	if (!nm_netmap_on(na))
		return;

	if (netmap_verbose) {
		nm_prinf("%s: %sable all rings", na->name,
		    (stopped ? "dis" : "en"));
	}
	for_rx_tx(t) {
		for (i = 0; i < netmap_real_rings(na, t); i++) {
			netmap_set_ring(na, i, t, stopped);
		}
	}
}

/*
 * Convenience function used in drivers. Waits for current txsync()s/rxsync()s
 * to finish and prevents any new one from starting. Call this before turning
 * netmap mode off, or before removing the hardware rings (e.g., on module
 * unload).
 */
void
netmap_disable_all_rings(struct ifnet *ifp)
{
	if (NM_NA_VALID(ifp)) {
		netmap_set_all_rings(NA(ifp), NM_KR_LOCKED);
	}
}

/*
 * Convenience function used in drivers. Re-enables rxsync and txsync on the
 * adapter's rings. In linux drivers, this should be placed near each
 * napi_enable().
 */
void
netmap_enable_all_rings(struct ifnet *ifp)
{
	if (NM_NA_VALID(ifp)) {
		netmap_set_all_rings(NA(ifp), 0 /* enabled */);
	}
}

void
netmap_make_zombie(struct ifnet *ifp)
{
	if (NM_NA_VALID(ifp)) {
		struct netmap_adapter *na = NA(ifp);
		netmap_set_all_rings(na, NM_KR_LOCKED);
		na->na_flags |= NAF_ZOMBIE;
		netmap_set_all_rings(na, 0);
	}
}

void
netmap_undo_zombie(struct ifnet *ifp)
{
	if (NM_NA_VALID(ifp)) {
		struct netmap_adapter *na = NA(ifp);
		if (na->na_flags & NAF_ZOMBIE) {
			netmap_set_all_rings(na, NM_KR_LOCKED);
			na->na_flags &= ~NAF_ZOMBIE;
			netmap_set_all_rings(na, 0);
		}
	}
}

/*
 * generic bound_checking function
 */
u_int
nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
{
	u_int oldv = *v;
	const char *op = NULL;

	if (dflt < lo)
		dflt = lo;
	if (dflt > hi)
		dflt = hi;
	if (oldv < lo) {
		*v = dflt;
		op = "Bump";
	} else if (oldv > hi) {
		*v = dflt;
		op = "Clamp";
	}
	if (op && msg)
		nm_prinf("%s %s to %d (was %d)", op, msg, *v, oldv);
	return *v;
}
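
/*
 * Usage sketch (hypothetical values, for illustration only): clamp a
 * user-supplied ring size into a supported range, falling back to the
 * default when the current value is out of bounds:
 *
 *	u_int ringsize = 17;
 *	nm_bound_var(&ringsize, 1024, 64, 16384, "ringsize");
 *	// ringsize is now 1024, and a "Bump" message has been logged
 */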

/*
 * packet-dump function, user-supplied or static buffer.
 * The destination buffer must be at least 30+4*len
 */
const char *
nm_dump_buf(char *p, int len, int lim, char *dst)
{
	static char _dst[8192];
	int i, j, i0;
	static char hex[] ="0123456789abcdef";
	char *o;	/* output position */

#define P_HI(x)	hex[((x) & 0xf0)>>4]
#define P_LO(x)	hex[((x) & 0xf)]
#define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
	if (!dst)
		dst = _dst;
	if (lim <= 0 || lim > len)
		lim = len;
	o = dst;
	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
	o += strlen(o);
	/* hexdump routine */
	for (i = 0; i < lim; ) {
		sprintf(o, "%5d: ", i);
		o += strlen(o);
		memset(o, ' ', 48);
		i0 = i;
		for (j=0; j < 16 && i < lim; i++, j++) {
			o[j*3] = P_HI(p[i]);
			o[j*3+1] = P_LO(p[i]);
		}
		i = i0;
		for (j=0; j < 16 && i < lim; i++, j++)
			o[j + 48] = P_C(p[i]);
		o[j+48] = '\n';
		o += j+49;
	}
	*o = '\0';
#undef P_HI
#undef P_LO
#undef P_C
	return dst;
}

/*
 * Fetch configuration from the device, to cope with dynamic
 * reconfigurations after loading the module.
 */
/* call with NMG_LOCK held */
int
netmap_update_config(struct netmap_adapter *na)
{
	struct nm_config_info info;

	bzero(&info, sizeof(info));
	if (na->nm_config == NULL ||
	    na->nm_config(na, &info)) {
		/* take whatever we had at init time */
		info.num_tx_rings = na->num_tx_rings;
		info.num_tx_descs = na->num_tx_desc;
		info.num_rx_rings = na->num_rx_rings;
		info.num_rx_descs = na->num_rx_desc;
		info.rx_buf_maxsize = na->rx_buf_maxsize;
	}

	if (na->num_tx_rings == info.num_tx_rings &&
	    na->num_tx_desc == info.num_tx_descs &&
	    na->num_rx_rings == info.num_rx_rings &&
	    na->num_rx_desc == info.num_rx_descs &&
	    na->rx_buf_maxsize == info.rx_buf_maxsize)
		return 0; /* nothing changed */
	if (na->active_fds == 0) {
		na->num_tx_rings = info.num_tx_rings;
		na->num_tx_desc = info.num_tx_descs;
		na->num_rx_rings = info.num_rx_rings;
		na->num_rx_desc = info.num_rx_descs;
		na->rx_buf_maxsize = info.rx_buf_maxsize;
		if (netmap_verbose)
			nm_prinf("configuration changed for %s: txring %d x %d, "
				"rxring %d x %d, rxbufsz %d",
				na->name, na->num_tx_rings, na->num_tx_desc,
				na->num_rx_rings, na->num_rx_desc, na->rx_buf_maxsize);
		return 0;
	}
	nm_prerr("WARNING: configuration changed for %s while active: "
		"txring %d x %d, rxring %d x %d, rxbufsz %d",
		na->name, info.num_tx_rings, info.num_tx_descs,
		info.num_rx_rings, info.num_rx_descs,
		info.rx_buf_maxsize);
	return 1;
}

/* nm_sync callbacks for the host rings */
static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);

static int
netmap_default_bufcfg(struct netmap_kring *kring, uint64_t target)
{
	kring->hwbuf_len = target;
	kring->buf_align = 0; /* no alignment */
	return 0;
}

/* create the krings array and initialize the fields common to all adapters.
 * The array layout is this:
 *
 *                    +----------+
 * na->tx_rings ----->|          | \
 *                    |          |  } na->num_tx_rings
 *                    |          | /
 *                    +----------+
 *                    |          |    host tx kring
 * na->rx_rings ----> +----------+
 *                    |          | \
 *                    |          |  } na->num_rx_rings
 *                    |          | /
 *                    +----------+
 *                    |          |    host rx kring
 *                    +----------+
 * na->tailroom ----->|          | \
 *                    |          |  } tailroom bytes
 *                    |          | /
 *                    +----------+
 *
 * Note: for compatibility, host krings are created even when not needed.
 * The tailroom space is currently used by vale ports for allocating leases.
 */
/* call with NMG_LOCK held */
int
netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
{
	u_int i, len, ndesc;
	struct netmap_kring *kring;
	u_int n[NR_TXRX];
	enum txrx t;
	int err = 0;

	if (na->tx_rings != NULL) {
		if (netmap_debug & NM_DEBUG_ON)
			nm_prerr("warning: krings were already created");
		return 0;
	}

	/* account for the (possibly fake) host rings */
	n[NR_TX] = netmap_all_rings(na, NR_TX);
	n[NR_RX] = netmap_all_rings(na, NR_RX);

	len = (n[NR_TX] + n[NR_RX]) *
		(sizeof(struct netmap_kring) + sizeof(struct netmap_kring *))
		+ tailroom;

	na->tx_rings = nm_os_malloc((size_t)len);
	if (na->tx_rings == NULL) {
		nm_prerr("Cannot allocate krings");
		return ENOMEM;
	}
	na->rx_rings = na->tx_rings + n[NR_TX];
	na->tailroom = na->rx_rings + n[NR_RX];

	/* link the krings in the krings array */
	kring = (struct netmap_kring *)((char *)na->tailroom + tailroom);
	for (i = 0; i < n[NR_TX] + n[NR_RX]; i++) {
		na->tx_rings[i] = kring;
		kring++;
	}

	/*
	 * All fields in krings are 0 except the ones initialized below;
	 * but better be explicit on important kring fields.
	 */
	for_rx_tx(t) {
		ndesc = nma_get_ndesc(na, t);
		for (i = 0; i < n[t]; i++) {
			kring = NMR(na, t)[i];
			bzero(kring, sizeof(*kring));
			kring->notify_na = na;
			kring->ring_id = i;
			kring->tx = t;
			kring->nkr_num_slots = ndesc;
			kring->nr_mode = NKR_NETMAP_OFF;
			kring->nr_pending_mode = NKR_NETMAP_OFF;
			if (i < nma_get_nrings(na, t)) {
				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
				kring->nm_bufcfg = na->nm_bufcfg;
				if (kring->nm_bufcfg == NULL)
					kring->nm_bufcfg = netmap_default_bufcfg;
			} else {
				if (!(na->na_flags & NAF_HOST_RINGS))
					kring->nr_kflags |= NKR_FAKERING;
				kring->nm_sync = (t == NR_TX ?
						netmap_txsync_to_host:
						netmap_rxsync_from_host);
				kring->nm_bufcfg = netmap_default_bufcfg;
			}
			kring->nm_notify = na->nm_notify;
			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
			/*
			 * IMPORTANT: Always keep one slot empty.
			 */
			kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
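			/*
			 * Note (added for clarity): on a TX kring this makes
			 * the whole ring, minus the reserved empty slot,
			 * available for transmission (hwtail = ndesc - 1,
			 * i.e. head - 1 modulo ndesc); on an RX kring,
			 * hwtail = 0 = hwcur means no packets received yet.
			 */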
			snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
					nm_txrx2str(t), i);
			nm_prdis("ktx %s h %d c %d t %d",
				kring->name, kring->rhead, kring->rcur, kring->rtail);
			err = nm_os_selinfo_init(&kring->si, kring->name);
			if (err) {
				netmap_krings_delete(na);
				return err;
			}
			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
			kring->na = na;	/* setting this field marks the mutex as initialized */
		}
		err = nm_os_selinfo_init(&na->si[t], na->name);
		if (err) {
			netmap_krings_delete(na);
			return err;
		}
	}

	return 0;
}

/* undo the actions performed by netmap_krings_create */
/* call with NMG_LOCK held */
void
netmap_krings_delete(struct netmap_adapter *na)
{
	struct netmap_kring **kring = na->tx_rings;
	enum txrx t;

	if (na->tx_rings == NULL) {
		if (netmap_debug & NM_DEBUG_ON)
			nm_prerr("warning: krings were already deleted");
		return;
	}

	for_rx_tx(t)
		nm_os_selinfo_uninit(&na->si[t]);

	/* we rely on the krings layout described above */
	for ( ; kring != na->tailroom; kring++) {
		if ((*kring)->na != NULL)
			mtx_destroy(&(*kring)->q_lock);
		nm_os_selinfo_uninit(&(*kring)->si);
	}
	nm_os_free(na->tx_rings);
	na->tx_rings = na->rx_rings = na->tailroom = NULL;
}

/*
 * Destructor for NIC ports. They also have an mbuf queue
 * on the rings connected to the host so we need to purge
 * them first.
 */
/* call with NMG_LOCK held */
void
netmap_hw_krings_delete(struct netmap_adapter *na)
{
	u_int lim = netmap_real_rings(na, NR_RX), i;

	for (i = nma_get_nrings(na, NR_RX); i < lim; i++) {
		struct mbq *q = &NMR(na, NR_RX)[i]->rx_queue;
		nm_prdis("destroy sw mbq with len %d", mbq_len(q));
		mbq_purge(q);
		mbq_safe_fini(q);
	}
	netmap_krings_delete(na);
}
static void
netmap_mem_restore(struct netmap_adapter *na)
{
	if (na->nm_mem_prev) {
		netmap_mem_put(na->nm_mem);
		na->nm_mem = na->nm_mem_prev;
		na->nm_mem_prev = NULL;
	}
}

static void
netmap_mem_drop(struct netmap_adapter *na)
{
	netmap_mem_deref(na->nm_mem, na);

	if (na->active_fds <= 0) {
		/* if the native allocator had been overridden on regif,
		 * restore it now and drop the temporary one
		 */
		netmap_mem_restore(na);
	}
}

static void
netmap_update_hostrings_mode(struct netmap_adapter *na)
{
	enum txrx t;
	struct netmap_kring *kring;
	int i;

	for_rx_tx(t) {
		for (i = nma_get_nrings(na, t);
		     i < netmap_real_rings(na, t); i++) {
			kring = NMR(na, t)[i];
			kring->nr_mode = kring->nr_pending_mode;
		}
	}
}

/*
 * Undo everything that was done in netmap_do_regif(). In particular,
 * call nm_register(ifp,0) to stop netmap mode on the interface and
 * revert to normal operation.
 */
/* call with NMG_LOCK held */
static void netmap_unset_ringid(struct netmap_priv_d *);
static void netmap_krings_put(struct netmap_priv_d *);
void
netmap_do_unregif(struct netmap_priv_d *priv)
{
	struct netmap_adapter *na = priv->np_na;

	NMG_LOCK_ASSERT();
	na->active_fds--;
	/* unset nr_pending_mode and possibly release exclusive mode */
	netmap_krings_put(priv);

	/* XXX check whether we have to do something with monitor
	 * when rings change nr_mode. */
	if (na->active_fds <= 0) {
		/* walk through all the rings and tell any monitor
		 * that the port is going to exit netmap mode
		 */
		netmap_monitor_stop(na);
	}

	if (na->active_fds <= 0 || nm_kring_pending(priv)) {
		netmap_set_all_rings(na, NM_KR_LOCKED);
		na->nm_register(na, 0);
		netmap_set_all_rings(na, 0);
	}

	/* delete rings and buffers that are no longer needed */
	netmap_mem_rings_delete(na);

	if (na->active_fds <= 0) {	/* last instance */
		/*
		 * (TO CHECK) We enter here
		 * when the last reference to this file descriptor goes
		 * away. This means we cannot have any pending poll()
		 * or interrupt routine operating on the structure.
		 * XXX The file may be closed in a thread while
		 * another thread is using it.
		 * Linux keeps the file opened until the last reference
		 * by any outstanding ioctl/poll or mmap is gone.
		 * FreeBSD does not track mmap()s (but we do) and
		 * wakes up any sleeping poll(). Need to check what
		 * happens if the close() occurs while a concurrent
		 * syscall is running.
		 */
		if (netmap_debug & NM_DEBUG_ON)
			nm_prinf("deleting last instance for %s", na->name);

		if (nm_netmap_on(na)) {
			nm_prerr("BUG: netmap on while going to delete the krings");
		}

		na->nm_krings_delete(na);

		/* restore the default number of host tx and rx rings */
		if (na->na_flags & NAF_HOST_RINGS) {
			na->num_host_tx_rings = 1;
			na->num_host_rx_rings = 1;
		} else {
			na->num_host_tx_rings = 0;
			na->num_host_rx_rings = 0;
		}
	}

	/* possibly decrement counter of tx_si/rx_si users */
	netmap_unset_ringid(priv);
	/* delete the nifp */
	netmap_mem_if_delete(na, priv->np_nifp);
	/* drop the allocator */
	netmap_mem_drop(na);
	/* mark the priv as unregistered */
	priv->np_na = NULL;
	priv->np_nifp = NULL;
}

struct netmap_priv_d*
netmap_priv_new(void)
{
	struct netmap_priv_d *priv;

	priv = nm_os_malloc(sizeof(struct netmap_priv_d));
	if (priv == NULL)
		return NULL;
	priv->np_refs = 1;
	nm_os_get_module();
	return priv;
}

/*
 * Destructor of the netmap_priv_d, called when the fd is closed.
 * Action: undo all the things done by NIOCREGIF.
 * On FreeBSD we need to track whether there are active mmap()s,
 * and we use np_active_mmaps for that. On linux, the field is always 0.
 */
/* call with NMG_LOCK held */
void
netmap_priv_delete(struct netmap_priv_d *priv)
{
	struct netmap_adapter *na = priv->np_na;

	/* number of active references to this fd */
	if (--priv->np_refs > 0) {
		return;
	}
	nm_os_put_module();
	if (na) {
		netmap_do_unregif(priv);
	}
	netmap_unget_na(na, priv->np_ifp);
	bzero(priv, sizeof(*priv));	/* for safety */
	nm_os_free(priv);
}

/* call with NMG_LOCK *not* held */
void
netmap_dtor(void *data)
{
	struct netmap_priv_d *priv = data;

	NMG_LOCK();
	netmap_priv_delete(priv);
	NMG_UNLOCK();
}

/*
 * Handlers for synchronization of the rings from/to the host stack.
 * These are associated to a network interface and are just another
 * ring pair managed by userspace.
 *
 * Netmap also supports transparent forwarding (NS_FORWARD and NR_FORWARD
 * flags):
 *
 * - Before releasing buffers on hw RX rings, the application can mark
 *   them with the NS_FORWARD flag. During the next RXSYNC or poll(), they
 *   will be forwarded to the host stack, similarly to what would happen if
 *   the application moved them to the host TX ring.
 *
 * - Before releasing buffers on the host RX ring, the application can
 *   mark them with the NS_FORWARD flag. During the next RXSYNC or poll(),
 *   they will be forwarded to the hw TX rings, saving the application
 *   from doing the same task in user-space.
 *
 * Transparent forwarding can be enabled per-ring, by setting the NR_FORWARD
 * flag, or globally with the netmap_fwd sysctl.
 *
 * The transfer NIC --> host is relatively easy, just encapsulate
 * into mbufs and we are done. The host --> NIC side is slightly
 * harder because there might not be room in the tx ring so it
 * might take a while before releasing the buffer.
 */
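
/*
 * A hedged userspace sketch of transparent forwarding in the RX -> host
 * direction (assumes the fd/nifp setup shown at the top of this file;
 * 'i' is the index of a received slot on hw RX ring 0):
 *
 *	struct netmap_ring *rxr = NETMAP_RXRING(nifp, 0);
 *	rxr->flags |= NR_FORWARD;		// per-ring opt-in
 *	rxr->slot[i].flags |= NS_FORWARD;	// pass this buffer to the host
 *	rxr->head = rxr->cur = nm_ring_next(rxr, i);
 *	ioctl(fd, NIOCRXSYNC, NULL);		// forwarding happens here
 */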

/*
 * Pass a whole queue of mbufs to the host stack as coming from 'dst'
 * We do not need to lock because the queue is private.
 * After this call the queue is empty.
 */
static void
netmap_send_up(struct ifnet *dst, struct mbq *q)
{
	struct mbuf *m;
	struct mbuf *head = NULL, *prev = NULL;
#ifdef __FreeBSD__
	struct epoch_tracker et;

	NET_EPOCH_ENTER(et);
#endif /* __FreeBSD__ */
	/* Send packets up, outside the lock; head/prev machinery
	 * is only useful for Windows. */
	while ((m = mbq_dequeue(q)) != NULL) {
		if (netmap_debug & NM_DEBUG_HOST)
			nm_prinf("sending up pkt %p size %d", m, MBUF_LEN(m));
		prev = nm_os_send_up(dst, m, prev);
		if (head == NULL)
			head = prev;
	}
	if (head)
		nm_os_send_up(dst, NULL, head);
#ifdef __FreeBSD__
	NET_EPOCH_EXIT(et);
#endif /* __FreeBSD__ */
	mbq_fini(q);
}

/*
 * Scan the buffers from hwcur to ring->head, and put a copy of those
 * marked NS_FORWARD (or all of them if forced) into a queue of mbufs.
 * Drop remaining packets in the unlikely event
 * of an mbuf shortage.
 */
static void
netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
{
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;
	u_int n;
	struct netmap_adapter *na = kring->na;

	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
		struct mbuf *m;
		struct netmap_slot *slot = &kring->ring->slot[n];

		if ((slot->flags & NS_FORWARD) == 0 && !force)
			continue;
		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
			nm_prlim(5, "bad pkt at %d len %d", n, slot->len);
			continue;
		}
		slot->flags &= ~NS_FORWARD; // XXX needed ?
		/* XXX TODO: adapt to the case of a multisegment packet */
		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
		if (m == NULL)
			break;
		mbq_enqueue(q, m);
	}
}

static inline int
_nm_may_forward(struct netmap_kring *kring)
{
	return	((netmap_fwd || kring->ring->flags & NR_FORWARD) &&
		 kring->na->na_flags & NAF_HOST_RINGS &&
		 kring->tx == NR_RX);
}

static inline int
nm_may_forward_up(struct netmap_kring *kring)
{
	return	_nm_may_forward(kring) &&
		 kring->ring_id != kring->na->num_rx_rings;
}

static inline int
nm_may_forward_down(struct netmap_kring *kring, int sync_flags)
{
	return	_nm_may_forward(kring) &&
		 (sync_flags & NAF_CAN_FORWARD_DOWN) &&
		 kring->ring_id == kring->na->num_rx_rings;
}

/*
 * Send to the NIC rings packets marked NS_FORWARD between
 * kring->nr_hwcur and kring->rhead.
 * Called under kring->rx_queue.lock on the sw rx ring.
 *
 * It can only be called if the user opened all the TX hw rings,
 * see NAF_CAN_FORWARD_DOWN flag.
 * We can touch the TX netmap rings (slots, head and cur) since
 * we are in poll/ioctl system call context, and the application
 * is not supposed to touch the ring (using a different thread)
 * during the execution of the system call.
 */
static u_int
netmap_sw_to_nic(struct netmap_adapter *na)
{
	struct netmap_kring *kring = na->rx_rings[na->num_rx_rings];
	struct netmap_slot *rxslot = kring->ring->slot;
	u_int i, rxcur = kring->nr_hwcur;
	u_int const head = kring->rhead;
	u_int const src_lim = kring->nkr_num_slots - 1;
	u_int sent = 0;

	/* scan rings to find space, then fill as much as possible */
	for (i = 0; i < na->num_tx_rings; i++) {
		struct netmap_kring *kdst = na->tx_rings[i];
		struct netmap_ring *rdst = kdst->ring;
		u_int const dst_lim = kdst->nkr_num_slots - 1;

		/* XXX do we trust ring or kring->rcur,rtail ? */
		for (; rxcur != head && !nm_ring_empty(rdst);
		     rxcur = nm_next(rxcur, src_lim) ) {
			struct netmap_slot *src, *dst, tmp;
			u_int dst_head = rdst->head;

			src = &rxslot[rxcur];
			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
				continue;

			sent++;

			dst = &rdst->slot[dst_head];

			tmp = *src;

			src->buf_idx = dst->buf_idx;
			src->flags = NS_BUF_CHANGED;

			dst->buf_idx = tmp.buf_idx;
			dst->len = tmp.len;
			dst->flags = NS_BUF_CHANGED;

			rdst->head = rdst->cur = nm_next(dst_head, dst_lim);
		}
		/* if (sent) XXX txsync ? it would be just an optimization */
	}
	return sent;
}

/*
 * netmap_txsync_to_host() passes packets up. We are called from a
 * system call in user process context, and the only contention
 * can be among multiple user threads erroneously calling
 * this routine concurrently.
 */
static int
netmap_txsync_to_host(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;
	struct mbq q;

	/* Take packets from hwcur to head and pass them up.
	 * Force hwcur = head since netmap_grab_packets() stops at head
	 */
	mbq_init(&q);
	netmap_grab_packets(kring, &q, 1 /* force */);
	nm_prdis("have %d pkts in queue", mbq_len(&q));
	kring->nr_hwcur = head;
	kring->nr_hwtail = head + lim;
	if (kring->nr_hwtail > lim)
		kring->nr_hwtail -= lim + 1;
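	/* note (added for clarity): this is hwtail = (head + lim) % ndesc,
	 * i.e. head - 1 modulo ndesc, so the whole ring (minus the reserved
	 * slot) is again available to the application. */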

	netmap_send_up(na->ifp, &q);
	return 0;
}

/*
 * rxsync backend for packets coming from the host stack.
 * They have been put in kring->rx_queue by netmap_transmit().
 * We protect access to the kring using kring->rx_queue.lock
 *
 * also moves to the nic hw rings any packet the user has marked
 * for transparent-mode forwarding, then sets the NR_FORWARD
 * flag in the kring to let the caller push them out
 */
static int
netmap_rxsync_from_host(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_ring *ring = kring->ring;
	u_int nm_i, n;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;
	int ret = 0;
	struct mbq *q = &kring->rx_queue, fq;

	mbq_init(&fq); /* fq holds packets to be freed */

	mbq_lock(q);

	/* First part: import newly received packets */
	n = mbq_len(q);
	if (n) { /* grab packets from the queue */
		struct mbuf *m;
		uint32_t stop_i;

		nm_i = kring->nr_hwtail;
		stop_i = nm_prev(kring->nr_hwcur, lim);
		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
			int len = MBUF_LEN(m);
			struct netmap_slot *slot = &ring->slot[nm_i];

			m_copydata(m, 0, len, NMB(na, slot));
			nm_prdis("nm %d len %d", nm_i, len);
			if (netmap_debug & NM_DEBUG_HOST)
				nm_prinf("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));

			slot->len = len;
			slot->flags = 0;
			nm_i = nm_next(nm_i, lim);
			mbq_enqueue(&fq, m);
		}
		kring->nr_hwtail = nm_i;
	}

	/*
	 * Second part: skip past packets that userspace has released.
	 */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) { /* something was released */
		if (nm_may_forward_down(kring, flags)) {
			ret = netmap_sw_to_nic(na);
			if (ret > 0) {
				kring->nr_kflags |= NR_FORWARD;
				ret = 0;
			}
		}
		kring->nr_hwcur = head;
	}

	mbq_unlock(q);

	mbq_purge(&fq);
	mbq_fini(&fq);

	return ret;
}

/* Get a netmap adapter for the port.
 *
 * If it is possible to satisfy the request, return 0
 * with *na containing the netmap adapter found.
 * Otherwise return an error code, with *na containing NULL.
 *
 * When the port is attached to a bridge, we always return
 * EBUSY.
 * Otherwise, if the port is already bound to a file descriptor,
 * then we unconditionally return the existing adapter into *na.
 * In all the other cases, we return (into *na) either native,
 * generic or NULL, according to the following table:
 *
 *					Native netmap support
 *	active_fds   dev.netmap.admode		YES	NO
 *	-------------------------------------------------------
 *	>0	     *				NA(ifp)	NA(ifp)
 *
 *	0	     NETMAP_ADMODE_BEST		NATIVE	GENERIC
 *	0	     NETMAP_ADMODE_NATIVE	NATIVE	NULL
 *	0	     NETMAP_ADMODE_GENERIC	GENERIC	GENERIC
 */
static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */
int
netmap_get_hw_na(struct ifnet *ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na)
{
	/* generic support */
	int i = netmap_admode;	/* Take a snapshot. */
	struct netmap_adapter *prev_na;
	int error = 0;

	*na = NULL; /* default */

	/* reset in case of invalid value */
	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
		i = netmap_admode = NETMAP_ADMODE_BEST;

	if (NM_NA_VALID(ifp)) {
		prev_na = NA(ifp);
		/* If an adapter already exists, return it if
		 * there are active file descriptors or if
		 * netmap is not forced to use generic
		 * adapters.
		 */
		if (NETMAP_OWNED_BY_ANY(prev_na)
			|| i != NETMAP_ADMODE_GENERIC
			|| prev_na->na_flags & NAF_FORCE_NATIVE
			/* ugly, but we cannot allow an adapter switch
			 * if some pipe is referring to this one
			 */
			|| prev_na->na_next_pipe > 0
		) {
			*na = prev_na;
			goto assign_mem;
		}
	}

	/* If there isn't native support and netmap is not allowed
	 * to use generic adapters, we cannot satisfy the request.
	 */
	if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE)
		return EOPNOTSUPP;

	/* Otherwise, create a generic adapter and return it,
	 * saving the previously used netmap adapter, if any.
	 *
	 * Note that here 'prev_na', if not NULL, MUST be a
	 * native adapter, and CANNOT be a generic one. This is
	 * true because generic adapters are created on demand, and
	 * destroyed when not used anymore. Therefore, if the adapter
	 * currently attached to an interface 'ifp' is generic, it
	 * must be that
	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
	 * Consequently, if NA(ifp) is generic, we will enter one of
	 * the branches above. This ensures that we never override
	 * a generic adapter with another generic adapter.
	 */
	error = generic_netmap_attach(ifp);
	if (error)
		return error;

	*na = NA(ifp);

assign_mem:
	if (nmd != NULL && !((*na)->na_flags & NAF_MEM_OWNER) &&
	    (*na)->active_fds == 0 && ((*na)->nm_mem != nmd)) {
		(*na)->nm_mem_prev = (*na)->nm_mem;
		(*na)->nm_mem = netmap_mem_get(nmd);
	}

	return 0;
}

/*
 * MUST BE CALLED UNDER NMG_LOCK()
 *
 * Get a refcounted reference to a netmap adapter attached
 * to the interface specified by req.
 * This is always called in the execution of an ioctl().
 *
 * Return ENXIO if the interface specified by the request does
 * not exist, ENOTSUP if netmap is not supported by the interface,
 * EBUSY if the interface is already attached to a bridge,
 * EINVAL if parameters are invalid, ENOMEM if needed resources
 * could not be allocated.
 * If successful, hold a reference to the netmap adapter.
 *
 * If the interface specified by req is a system one, also keep
 * a reference to it and return a valid *ifp.
 */
int
netmap_get_na(struct nmreq_header *hdr,
	      struct netmap_adapter **na, struct ifnet **ifp,
	      struct netmap_mem_d *nmd, int create)
{
	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
	int error = 0;
	struct netmap_adapter *ret = NULL;
	int nmd_ref = 0;

	*na = NULL;     /* default return value */
	*ifp = NULL;

	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
		return EINVAL;
	}

	if (req->nr_mode == NR_REG_PIPE_MASTER ||
	    req->nr_mode == NR_REG_PIPE_SLAVE) {
		/* Do not accept deprecated pipe modes. */
		nm_prerr("Deprecated pipe nr_mode, use xx{yy or xx}yy syntax");
		return EINVAL;
	}

	NMG_LOCK_ASSERT();

	/* if the request contain a memid, try to find the
	 * corresponding memory region
	 */
	if (nmd == NULL && req->nr_mem_id) {
		nmd = netmap_mem_find(req->nr_mem_id);
		if (nmd == NULL)
			return EINVAL;
		/* keep the reference */
		nmd_ref = 1;
	}

	/* We cascade through all possible types of netmap adapter.
	 * All netmap_get_*_na() functions return an error and an na,
	 * with the following combinations:
	 *
	 *	error	na
	 *	0	NULL	type doesn't match
	 *	!0	NULL	type matches, but na creation/lookup failed
	 *	0	!NULL	type matches and na created/found
	 *	!0	!NULL	impossible
	 */
	error = netmap_get_null_na(hdr, na, nmd, create);
	if (error || *na != NULL)
		goto out;

	/* try to see if this is a monitor port */
	error = netmap_get_monitor_na(hdr, na, nmd, create);
	if (error || *na != NULL)
		goto out;

	/* try to see if this is a pipe port */
	error = netmap_get_pipe_na(hdr, na, nmd, create);
	if (error || *na != NULL)
		goto out;

	/* try to see if this is a vale port */
	error = netmap_get_vale_na(hdr, na, nmd, create);
	if (error)
		goto out;

	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
		goto out;

	/*
	 * This must be a hardware na, lookup the name in the system.
	 * Note that by hardware we actually mean "it shows up in ifconfig".
	 * This may still be a tap, a veth/epair, or even a
	 * persistent VALE port.
	 */
	*ifp = ifunit_ref(hdr->nr_name);
	if (*ifp == NULL) {
		error = ENXIO;
		goto out;
	}

	error = netmap_get_hw_na(*ifp, nmd, &ret);
	if (error)
		goto out;

	*na = ret;
	netmap_adapter_get(ret);

	/*
	 * if the adapter supports the host rings and it is not already open,
	 * try to set the number of host rings as requested by the user
	 */
	if (((*na)->na_flags & NAF_HOST_RINGS) && (*na)->active_fds == 0) {
		if (req->nr_host_tx_rings)
			(*na)->num_host_tx_rings = req->nr_host_tx_rings;
		if (req->nr_host_rx_rings)
			(*na)->num_host_rx_rings = req->nr_host_rx_rings;
	}
	nm_prdis("%s: host tx %d rx %u", (*na)->name, (*na)->num_host_tx_rings,
	    (*na)->num_host_rx_rings);

	return 0;

out:
	if (error) {
		if (ret)
			netmap_adapter_put(ret);
		if (*ifp) {
			if_rele(*ifp);
			*ifp = NULL;
		}
	}
	if (nmd_ref)
		netmap_mem_put(nmd);

	return error;
}

/* undo netmap_get_na() */
void
netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp)
{
	if (ifp != NULL)
		if_rele(ifp);
	if (na != NULL)
		netmap_adapter_put(na);
}

#define NM_FAIL_ON(t) do {						\
	if (unlikely(t)) {						\
		nm_prlim(5, "%s: fail '" #t "' "			\
			"h %d c %d t %d "				\
			"rh %d rc %d rt %d "				\
			"hc %d ht %d",					\
			kring->name,					\
			head, cur, ring->tail,				\
			kring->rhead, kring->rcur, kring->rtail,	\
			kring->nr_hwcur, kring->nr_hwtail);		\
		return kring->nkr_num_slots;				\
	}								\
} while (0)

/*
 * validate parameters on entry for *_txsync()
 * Returns ring->cur if ok, or something >= kring->nkr_num_slots
 * in case of error.
 *
 * rhead, rcur and rtail=hwtail are stored from previous round.
 * hwcur is the next packet to send to the ring.
 *
 * We want
 *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
 *
 * hwcur, rhead, rtail and hwtail are reliable
 */
u_int
nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
{
	u_int head = ring->head; /* read only once */
	u_int cur = ring->cur; /* read only once */
	u_int n = kring->nkr_num_slots;

	nm_prdis(5, "%s kcur %d ktail %d head %d cur %d tail %d",
		kring->name,
		kring->nr_hwcur, kring->nr_hwtail,
		ring->head, ring->cur, ring->tail);
#if 1 /* kernel sanity checks; but we can trust the kring. */
	NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n ||
	    kring->rtail >= n || kring->nr_hwtail >= n);
#endif /* kernel sanity checks */
	/*
	 * user sanity checks. We only use head,
	 * A, B, ... are possible positions for head:
	 *
	 *  0    A  rhead   B  rtail   C  n-1
	 *  0    D  rtail   E  rhead   F  n-1
	 *
	 * B, F, D are valid. A, C, E are wrong
	 */
	if (kring->rtail >= kring->rhead) {
		/* want rhead <= head <= rtail */
		NM_FAIL_ON(head < kring->rhead || head > kring->rtail);
		/* and also head <= cur <= rtail */
		NM_FAIL_ON(cur < head || cur > kring->rtail);
	} else { /* here rtail < rhead */
		/* we need head outside rtail .. rhead */
		NM_FAIL_ON(head > kring->rtail && head < kring->rhead);

		/* two cases now: head <= rtail or head >= rhead  */
		if (head <= kring->rtail) {
			/* want head <= cur <= rtail */
			NM_FAIL_ON(cur < head || cur > kring->rtail);
		} else { /* head >= rhead */
			/* cur must be outside rtail..head */
			NM_FAIL_ON(cur > kring->rtail && cur < head);
		}
	}
	if (ring->tail != kring->rtail) {
		nm_prlim(5, "%s tail overwritten was %d need %d", kring->name,
			ring->tail, kring->rtail);
		ring->tail = kring->rtail;
	}
	kring->rhead = head;
	kring->rcur = cur;
	return head;
}

/*
 * validate parameters on entry for *_rxsync()
 * Returns ring->head if ok, kring->nkr_num_slots on error.
 *
 * For a valid configuration,
 * hwcur <= head <= cur <= tail <= hwtail
 *
 * We only consider head and cur.
 * hwcur and hwtail are reliable.
 */
u_int
nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
{
	uint32_t const n = kring->nkr_num_slots;
	uint32_t head, cur;

	nm_prdis(5,"%s kc %d kt %d h %d c %d t %d",
		kring->name,
		kring->nr_hwcur, kring->nr_hwtail,
		ring->head, ring->cur, ring->tail);
	/*
	 * Before storing the new values, we should check they do not
	 * move backwards. However:
	 * - head is not an issue because the previous value is hwcur;
	 * - cur could in principle go back, however it does not matter
	 *   because we are processing a brand new rxsync()
	 */
	cur = kring->rcur = ring->cur;	/* read only once */
	head = kring->rhead = ring->head;	/* read only once */
#if 1 /* kernel sanity checks */
	NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n);
#endif /* kernel sanity checks */
	/* user sanity checks */
	if (kring->nr_hwtail >= kring->nr_hwcur) {
		/* want hwcur <= rhead <= hwtail */
		NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail);
		/* and also rhead <= rcur <= hwtail */
		NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
	} else {
		/* we need rhead outside hwtail..hwcur */
		NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail);
		/* two cases now: head <= hwtail or head >= hwcur  */
		if (head <= kring->nr_hwtail) {
			/* want head <= cur <= hwtail */
			NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
		} else {
			/* cur must be outside hwtail..head */
			NM_FAIL_ON(cur < head && cur > kring->nr_hwtail);
		}
	}
	if (ring->tail != kring->rtail) {
		nm_prlim(5, "%s tail overwritten was %d need %d",
			kring->name,
			ring->tail, kring->rtail);
		ring->tail = kring->rtail;
	}
	return head;
}

/*
 * Error routine called when txsync/rxsync detects an error.
 * Can't do much more than resetting head = cur = hwcur, tail = hwtail
 * Return 1 on reinit.
 *
 * This routine is only called by the upper half of the kernel.
 * It only reads hwcur (which is changed only by the upper half, too)
 * and hwtail (which may be changed by the lower half, but only on
 * a tx ring and only to increase it, so any error will be recovered
 * on the next call). For the above, we don't strictly need to call
 * this routine under lock.
 */
int
netmap_ring_reinit(struct netmap_kring *kring)
{
	struct netmap_ring *ring = kring->ring;
	u_int i, lim = kring->nkr_num_slots - 1;
	int errors = 0;

	// XXX KASSERT nm_kr_tryget
	nm_prlim(10, "called for %s", kring->name);
	// XXX probably wrong to trust userspace
	kring->rhead = ring->head;
	kring->rcur  = ring->cur;
	kring->rtail = ring->tail;

	if (ring->cur > lim)
		errors++;
	if (ring->head > lim)
		errors++;
	if (ring->tail > lim)
		errors++;
	for (i = 0; i <= lim; i++) {
		u_int idx = ring->slot[i].buf_idx;
		u_int len = ring->slot[i].len;
		if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
			nm_prlim(5, "bad index at slot %d idx %d len %d ", i, idx, len);
			ring->slot[i].buf_idx = 0;
			ring->slot[i].len = 0;
		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
			ring->slot[i].len = 0;
			nm_prlim(5, "bad len at slot %d idx %d len %d", i, idx, len);
		}
	}
	if (errors) {
		nm_prlim(10, "total %d errors", errors);
		nm_prlim(10, "%s reinit, cur %d -> %d tail %d -> %d",
			kring->name,
			ring->cur, kring->nr_hwcur,
			ring->tail, kring->nr_hwtail);
		ring->head = kring->rhead = kring->nr_hwcur;
		ring->cur  = kring->rcur  = kring->nr_hwcur;
		ring->tail = kring->rtail = kring->nr_hwtail;
	}
	return (errors ? 1 : 0);
}

/* interpret the ringid and flags fields of an nmreq, by translating them
 * into a pair of intervals of ring indices:
 *
 * [priv->np_txqfirst, priv->np_txqlast) and
 * [priv->np_rxqfirst, priv->np_rxqlast)
 *
 */
int
netmap_interp_ringid(struct netmap_priv_d *priv, struct nmreq_header *hdr)
{
	struct netmap_adapter *na = priv->np_na;
	struct nmreq_register *reg = (struct nmreq_register *)hdr->nr_body;
	int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY };
	enum txrx t;
	u_int j;
	u_int nr_flags = reg->nr_flags, nr_mode = reg->nr_mode,
	      nr_ringid = reg->nr_ringid;

	for_rx_tx(t) {
		if (nr_flags & excluded_direction[t]) {
			priv->np_qfirst[t] = priv->np_qlast[t] = 0;
			continue;
		}
		switch (nr_mode) {
		case NR_REG_ALL_NIC:
		case NR_REG_NULL:
			priv->np_qfirst[t] = 0;
			priv->np_qlast[t] = nma_get_nrings(na, t);
			nm_prdis("ALL/PIPE: %s %d %d", nm_txrx2str(t),
				priv->np_qfirst[t], priv->np_qlast[t]);
			break;
		case NR_REG_SW:
		case NR_REG_NIC_SW:
			if (!(na->na_flags & NAF_HOST_RINGS)) {
				nm_prerr("host rings not supported");
				return EINVAL;
			}
			priv->np_qfirst[t] = (nr_mode == NR_REG_SW ?
				nma_get_nrings(na, t) : 0);
			priv->np_qlast[t] = netmap_all_rings(na, t);
			nm_prdis("%s: %s %d %d", nr_mode == NR_REG_SW ? "SW" : "NIC+SW",
				nm_txrx2str(t),
				priv->np_qfirst[t], priv->np_qlast[t]);
			break;
		case NR_REG_ONE_NIC:
			if (nr_ringid >= na->num_tx_rings &&
			    nr_ringid >= na->num_rx_rings) {
				nm_prerr("invalid ring id %d", nr_ringid);
				return EINVAL;
			}
			/* if not enough rings, use the first one */
			j = nr_ringid;
			if (j >= nma_get_nrings(na, t))
				j = 0;
			priv->np_qfirst[t] = j;
			priv->np_qlast[t] = j + 1;
			nm_prdis("ONE_NIC: %s %d %d", nm_txrx2str(t),
				priv->np_qfirst[t], priv->np_qlast[t]);
			break;
		case NR_REG_ONE_SW:
			if (!(na->na_flags & NAF_HOST_RINGS)) {
				nm_prerr("host rings not supported");
				return EINVAL;
			}
			if (nr_ringid >= na->num_host_tx_rings &&
			    nr_ringid >= na->num_host_rx_rings) {
				nm_prerr("invalid ring id %d", nr_ringid);
				return EINVAL;
			}
			/* if not enough rings, use the first one */
			j = nr_ringid;
			if (j >= nma_get_host_nrings(na, t))
				j = 0;
			priv->np_qfirst[t] = nma_get_nrings(na, t) + j;
			priv->np_qlast[t] = nma_get_nrings(na, t) + j + 1;
			nm_prdis("ONE_SW: %s %d %d", nm_txrx2str(t),
				priv->np_qfirst[t], priv->np_qlast[t]);
			break;
		default:
			nm_prerr("invalid regif type %d", nr_mode);
			return EINVAL;
		}
	}
	priv->np_flags = nr_flags;

	/* Allow transparent forwarding mode in the host --> nic
	 * direction only if all the TX hw rings have been opened. */
	if (priv->np_qfirst[NR_TX] == 0 &&
	    priv->np_qlast[NR_TX] >= na->num_tx_rings) {
		priv->np_sync_flags |= NAF_CAN_FORWARD_DOWN;
	}

	if (netmap_verbose) {
		nm_prinf("%s: tx [%d,%d) rx [%d,%d) id %d",
			na->name,
			priv->np_qfirst[NR_TX],
			priv->np_qlast[NR_TX],
			priv->np_qfirst[NR_RX],
			priv->np_qlast[NR_RX],
			nr_ringid);
	}
	return 0;
}
1983 * Set the ring ID. For devices with a single queue, a request
1984 * for all rings is the same as a single ring.
1987 netmap_set_ringid(struct netmap_priv_d *priv, struct nmreq_header *hdr)
1989 struct netmap_adapter *na = priv->np_na;
1990 struct nmreq_register *reg = (struct nmreq_register *)hdr->nr_body;
1994 error = netmap_interp_ringid(priv, hdr);
1999 priv->np_txpoll = (reg->nr_flags & NR_NO_TX_POLL) ? 0 : 1;
2001 /* optimization: count the users registered for more than
2002 * one ring, which are the ones sleeping on the global queue.
2003 * The default netmap_notify() callback will then
2004 * avoid signaling the global queue if nobody is using it
2007 if (nm_si_user(priv, t))
2014 netmap_unset_ringid(struct netmap_priv_d *priv)
2016 struct netmap_adapter *na = priv->np_na;
2020 if (nm_si_user(priv, t))
2022 priv->np_qfirst[t] = priv->np_qlast[t] = 0;
2025 priv->np_txpoll = 0;
2026 priv->np_kloop_state = 0;
2029 #define within_sel(p_, t_, i_) \
2030 ((i_) < (p_)->np_qlast[(t_)])
2031 #define nonempty_sel(p_, t_) \
2032 (within_sel((p_), (t_), (p_)->np_qfirst[(t_)]))
2033 #define foreach_selected_ring(p_, t_, i_, kring_) \
2034 for ((t_) = nonempty_sel((p_), NR_RX) ? NR_RX : NR_TX, \
2035 (i_) = (p_)->np_qfirst[(t_)]; \
2037 ((t_) == NR_TX && within_sel((p_), (t_), (i_)))) && \
2038 ((kring_) = NMR((p_)->np_na, (t_))[(i_)]); \
2039 (i_) = within_sel((p_), (t_), (i_) + 1) ? (i_) + 1 : \
2040 (++(t_) < NR_TXRX ? (p_)->np_qfirst[(t_)] : (i_)))
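/*
 * Illustrative sketch (not part of the original logic): a minimal use of
 * foreach_selected_ring(), iterating once per bound kring in the same
 * way as netmap_krings_get() below. The helper name is hypothetical.
 *
 *	static u_int
 *	nm_count_selected_rings(struct netmap_priv_d *priv)
 *	{
 *		struct netmap_kring *kring;
 *		enum txrx t;
 *		u_int i, n = 0;
 *
 *		foreach_selected_ring(priv, t, i, kring)
 *			n++;
 *		return n;
 *	}
 */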
2043 /* Set the nr_pending_mode for the requested rings.
2044 * If requested, also try to get exclusive access to the rings, provided
2045 * the rings we want to bind are not exclusively owned by a previous bind.
2048 netmap_krings_get(struct netmap_priv_d *priv)
2050 struct netmap_adapter *na = priv->np_na;
2052 struct netmap_kring *kring;
2053 int excl = (priv->np_flags & NR_EXCLUSIVE);
2056 if (netmap_debug & NM_DEBUG_ON)
2057 nm_prinf("%s: grabbing tx [%d, %d) rx [%d, %d)",
2059 priv->np_qfirst[NR_TX],
2060 priv->np_qlast[NR_TX],
2061 priv->np_qfirst[NR_RX],
2062 priv->np_qlast[NR_RX]);
2064 /* first round: check that all the requested rings
2065 * are neither already exclusively owned, nor we
2066 * want exclusive ownership when they are already in use
2068 foreach_selected_ring(priv, t, i, kring) {
2069 if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
2070 (kring->users && excl))
2072 nm_prdis("ring %s busy", kring->name);
2077 /* second round: increment usage count (possibly marking them
2078 * as exclusive) and set the nr_pending_mode
2080 foreach_selected_ring(priv, t, i, kring) {
2083 kring->nr_kflags |= NKR_EXCLUSIVE;
2084 kring->nr_pending_mode = NKR_NETMAP_ON;
2091 /* Undo netmap_krings_get(). This is done by clearing the exclusive mode
2092 * if it was asked at regif time, and by unsetting the nr_pending_mode
2093 * if we are the last users of the involved rings. */
2095 netmap_krings_put(struct netmap_priv_d *priv)
2098 struct netmap_kring *kring;
2099 int excl = (priv->np_flags & NR_EXCLUSIVE);
2102 nm_prdis("%s: releasing tx [%d, %d) rx [%d, %d)",
2104 priv->np_qfirst[NR_TX],
2105 priv->np_qlast[NR_TX],
2106 priv->np_qfirst[NR_RX],
2107 priv->np_qlast[NR_RX]);
2109 foreach_selected_ring(priv, t, i, kring) {
2111 kring->nr_kflags &= ~NKR_EXCLUSIVE;
2113 if (kring->users == 0)
2114 kring->nr_pending_mode = NKR_NETMAP_OFF;
2119 nm_priv_rx_enabled(struct netmap_priv_d *priv)
2121 return (priv->np_qfirst[NR_RX] != priv->np_qlast[NR_RX]);
2124 /* Validate the CSB entries for both directions (atok and ktoa).
2125 * To be called under NMG_LOCK(). */
2127 netmap_csb_validate(struct netmap_priv_d *priv, struct nmreq_opt_csb *csbo)
2129 struct nm_csb_atok *csb_atok_base =
2130 (struct nm_csb_atok *)(uintptr_t)csbo->csb_atok;
2131 struct nm_csb_ktoa *csb_ktoa_base =
2132 (struct nm_csb_ktoa *)(uintptr_t)csbo->csb_ktoa;
2134 int num_rings[NR_TXRX], tot_rings;
2135 size_t entry_size[2];
2139 if (priv->np_kloop_state & NM_SYNC_KLOOP_RUNNING) {
2140 nm_prerr("Cannot update CSB while kloop is running");
2146 num_rings[t] = priv->np_qlast[t] - priv->np_qfirst[t];
2147 tot_rings += num_rings[t];
2152 if (!(priv->np_flags & NR_EXCLUSIVE)) {
2153 nm_prerr("CSB mode requires NR_EXCLUSIVE");
2157 entry_size[0] = sizeof(*csb_atok_base);
2158 entry_size[1] = sizeof(*csb_ktoa_base);
2159 csb_start[0] = (void *)csb_atok_base;
2160 csb_start[1] = (void *)csb_ktoa_base;
2162 for (i = 0; i < 2; i++) {
2163 /* On Linux we could use access_ok() to simplify
2164 * the validation. However, the advantage of
2165 * this approach is that it works also on
2167 size_t csb_size = tot_rings * entry_size[i];
2171 if ((uintptr_t)csb_start[i] & (entry_size[i]-1)) {
2172 nm_prerr("Unaligned CSB address");
2176 tmp = nm_os_malloc(csb_size);
2180 /* Application --> kernel direction. */
2181 err = copyin(csb_start[i], tmp, csb_size);
2183 /* Kernel --> application direction. */
2184 memset(tmp, 0, csb_size);
2185 err = copyout(tmp, csb_start[i], csb_size);
2189 nm_prerr("Invalid CSB address");
2194 priv->np_csb_atok_base = csb_atok_base;
2195 priv->np_csb_ktoa_base = csb_ktoa_base;
2197 /* Initialize the CSB. */
2199 for (i = 0; i < num_rings[t]; i++) {
2200 struct netmap_kring *kring =
2201 NMR(priv->np_na, t)[i + priv->np_qfirst[t]];
2202 struct nm_csb_atok *csb_atok = csb_atok_base + i;
2203 struct nm_csb_ktoa *csb_ktoa = csb_ktoa_base + i;
2206 csb_atok += num_rings[NR_TX];
2207 csb_ktoa += num_rings[NR_TX];
2210 CSB_WRITE(csb_atok, head, kring->rhead);
2211 CSB_WRITE(csb_atok, cur, kring->rcur);
2212 CSB_WRITE(csb_atok, appl_need_kick, 1);
2213 CSB_WRITE(csb_atok, sync_flags, 1);
2214 CSB_WRITE(csb_ktoa, hwcur, kring->nr_hwcur);
2215 CSB_WRITE(csb_ktoa, hwtail, kring->nr_hwtail);
2216 CSB_WRITE(csb_ktoa, kern_need_kick, 1);
2218 nm_prinf("csb_init for kring %s: head %u, cur %u, "
2219 "hwcur %u, hwtail %u", kring->name,
2220 kring->rhead, kring->rcur, kring->nr_hwcur,
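/*
 * Userspace sketch (illustrative; assumes the nm_csb_atok, nm_csb_ktoa
 * and nmreq_opt_csb definitions exported by the netmap headers): the
 * application allocates one atok and one ktoa entry per bound ring,
 * TX block first and RX block after it, matching the layout that the
 * initialization above expects. The ring counts are placeholders.
 *
 *	struct nmreq_opt_csb csbo;
 *	void *atok, *ktoa;
 *	int tot = n_tx + n_rx;
 *
 *	posix_memalign(&atok, sizeof(struct nm_csb_atok),
 *	    tot * sizeof(struct nm_csb_atok));
 *	posix_memalign(&ktoa, sizeof(struct nm_csb_ktoa),
 *	    tot * sizeof(struct nm_csb_ktoa));
 *	memset(&csbo, 0, sizeof(csbo));
 *	csbo.nro_opt.nro_reqtype = NETMAP_REQ_OPT_CSB;
 *	csbo.csb_atok = (uintptr_t)atok;
 *	csbo.csb_ktoa = (uintptr_t)ktoa;
 *	(link csbo.nro_opt into hdr->nr_options before issuing NIOCCTRL)
 */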
2228 /* Ensure that the netmap adapter can support the given MTU.
2229 * @return EINVAL if the na cannot be set to mtu, 0 otherwise.
2232 netmap_buf_size_validate(const struct netmap_adapter *na, unsigned mtu) {
2233 unsigned nbs = NETMAP_BUF_SIZE(na);
2235 if (mtu <= na->rx_buf_maxsize) {
2236 /* The MTU fits a single NIC slot. We only
2237 * need to check that netmap buffers are
2238 * large enough to hold an MTU. NS_MOREFRAG
2239 * cannot be used in this case. */
2241 nm_prerr("error: netmap buf size (%u) "
2242 "< device MTU (%u)", nbs, mtu);
2246 /* More NIC slots may be needed to receive
2247 * or transmit a single packet. Check that
2248 * the adapter supports NS_MOREFRAG and that
2249 * netmap buffers are large enough to hold
2250 * the maximum per-slot size. */
2251 if (!(na->na_flags & NAF_MOREFRAG)) {
2252 nm_prerr("error: large MTU (%d) needed "
2253 "but %s does not support "
2257 } else if (nbs < na->rx_buf_maxsize) {
2258 nm_prerr("error: using NS_MOREFRAG on "
2259 "%s requires netmap buf size "
2260 ">= %u", na->ifp->if_xname,
2261 na->rx_buf_maxsize);
2264 nm_prinf("info: netmap application on "
2265 "%s needs to support "
2267 "(MTU=%u,netmap_buf_size=%u)",
2268 na->ifp->if_xname, mtu, nbs);
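/*
 * Worked example (illustrative): with the default 2048-byte netmap
 * buffers, a 1500-byte MTU fits in a single slot, so NS_MOREFRAG is
 * not needed. A 9000-byte jumbo MTU does not fit: the bind succeeds
 * only if the adapter advertises NAF_MOREFRAG, and the application
 * must then be prepared to reassemble frames split across slots.
 */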
2274 /* Handle the offset option, if present in the hdr.
2275 * Returns 0 on success, or an error.
2278 netmap_offsets_init(struct netmap_priv_d *priv, struct nmreq_header *hdr)
2280 struct nmreq_opt_offsets *opt;
2281 struct netmap_adapter *na = priv->np_na;
2282 struct netmap_kring *kring;
2283 uint64_t mask = 0, bits = 0, maxbits = sizeof(uint64_t) * 8,
2284 max_offset = 0, initial_offset = 0, min_gap = 0;
2289 opt = (struct nmreq_opt_offsets *)
2290 nmreq_getoption(hdr, NETMAP_REQ_OPT_OFFSETS);
2294 if (!(na->na_flags & NAF_OFFSETS)) {
2296 nm_prerr("%s does not support offsets",
2302 /* check sanity of the opt values */
2303 max_offset = opt->nro_max_offset;
2304 min_gap = opt->nro_min_gap;
2305 initial_offset = opt->nro_initial_offset;
2306 bits = opt->nro_offset_bits;
2308 if (bits > maxbits) {
2310 nm_prerr("bits: %llu too large (max %llu)",
2311 (unsigned long long)bits,
2312 (unsigned long long)maxbits);
2316 /* we take bits == 0 as a request to use the entire field */
2317 if (bits == 0 || bits == maxbits) {
2318 /* shifting a type by sizeof(type) is undefined */
2320 mask = 0xffffffffffffffff;
2322 mask = (1ULL << bits) - 1;
2324 if (max_offset > NETMAP_BUF_SIZE(na)) {
2326 nm_prerr("max offset %llu > buf size %u",
2327 (unsigned long long)max_offset, NETMAP_BUF_SIZE(na));
2331 if ((max_offset & mask) != max_offset) {
2333 nm_prerr("max offset %llu to large for %llu bits",
2334 (unsigned long long)max_offset,
2335 (unsigned long long)bits);
2339 if (initial_offset > max_offset) {
2341 nm_prerr("initial offset %llu > max offset %llu",
2342 (unsigned long long)initial_offset,
2343 (unsigned long long)max_offset);
2348 /* initialize the kring and ring fields. */
2349 foreach_selected_ring(priv, t, i, kring) {
2350 struct netmap_kring *kring = NMR(na, t)[i];
2351 struct netmap_ring *ring = kring->ring;
2354 /* if the ring is already in use we check that the
2355 * new request is compatible with the existing one
2357 if (kring->offset_mask) {
2358 if ((kring->offset_mask & mask) != mask ||
2359 kring->offset_max < max_offset) {
2361 nm_prinf("%s: cannot increase"
2362 "offset mask and/or max"
2363 "(current: mask=%llx,max=%llu",
2365 (unsigned long long)kring->offset_mask,
2366 (unsigned long long)kring->offset_max);
2370 mask = kring->offset_mask;
2371 max_offset = kring->offset_max;
2373 kring->offset_mask = mask;
2374 *(uint64_t *)(uintptr_t)&ring->offset_mask = mask;
2375 kring->offset_max = max_offset;
2376 kring->offset_gap = min_gap;
2379 /* if there is an initial offset, put it into
2382 * Note: we cannot change the offsets if the
2383 * ring is already in use.
2385 if (!initial_offset || kring->users > 1)
2388 for (j = 0; j < kring->nkr_num_slots; j++) {
2389 struct netmap_slot *slot = ring->slot + j;
2391 nm_write_offset(kring, slot, initial_offset);
2396 opt->nro_opt.nro_status = error;
2398 opt->nro_max_offset = max_offset;
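/*
 * Userspace sketch (illustrative; the numeric values are examples):
 * requesting per-slot offsets at registration time through the
 * NETMAP_REQ_OPT_OFFSETS option handled above.
 *
 *	struct nmreq_opt_offsets offs;
 *
 *	memset(&offs, 0, sizeof(offs));
 *	offs.nro_opt.nro_reqtype = NETMAP_REQ_OPT_OFFSETS;
 *	offs.nro_offset_bits = 0;	use the entire offset field
 *	offs.nro_max_offset = 128;	offsets never exceed 128 bytes
 *	offs.nro_initial_offset = 64;	pre-load every slot with 64
 *	offs.nro_min_gap = 0;		buffers are not shared
 *	(link offs.nro_opt into hdr->nr_options before issuing NIOCCTRL)
 */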
2405 /* set the hardware buffer length in each one of the newly opened rings
2406 * (hwbuf_len field in the kring struct). The purpose is to select
2407 * the maximum supported input buffer length that will not cause writes
2408 * outside of the available space, even when offsets are in use.
2411 netmap_compute_buf_len(struct netmap_priv_d *priv)
2415 struct netmap_kring *kring;
2418 struct netmap_adapter *na = priv->np_na;
2421 foreach_selected_ring(priv, t, i, kring) {
2422 /* rings that are already active have their hwbuf_len
2423 * already set and we cannot change it.
2425 if (kring->users > 1)
2428 /* For netmap buffers which are not shared among several ring
2429 * slots (the normal case), the available space is the buf size
2430 * minus the max offset declared by the user at open time. If
2431 * the user plans to have several slots pointing to different
2432 * offsets into the same large buffer, she must also declare a
2433 * "minimum gap" between two such consecutive offsets. In this
2434 * case the user-declared 'offset_gap' is taken as the
2435 * available space and offset_max is ignored.
2438 /* start with the normal case (unshared buffers) */
2439 target = NETMAP_BUF_SIZE(kring->na) -
2441 /* if offset_gap is zero, the user does not intend to use
2442 * shared buffers. In this case the minimum gap between
2443 * two consecutive offsets into the same buffer can be
2444 * assumed to be equal to the buffer size. In this way
2445 * offset_gap always contains the available space ignoring
2446 * offset_max. This may be used by drivers of NICs that
2447 * are guaranteed to never write more than MTU bytes, even
2448 * if the input buffer is larger: if the MTU is less
2449 * than the target they can set hwbuf_len to offset_gap.
2451 if (!kring->offset_gap)
2453 NETMAP_BUF_SIZE(kring->na);
2455 if (kring->offset_gap < target)
2456 target = kring->offset_gap;
2457 error = kring->nm_bufcfg(kring, target);
2461 *(uint64_t *)(uintptr_t)&kring->ring->buf_align = kring->buf_align;
2463 if (mtu && t == NR_RX && kring->hwbuf_len < mtu) {
2464 if (!(na->na_flags & NAF_MOREFRAG)) {
2465 nm_prerr("error: large MTU (%d) needed "
2466 "but %s does not support "
2472 nm_prinf("info: netmap application on "
2473 "%s needs to support "
2475 "(MTU=%u,buf_size=%llu)",
2477 (unsigned long long)kring->hwbuf_len);
2486 * possibly move the interface to netmap mode.
2487 * On success it returns a pointer to the netmap_if, otherwise NULL.
2488 * This must be called with NMG_LOCK held.
2490 * The following na callbacks are called in the process:
2492 * na->nm_config() [by netmap_update_config]
2493 * (get current number and size of rings)
2495 * We have a generic one for linux (netmap_linux_config).
2496 * The bwrap has to override this, since it has to forward
2497 * the request to the wrapped adapter (netmap_bwrap_config).
2500 * na->nm_krings_create()
2501 * (create and init the krings array)
2503 * One of the following:
2505 * * netmap_hw_krings_create, (hw ports)
2506 * creates the standard layout for the krings
2507 * and adds the mbq (used for the host rings).
2509 * * netmap_vp_krings_create (VALE ports)
2510 * add leases and scratchpads
2512 * * netmap_pipe_krings_create (pipes)
2513 * create the krings and rings of both ends and
2516 * * netmap_monitor_krings_create (monitors)
2517 * avoid allocating the mbq
2519 * * netmap_bwrap_krings_create (bwraps)
2520 * create both the bwrap krings array,
2521 * the krings array of the wrapped adapter, and
2522 * (if needed) the fake array for the host adapter
2524 * na->nm_register(, 1)
2525 * (put the adapter in netmap mode)
2527 * This may be one of the following:
2529 * * netmap_hw_reg (hw ports)
2530 * checks that the ifp is still there, then calls
2531 * the hardware specific callback;
2533 * * netmap_vp_reg (VALE ports)
2534 * If the port is connected to a bridge,
2535 * set the NAF_NETMAP_ON flag under the
2536 * bridge write lock.
2538 * * netmap_pipe_reg (pipes)
2539 * inform the other pipe end that it is no
2540 * longer responsible for the lifetime of this
2543 * * netmap_monitor_reg (monitors)
2544 * intercept the sync callbacks of the monitored
2547 * * netmap_bwrap_reg (bwraps)
2548 * cross-link the bwrap and hwna rings,
2549 * forward the request to the hwna, override
2550 * the hwna notify callback (to get the frames
2551 * coming from outside go through the bridge).
2556 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
2557 struct nmreq_header *hdr)
2559 struct netmap_if *nifp = NULL;
2563 priv->np_na = na; /* store the reference */
2564 error = netmap_mem_finalize(na->nm_mem, na);
2568 if (na->active_fds == 0) {
2570 /* cache the allocator info in the na */
2571 error = netmap_mem_get_lut(na->nm_mem, &na->na_lut);
2574 nm_prdis("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal,
2575 na->na_lut.objsize);
2577 /* ring configuration may have changed, fetch from the card */
2578 netmap_update_config(na);
2581 /* compute the range of tx and rx rings to monitor */
2582 error = netmap_set_ringid(priv, hdr);
2586 if (na->active_fds == 0) {
2588 * If this is the first registration of the adapter,
2589 * perform sanity checks and create the in-kernel view
2590 * of the netmap rings (the netmap krings).
2592 if (na->ifp && nm_priv_rx_enabled(priv)) {
2593 /* This netmap adapter is attached to an ifnet. */
2594 unsigned mtu = nm_os_ifnet_mtu(na->ifp);
2596 nm_prdis("%s: mtu %d rx_buf_maxsize %d netmap_buf_size %d",
2597 na->name, mtu, na->rx_buf_maxsize, NETMAP_BUF_SIZE(na));
2599 if (na->rx_buf_maxsize == 0) {
2600 nm_prerr("%s: error: rx_buf_maxsize == 0", na->name);
2605 error = netmap_buf_size_validate(na, mtu);
2611 * Depending on the adapter, this may also create
2612 * the netmap rings themselves
2614 error = na->nm_krings_create(na);
2620 /* now the krings must exist and we can check whether some
2621 * previous bind has exclusive ownership on them, and set
2624 error = netmap_krings_get(priv);
2626 goto err_del_krings;
2628 /* create all needed missing netmap rings */
2629 error = netmap_mem_rings_create(na);
2633 /* initialize offsets if requested */
2634 error = netmap_offsets_init(priv, hdr);
2638 /* compute and validate the buf lengths */
2639 error = netmap_compute_buf_len(priv);
2643 /* in all cases, create a new netmap if */
2644 nifp = netmap_mem_if_new(na, priv);
2650 if (nm_kring_pending(priv)) {
2651 /* Some kring is switching mode, tell the adapter to
2653 netmap_set_all_rings(na, NM_KR_LOCKED);
2654 error = na->nm_register(na, 1);
2655 netmap_set_all_rings(na, 0);
2660 /* Commit the reference. */
2664 * advertise that the interface is ready by setting np_nifp.
2665 * The barrier is needed because readers (poll, *SYNC and mmap)
2666 * check for priv->np_nifp != NULL without locking
2668 mb(); /* make sure previous writes are visible to all CPUs */
2669 priv->np_nifp = nifp;
2674 netmap_mem_if_delete(na, nifp);
2676 netmap_krings_put(priv);
2677 netmap_mem_rings_delete(na);
2679 if (na->active_fds == 0)
2680 na->nm_krings_delete(na);
2682 if (na->active_fds == 0)
2683 memset(&na->na_lut, 0, sizeof(na->na_lut));
2685 netmap_mem_drop(na);
2693 * update kring and ring at the end of rxsync/txsync.
2696 nm_sync_finalize(struct netmap_kring *kring)
2699 * Update ring tail to what the kernel knows
2700 * After txsync: head/rhead/hwcur might be behind cur/rcur
2703 kring->ring->tail = kring->rtail = kring->nr_hwtail;
2705 nm_prdis(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2706 kring->name, kring->nr_hwcur, kring->nr_hwtail,
2707 kring->rhead, kring->rcur, kring->rtail);
2710 /* set ring timestamp */
2712 ring_timestamp_set(struct netmap_ring *ring)
2714 if (netmap_no_timestamp == 0 || ring->flags & NR_TIMESTAMP) {
2715 microtime(&ring->ts);
2719 static int nmreq_copyin(struct nmreq_header *, int);
2720 static int nmreq_copyout(struct nmreq_header *, int);
2721 static int nmreq_checkoptions(struct nmreq_header *);
2724 * ioctl(2) support for the "netmap" device.
2726 * The following is the list of accepted commands:
2727 * - NIOCCTRL device control API
2728 * - NIOCTXSYNC sync TX rings
2729 * - NIOCRXSYNC sync RX rings
2730 * - SIOCGIFADDR just for convenience
2731 * - NIOCGINFO deprecated (legacy API)
2732 * - NIOCREGIF deprecated (legacy API)
2734 * Return 0 on success, errno otherwise.
2737 netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
2738 struct thread *td, int nr_body_is_user)
2740 struct mbq q; /* packets from RX hw queues to host stack */
2741 struct netmap_adapter *na = NULL;
2742 struct netmap_mem_d *nmd = NULL;
2743 struct ifnet *ifp = NULL;
2745 u_int i, qfirst, qlast;
2746 struct netmap_kring **krings;
2752 struct nmreq_header *hdr = (struct nmreq_header *)data;
2754 if (hdr->nr_version < NETMAP_MIN_API ||
2755 hdr->nr_version > NETMAP_MAX_API) {
2756 nm_prerr("API mismatch: got %d need %d",
2757 hdr->nr_version, NETMAP_API);
2761 /* Make a kernel-space copy of the user-space nr_body.
2762 * For convenience, the nr_body pointer and the pointers
2763 * in the options list will be replaced with their
2764 * kernel-space counterparts. The original pointers are
2765 * saved internally and later restored by nmreq_copyout
2767 error = nmreq_copyin(hdr, nr_body_is_user);
2772 /* Sanitize hdr->nr_name. */
2773 hdr->nr_name[sizeof(hdr->nr_name) - 1] = '\0';
2775 switch (hdr->nr_reqtype) {
2776 case NETMAP_REQ_REGISTER: {
2777 struct nmreq_register *req =
2778 (struct nmreq_register *)(uintptr_t)hdr->nr_body;
2779 struct netmap_if *nifp;
2781 /* Protect access to priv from concurrent requests. */
2784 struct nmreq_option *opt;
2787 if (priv->np_nifp != NULL) { /* thread already registered */
2793 opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_EXTMEM);
2795 struct nmreq_opt_extmem *e =
2796 (struct nmreq_opt_extmem *)opt;
2798 nmd = netmap_mem_ext_create(e->nro_usrptr,
2799 &e->nro_info, &error);
2800 opt->nro_status = error;
2804 #endif /* WITH_EXTMEM */
2806 if (nmd == NULL && req->nr_mem_id) {
2807 /* find the allocator and get a reference */
2808 nmd = netmap_mem_find(req->nr_mem_id);
2810 if (netmap_verbose) {
2811 nm_prerr("%s: failed to find mem_id %u",
2812 hdr->nr_name, req->nr_mem_id);
2818 /* find the interface and a reference */
2819 error = netmap_get_na(hdr, &na, &ifp, nmd,
2820 1 /* create */); /* keep reference */
2823 if (NETMAP_OWNED_BY_KERN(na)) {
2828 if (na->virt_hdr_len && !(req->nr_flags & NR_ACCEPT_VNET_HDR)) {
2829 nm_prerr("virt_hdr_len=%d, but application does "
2830 "not accept it", na->virt_hdr_len);
2835 error = netmap_do_regif(priv, na, hdr);
2836 if (error) { /* reg. failed, release priv and ref */
2840 opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_CSB);
2842 struct nmreq_opt_csb *csbo =
2843 (struct nmreq_opt_csb *)opt;
2844 error = netmap_csb_validate(priv, csbo);
2845 opt->nro_status = error;
2847 netmap_do_unregif(priv);
2852 nifp = priv->np_nifp;
2854 /* return the offset of the netmap_if object */
2855 req->nr_rx_rings = na->num_rx_rings;
2856 req->nr_tx_rings = na->num_tx_rings;
2857 req->nr_rx_slots = na->num_rx_desc;
2858 req->nr_tx_slots = na->num_tx_desc;
2859 req->nr_host_tx_rings = na->num_host_tx_rings;
2860 req->nr_host_rx_rings = na->num_host_rx_rings;
2861 error = netmap_mem_get_info(na->nm_mem, &req->nr_memsize, &memflags,
2864 netmap_do_unregif(priv);
2867 if (memflags & NETMAP_MEM_PRIVATE) {
2868 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2871 priv->np_si[t] = nm_si_user(priv, t) ?
2872 &na->si[t] : &NMR(na, t)[priv->np_qfirst[t]]->si;
2875 if (req->nr_extra_bufs) {
2877 nm_prinf("requested %d extra buffers",
2878 req->nr_extra_bufs);
2879 req->nr_extra_bufs = netmap_extra_alloc(na,
2880 &nifp->ni_bufs_head, req->nr_extra_bufs);
2882 nm_prinf("got %d extra buffers", req->nr_extra_bufs);
2884 nifp->ni_bufs_head = 0;
2886 req->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2888 error = nmreq_checkoptions(hdr);
2890 netmap_do_unregif(priv);
2894 /* store ifp reference so that priv destructor may release it */
2898 netmap_unget_na(na, ifp);
2900 /* release the reference from netmap_mem_find() or
2901 * netmap_mem_ext_create()
2904 netmap_mem_put(nmd);
2909 case NETMAP_REQ_PORT_INFO_GET: {
2910 struct nmreq_port_info_get *req =
2911 (struct nmreq_port_info_get *)(uintptr_t)hdr->nr_body;
2918 if (hdr->nr_name[0] != '\0') {
2919 /* Build a nmreq_register out of the nmreq_port_info_get,
2920 * so that we can call netmap_get_na(). */
2921 struct nmreq_register regreq;
2922 bzero(&regreq, sizeof(regreq));
2923 regreq.nr_mode = NR_REG_ALL_NIC;
2924 regreq.nr_tx_slots = req->nr_tx_slots;
2925 regreq.nr_rx_slots = req->nr_rx_slots;
2926 regreq.nr_tx_rings = req->nr_tx_rings;
2927 regreq.nr_rx_rings = req->nr_rx_rings;
2928 regreq.nr_host_tx_rings = req->nr_host_tx_rings;
2929 regreq.nr_host_rx_rings = req->nr_host_rx_rings;
2930 regreq.nr_mem_id = req->nr_mem_id;
2932 /* get a refcount */
2933 hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2934 hdr->nr_body = (uintptr_t)&regreq;
2935 error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2936 hdr->nr_reqtype = NETMAP_REQ_PORT_INFO_GET; /* reset type */
2937 hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2943 nmd = na->nm_mem; /* get memory allocator */
2945 nmd = netmap_mem_find(req->nr_mem_id ? req->nr_mem_id : 1);
2948 nm_prerr("%s: failed to find mem_id %u",
2950 req->nr_mem_id ? req->nr_mem_id : 1);
2957 error = netmap_mem_get_info(nmd, &req->nr_memsize, &memflags,
2961 if (na == NULL) /* only memory info */
2963 netmap_update_config(na);
2964 req->nr_rx_rings = na->num_rx_rings;
2965 req->nr_tx_rings = na->num_tx_rings;
2966 req->nr_rx_slots = na->num_rx_desc;
2967 req->nr_tx_slots = na->num_tx_desc;
2968 req->nr_host_tx_rings = na->num_host_tx_rings;
2969 req->nr_host_rx_rings = na->num_host_rx_rings;
2971 netmap_unget_na(na, ifp);
2973 netmap_mem_put(nmd);
2978 case NETMAP_REQ_VALE_ATTACH: {
2979 error = netmap_bdg_attach(hdr, NULL /* userspace request */);
2983 case NETMAP_REQ_VALE_DETACH: {
2984 error = netmap_bdg_detach(hdr, NULL /* userspace request */);
2988 case NETMAP_REQ_PORT_HDR_SET: {
2989 struct nmreq_port_hdr *req =
2990 (struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2991 /* Build a nmreq_register out of the nmreq_port_hdr,
2992 * so that we can call netmap_get_bdg_na(). */
2993 struct nmreq_register regreq;
2994 bzero(&regreq, sizeof(regreq));
2995 regreq.nr_mode = NR_REG_ALL_NIC;
2997 /* For now we only support virtio-net headers, and only for
2998 * VALE ports, but this may change in future. Valid lengths
2999 * for the virtio-net header are 0 (no header), 10 and 12. */
3000 if (req->nr_hdr_len != 0 &&
3001 req->nr_hdr_len != sizeof(struct nm_vnet_hdr) &&
3002 req->nr_hdr_len != 12) {
3004 nm_prerr("invalid hdr_len %u", req->nr_hdr_len);
3009 hdr->nr_reqtype = NETMAP_REQ_REGISTER;
3010 hdr->nr_body = (uintptr_t)&regreq;
3011 error = netmap_get_vale_na(hdr, &na, NULL, 0);
3012 hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_SET;
3013 hdr->nr_body = (uintptr_t)req;
3015 struct netmap_vp_adapter *vpna =
3016 (struct netmap_vp_adapter *)na;
3017 na->virt_hdr_len = req->nr_hdr_len;
3018 if (na->virt_hdr_len) {
3019 vpna->mfs = NETMAP_BUF_SIZE(na);
3022 nm_prinf("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
3023 netmap_adapter_put(na);
3031 case NETMAP_REQ_PORT_HDR_GET: {
3032 /* Get vnet-header length for this netmap port */
3033 struct nmreq_port_hdr *req =
3034 (struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
3035 /* Build a nmreq_register out of the nmreq_port_hdr,
3036 * so that we can call netmap_get_bdg_na(). */
3037 struct nmreq_register regreq;
3040 bzero(&regreq, sizeof(regreq));
3041 regreq.nr_mode = NR_REG_ALL_NIC;
3043 hdr->nr_reqtype = NETMAP_REQ_REGISTER;
3044 hdr->nr_body = (uintptr_t)&regreq;
3045 error = netmap_get_na(hdr, &na, &ifp, NULL, 0);
3046 hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_GET;
3047 hdr->nr_body = (uintptr_t)req;
3049 req->nr_hdr_len = na->virt_hdr_len;
3051 netmap_unget_na(na, ifp);
3056 case NETMAP_REQ_VALE_LIST: {
3057 error = netmap_vale_list(hdr);
3061 case NETMAP_REQ_VALE_NEWIF: {
3062 error = nm_vi_create(hdr);
3066 case NETMAP_REQ_VALE_DELIF: {
3067 error = nm_vi_destroy(hdr->nr_name);
3070 #endif /* WITH_VALE */
3072 case NETMAP_REQ_VALE_POLLING_ENABLE:
3073 case NETMAP_REQ_VALE_POLLING_DISABLE: {
3074 error = nm_bdg_polling(hdr);
3077 case NETMAP_REQ_POOLS_INFO_GET: {
3078 /* Get information from the memory allocator used for
3080 struct nmreq_pools_info *req =
3081 (struct nmreq_pools_info *)(uintptr_t)hdr->nr_body;
3084 /* Build a nmreq_register out of the nmreq_pools_info,
3085 * so that we can call netmap_get_na(). */
3086 struct nmreq_register regreq;
3087 bzero(&regreq, sizeof(regreq));
3088 regreq.nr_mem_id = req->nr_mem_id;
3089 regreq.nr_mode = NR_REG_ALL_NIC;
3091 hdr->nr_reqtype = NETMAP_REQ_REGISTER;
3092 hdr->nr_body = (uintptr_t)&regreq;
3093 error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
3094 hdr->nr_reqtype = NETMAP_REQ_POOLS_INFO_GET; /* reset type */
3095 hdr->nr_body = (uintptr_t)req; /* reset nr_body */
3101 nmd = na->nm_mem; /* grab the memory allocator */
3107 /* Finalize the memory allocator, get the pools
3108 * information and release the allocator. */
3109 error = netmap_mem_finalize(nmd, na);
3113 error = netmap_mem_pools_info_get(req, nmd);
3114 netmap_mem_drop(na);
3116 netmap_unget_na(na, ifp);
3121 case NETMAP_REQ_CSB_ENABLE: {
3122 struct nmreq_option *opt;
3124 opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_CSB);
3128 struct nmreq_opt_csb *csbo =
3129 (struct nmreq_opt_csb *)opt;
3131 error = netmap_csb_validate(priv, csbo);
3133 opt->nro_status = error;
3138 case NETMAP_REQ_SYNC_KLOOP_START: {
3139 error = netmap_sync_kloop(priv, hdr);
3143 case NETMAP_REQ_SYNC_KLOOP_STOP: {
3144 error = netmap_sync_kloop_stop(priv);
3153 /* Write back request body to userspace and reset the
3154 * user-space pointer. */
3155 error = nmreq_copyout(hdr, error);
3161 if (unlikely(priv->np_nifp == NULL)) {
3165 mb(); /* make sure following reads are not from cache */
3167 if (unlikely(priv->np_csb_atok_base)) {
3168 nm_prerr("Invalid sync in CSB mode");
3173 na = priv->np_na; /* we have a reference */
3176 t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
3177 krings = NMR(na, t);
3178 qfirst = priv->np_qfirst[t];
3179 qlast = priv->np_qlast[t];
3180 sync_flags = priv->np_sync_flags;
3182 for (i = qfirst; i < qlast; i++) {
3183 struct netmap_kring *kring = krings[i];
3184 struct netmap_ring *ring = kring->ring;
3186 if (unlikely(nm_kr_tryget(kring, 1, &error))) {
3187 error = (error ? EIO : 0);
3191 if (cmd == NIOCTXSYNC) {
3192 if (netmap_debug & NM_DEBUG_TXSYNC)
3193 nm_prinf("pre txsync ring %d cur %d hwcur %d",
3196 if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3197 netmap_ring_reinit(kring);
3198 } else if (kring->nm_sync(kring, sync_flags | NAF_FORCE_RECLAIM) == 0) {
3199 nm_sync_finalize(kring);
3201 if (netmap_debug & NM_DEBUG_TXSYNC)
3202 nm_prinf("post txsync ring %d cur %d hwcur %d",
3206 if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3207 netmap_ring_reinit(kring);
3209 if (nm_may_forward_up(kring)) {
3210 /* transparent forwarding, see netmap_poll() */
3211 netmap_grab_packets(kring, &q, netmap_fwd);
3213 if (kring->nm_sync(kring, sync_flags | NAF_FORCE_READ) == 0) {
3214 nm_sync_finalize(kring);
3216 ring_timestamp_set(ring);
3222 netmap_send_up(na->ifp, &q);
3229 return netmap_ioctl_legacy(priv, cmd, data, td);
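/*
 * Userspace sketch (illustrative; error handling omitted and the
 * interface name is an example): querying a port through the NIOCCTRL
 * path handled above.
 *
 *	struct nmreq_header hdr;
 *	struct nmreq_port_info_get req;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	memset(&hdr, 0, sizeof(hdr));
 *	memset(&req, 0, sizeof(req));
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_PORT_INFO_GET;
 *	strlcpy(hdr.nr_name, "em0", sizeof(hdr.nr_name));
 *	hdr.nr_body = (uintptr_t)&req;
 *	if (ioctl(fd, NIOCCTRL, &hdr) == 0)
 *		printf("tx %u/%u rx %u/%u\n", req.nr_tx_rings,
 *		    req.nr_tx_slots, req.nr_rx_rings, req.nr_rx_slots);
 */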
3238 nmreq_size_by_type(uint16_t nr_reqtype)
3240 switch (nr_reqtype) {
3241 case NETMAP_REQ_REGISTER:
3242 return sizeof(struct nmreq_register);
3243 case NETMAP_REQ_PORT_INFO_GET:
3244 return sizeof(struct nmreq_port_info_get);
3245 case NETMAP_REQ_VALE_ATTACH:
3246 return sizeof(struct nmreq_vale_attach);
3247 case NETMAP_REQ_VALE_DETACH:
3248 return sizeof(struct nmreq_vale_detach);
3249 case NETMAP_REQ_VALE_LIST:
3250 return sizeof(struct nmreq_vale_list);
3251 case NETMAP_REQ_PORT_HDR_SET:
3252 case NETMAP_REQ_PORT_HDR_GET:
3253 return sizeof(struct nmreq_port_hdr);
3254 case NETMAP_REQ_VALE_NEWIF:
3255 return sizeof(struct nmreq_vale_newif);
3256 case NETMAP_REQ_VALE_DELIF:
3257 case NETMAP_REQ_SYNC_KLOOP_STOP:
3258 case NETMAP_REQ_CSB_ENABLE:
3260 case NETMAP_REQ_VALE_POLLING_ENABLE:
3261 case NETMAP_REQ_VALE_POLLING_DISABLE:
3262 return sizeof(struct nmreq_vale_polling);
3263 case NETMAP_REQ_POOLS_INFO_GET:
3264 return sizeof(struct nmreq_pools_info);
3265 case NETMAP_REQ_SYNC_KLOOP_START:
3266 return sizeof(struct nmreq_sync_kloop_start);
3272 nmreq_opt_size_by_type(uint32_t nro_reqtype, uint64_t nro_size)
3274 size_t rv = sizeof(struct nmreq_option);
3275 #ifdef NETMAP_REQ_OPT_DEBUG
3276 if (nro_reqtype & NETMAP_REQ_OPT_DEBUG)
3277 return (nro_reqtype & ~NETMAP_REQ_OPT_DEBUG);
3278 #endif /* NETMAP_REQ_OPT_DEBUG */
3279 switch (nro_reqtype) {
3281 case NETMAP_REQ_OPT_EXTMEM:
3282 rv = sizeof(struct nmreq_opt_extmem);
3284 #endif /* WITH_EXTMEM */
3285 case NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS:
3289 case NETMAP_REQ_OPT_CSB:
3290 rv = sizeof(struct nmreq_opt_csb);
3292 case NETMAP_REQ_OPT_SYNC_KLOOP_MODE:
3293 rv = sizeof(struct nmreq_opt_sync_kloop_mode);
3295 case NETMAP_REQ_OPT_OFFSETS:
3296 rv = sizeof(struct nmreq_opt_offsets);
3299 /* subtract the common header */
3300 return rv - sizeof(struct nmreq_option);
3304 * nmreq_copyin: create an in-kernel version of the request.
3306 * We build the following data structure:
3308 * hdr -> +-------+ buf
3309 * | | +---------------+
3310 * +-------+ |usr body ptr |
3311 * |options|-. +---------------+
3312 * +-------+ | |usr options ptr|
3313 * |body |--------->+---------------+
3315 * | | copy of body |
3317 * | +---------------+
3319 * | +---------------+
3321 * | | +---------------+ |
3323 * | | | +---------------+ \ option table
3324 * | | | | ... | / indexed by option
3325 * | | | +---------------+ | type
3327 * | | | +---------------+/
3328 * | | | |usr next ptr 1 |
3329 * `-|----->+---------------+
3330 * | | | copy of opt 1 |
3332 * | | .-| nro_next |
3333 * | | | +---------------+
3334 * | | | |usr next ptr 2 |
3335 * | `-`>+---------------+
3336 * | | copy of opt 2 |
3339 * | | +---------------+
3343 * `----->+---------------+
3344 * | |usr next ptr n |
3345 * `>+---------------+
3351 * The options and body fields of the hdr structure are overwritten
3352 * with in-kernel valid pointers inside the buf. The original user
3353 * pointers are saved in the buf and restored on copyout.
3354 * The list of options is copied and the pointers adjusted. The
3355 * original pointers are saved before the option they belonged to.
3357 * The option table has an entry for every available option. Entries
3358 * for options that have not been passed contain NULL.
3363 nmreq_copyin(struct nmreq_header *hdr, int nr_body_is_user)
3365 size_t rqsz, optsz, bufsz;
3367 char *ker = NULL, *p;
3368 struct nmreq_option **next, *src, **opt_tab;
3369 struct nmreq_option buf;
3372 if (hdr->nr_reserved) {
3374 nm_prerr("nr_reserved must be zero");
3378 if (!nr_body_is_user)
3381 hdr->nr_reserved = nr_body_is_user;
3383 /* compute the total size of the buffer */
3384 rqsz = nmreq_size_by_type(hdr->nr_reqtype);
3385 if (rqsz > NETMAP_REQ_MAXSIZE) {
3389 if ((rqsz && hdr->nr_body == (uintptr_t)NULL) ||
3390 (!rqsz && hdr->nr_body != (uintptr_t)NULL)) {
3391 /* Request body expected, but not found; or
3392 * request body found but unexpected. */
3394 nm_prerr("nr_body expected but not found, or vice versa");
3399 bufsz = 2 * sizeof(void *) + rqsz +
3400 NETMAP_REQ_OPT_MAX * sizeof(opt_tab);
3401 /* compute the size of the buf below the option table.
3402 * It must contain a copy of every received option structure.
3403 * For every option we also need to store a copy of the user
3407 for (src = (struct nmreq_option *)(uintptr_t)hdr->nr_options; src;
3408 src = (struct nmreq_option *)(uintptr_t)buf.nro_next)
3410 error = copyin(src, &buf, sizeof(*src));
3413 optsz += sizeof(*src);
3414 optsz += nmreq_opt_size_by_type(buf.nro_reqtype, buf.nro_size);
3415 if (rqsz + optsz > NETMAP_REQ_MAXSIZE) {
3419 bufsz += sizeof(void *);
3423 ker = nm_os_malloc(bufsz);
3428 p = ker; /* write pointer into the buffer */
3430 /* make a copy of the user pointers */
3431 ptrs = (uint64_t*)p;
3432 *ptrs++ = hdr->nr_body;
3433 *ptrs++ = hdr->nr_options;
3437 error = copyin((void *)(uintptr_t)hdr->nr_body, p, rqsz);
3440 /* overwrite the user pointer with the in-kernel one */
3441 hdr->nr_body = (uintptr_t)p;
3443 /* start of the options table */
3444 opt_tab = (struct nmreq_option **)p;
3445 p += sizeof(opt_tab) * NETMAP_REQ_OPT_MAX;
3447 /* copy the options */
3448 next = (struct nmreq_option **)&hdr->nr_options;
3451 struct nmreq_option *opt;
3453 /* copy the option header */
3454 ptrs = (uint64_t *)p;
3455 opt = (struct nmreq_option *)(ptrs + 1);
3456 error = copyin(src, opt, sizeof(*src));
3459 /* make a copy of the user next pointer */
3460 *ptrs = opt->nro_next;
3461 /* overwrite the user pointer with the in-kernel one */
3464 /* initialize the option as not supported.
3465 * Recognized options will update this field.
3467 opt->nro_status = EOPNOTSUPP;
3469 /* check for invalid types */
3470 if (opt->nro_reqtype < 1) {
3472 nm_prinf("invalid option type: %u", opt->nro_reqtype);
3473 opt->nro_status = EINVAL;
3478 if (opt->nro_reqtype >= NETMAP_REQ_OPT_MAX) {
3479 /* opt->nro_status is already EOPNOTSUPP */
3484 /* if the type is valid, index the option in the table
3485 * unless it is a duplicate.
3487 if (opt_tab[opt->nro_reqtype] != NULL) {
3489 nm_prinf("duplicate option: %u", opt->nro_reqtype);
3490 opt->nro_status = EINVAL;
3491 opt_tab[opt->nro_reqtype]->nro_status = EINVAL;
3495 opt_tab[opt->nro_reqtype] = opt;
3497 p = (char *)(opt + 1);
3499 /* copy the option body */
3500 optsz = nmreq_opt_size_by_type(opt->nro_reqtype,
3503 /* the option body follows the option header */
3504 error = copyin(src + 1, p, optsz);
3511 /* move to next option */
3512 next = (struct nmreq_option **)&opt->nro_next;
3516 nmreq_copyout(hdr, error);
3520 ptrs = (uint64_t *)ker;
3521 hdr->nr_body = *ptrs++;
3522 hdr->nr_options = *ptrs++;
3523 hdr->nr_reserved = 0;
3530 nmreq_copyout(struct nmreq_header *hdr, int rerror)
3532 struct nmreq_option *src, *dst;
3533 void *ker = (void *)(uintptr_t)hdr->nr_body, *bufstart;
3538 if (!hdr->nr_reserved)
3541 /* restore the user pointers in the header */
3542 ptrs = (uint64_t *)ker - 2;
3544 hdr->nr_body = *ptrs++;
3545 src = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3546 hdr->nr_options = *ptrs;
3550 bodysz = nmreq_size_by_type(hdr->nr_reqtype);
3551 error = copyout(ker, (void *)(uintptr_t)hdr->nr_body, bodysz);
3558 /* copy the options */
3559 dst = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3564 /* restore the user pointer */
3565 next = src->nro_next;
3566 ptrs = (uint64_t *)src - 1;
3567 src->nro_next = *ptrs;
3569 /* always copy the option header */
3570 error = copyout(src, dst, sizeof(*src));
3576 /* copy the option body only if there was no error */
3577 if (!rerror && !src->nro_status) {
3578 optsz = nmreq_opt_size_by_type(src->nro_reqtype,
3581 error = copyout(src + 1, dst + 1, optsz);
3588 src = (struct nmreq_option *)(uintptr_t)next;
3589 dst = (struct nmreq_option *)(uintptr_t)*ptrs;
3594 hdr->nr_reserved = 0;
3595 nm_os_free(bufstart);
3599 struct nmreq_option *
3600 nmreq_getoption(struct nmreq_header *hdr, uint16_t reqtype)
3602 struct nmreq_option **opt_tab;
3604 if (!hdr->nr_options)
3607 opt_tab = (struct nmreq_option **)((uintptr_t)hdr->nr_options) -
3608 (NETMAP_REQ_OPT_MAX + 1);
3609 return opt_tab[reqtype];
3613 nmreq_checkoptions(struct nmreq_header *hdr)
3615 struct nmreq_option *opt;
3616 /* return error if there is still any option
3617 * marked as not supported
3620 for (opt = (struct nmreq_option *)(uintptr_t)hdr->nr_options; opt;
3621 opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
3622 if (opt->nro_status == EOPNOTSUPP)
3629 * select(2) and poll(2) handlers for the "netmap" device.
3631 * Can be called for one or more queues.
3632 * Return the event mask corresponding to ready events.
3633 * If there are no ready events (and 'sr' is not NULL), do a
3634 * selrecord on either individual selinfo or on the global one.
3635 * Device-dependent parts (locking and sync of tx/rx rings)
3636 * are done through callbacks.
3638 * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
3639 * The first one is remapped to pwait as selrecord() uses the name as an
3643 netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
3645 struct netmap_adapter *na;
3646 struct netmap_kring *kring;
3647 struct netmap_ring *ring;
3648 u_int i, want[NR_TXRX], revents = 0;
3649 NM_SELINFO_T *si[NR_TXRX];
3650 #define want_tx want[NR_TX]
3651 #define want_rx want[NR_RX]
3652 struct mbq q; /* packets from RX hw queues to host stack */
3655 * In order to avoid nested locks, we need to "double check"
3656 * txsync and rxsync if we decide to do a selrecord().
3657 * retry_tx (and retry_rx, later) prevent looping forever.
3659 int retry_tx = 1, retry_rx = 1;
3661 /* Transparent mode: send_down is 1 if we have found some
3662 * packets to forward (host RX ring --> NIC) during the rx
3663 * scan and we have not sent them down to the NIC yet.
3664 * Transparent mode requires binding all rings to a single
3668 int sync_flags = priv->np_sync_flags;
3672 if (unlikely(priv->np_nifp == NULL)) {
3675 mb(); /* make sure following reads are not from cache */
3679 if (unlikely(!nm_netmap_on(na)))
3682 if (unlikely(priv->np_csb_atok_base)) {
3683 nm_prerr("Invalid poll in CSB mode");
3687 if (netmap_debug & NM_DEBUG_ON)
3688 nm_prinf("device %s events 0x%x", na->name, events);
3689 want_tx = events & (POLLOUT | POLLWRNORM);
3690 want_rx = events & (POLLIN | POLLRDNORM);
3693 * If the card has more than one queue AND the file descriptor is
3694 * bound to all of them, we sleep on the "global" selinfo, otherwise
3695 * we sleep on individual selinfo (FreeBSD only allows two selinfo's
3696 * per file descriptor).
3697 * The interrupt routine in the driver wakes one or the other
3698 * (or both) depending on which clients are active.
3700 * rxsync() is only called if we run out of buffers on a POLLIN.
3701 * txsync() is called if we run out of buffers on POLLOUT, or
3702 * there are pending packets to send. The latter can be disabled
3703 * by passing NETMAP_NO_TX_POLL in the NIOCREG call.
3705 si[NR_RX] = priv->np_si[NR_RX];
3706 si[NR_TX] = priv->np_si[NR_TX];
3710 * We start with a lock free round which is cheap if we have
3711 * slots available. If this fails, then lock and call the sync
3712 * routines. We can't do this on Linux, as the contract says
3713 * that we must call nm_os_selrecord() unconditionally.
3716 const enum txrx t = NR_TX;
3717 for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3718 kring = NMR(na, t)[i];
3719 if (kring->ring->cur != kring->ring->tail) {
3720 /* Some unseen TX space is available, so
3721 * we don't need to run txsync. */
3729 const enum txrx t = NR_RX;
3730 int rxsync_needed = 0;
3732 for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3733 kring = NMR(na, t)[i];
3734 if (kring->ring->cur == kring->ring->tail
3735 || kring->rhead != kring->ring->head) {
3736 /* There are no unseen packets on this ring,
3737 * or there are some buffers to be returned
3738 * to the netmap port. We therefore go ahead
3739 * and run rxsync. */
3744 if (!rxsync_needed) {
3752 /* The selrecord must be unconditional on linux. */
3753 nm_os_selrecord(sr, si[NR_RX]);
3754 nm_os_selrecord(sr, si[NR_TX]);
3758 * If we want to push packets out (priv->np_txpoll) or
3759 * want_tx is still set, we must issue txsync calls
3760 * (on all rings, to prevent the tx rings from stalling).
3761 * Fortunately, normal tx mode has np_txpoll set.
3763 if (priv->np_txpoll || want_tx) {
3765 * The first round checks if anyone is ready, if not
3766 * do a selrecord and another round to handle races.
3767 * want_tx goes to 0 if any space is found, and is
3768 * used to skip rings with no pending transmissions.
3771 for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
3774 kring = na->tx_rings[i];
3778 * Don't try to txsync this TX ring if we already found some
3779 * space in some of the TX rings (want_tx == 0) and there are no
3780 * TX slots in this ring that need to be flushed to the NIC
3783 if (!send_down && !want_tx && ring->head == kring->nr_hwcur)
3786 if (nm_kr_tryget(kring, 1, &revents))
3789 if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3790 netmap_ring_reinit(kring);
3793 if (kring->nm_sync(kring, sync_flags))
3796 nm_sync_finalize(kring);
3800 * If we found new slots, notify potential
3801 * listeners on the same ring.
3802 * Since we just did a txsync, look at the copies
3803 * of cur,tail in the kring.
3805 found = kring->rcur != kring->rtail;
3807 if (found) { /* notify other listeners */
3811 kring->nm_notify(kring, 0);
3815 /* if there were any packets to forward we must have handled them by now */
3817 if (want_tx && retry_tx && sr) {
3819 nm_os_selrecord(sr, si[NR_TX]);
3827 * If want_rx is still set scan receive rings.
3828 * Do it on all rings because otherwise we starve.
3831 /* two rounds here for race avoidance */
3833 for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
3836 kring = na->rx_rings[i];
3839 if (unlikely(nm_kr_tryget(kring, 1, &revents)))
3842 if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3843 netmap_ring_reinit(kring);
3846 /* now we can use kring->rcur, rtail */
3849 * transparent mode support: collect packets from
3850 * hw rxring(s) that have been released by the user
3852 if (nm_may_forward_up(kring)) {
3853 netmap_grab_packets(kring, &q, netmap_fwd);
3856 /* Clear the NR_FORWARD flag anyway, it may be set by
3857 * the nm_sync() below only for the host RX ring (see
3858 * netmap_rxsync_from_host()). */
3859 kring->nr_kflags &= ~NR_FORWARD;
3860 if (kring->nm_sync(kring, sync_flags))
3863 nm_sync_finalize(kring);
3864 send_down |= (kring->nr_kflags & NR_FORWARD);
3865 ring_timestamp_set(ring);
3866 found = kring->rcur != kring->rtail;
3872 kring->nm_notify(kring, 0);
3878 if (retry_rx && sr) {
3879 nm_os_selrecord(sr, si[NR_RX]);
3882 if (send_down || retry_rx) {
3885 goto flush_tx; /* and retry_rx */
3892 * Transparent mode: released bufs (i.e. between kring->nr_hwcur and
3893 * ring->head) marked with NS_FORWARD on hw rx rings are passed up
3894 * to the host stack.
3898 netmap_send_up(na->ifp, &q);
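/*
 * Userspace sketch (illustrative; nmfd is a bound netmap descriptor):
 * blocking until the RX rings have traffic. On POLLIN the rings have
 * already been rxsync'ed, so the application can consume the slots in
 * [head, tail) and then advance head and cur.
 *
 *	struct pollfd pfd = { .fd = nmfd, .events = POLLIN };
 *
 *	if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLIN)) {
 *		scan the receive rings, then advance head/cur
 *	}
 */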
3907 nma_intr_enable(struct netmap_adapter *na, int onoff)
3909 bool changed = false;
3914 for (i = 0; i < nma_get_nrings(na, t); i++) {
3915 struct netmap_kring *kring = NMR(na, t)[i];
3916 int on = !(kring->nr_kflags & NKR_NOINTR);
3918 if (!!onoff != !!on) {
3922 kring->nr_kflags &= ~NKR_NOINTR;
3924 kring->nr_kflags |= NKR_NOINTR;
3930 return 0; /* nothing to do */
3934 nm_prerr("Cannot %s interrupts for %s", onoff ? "enable" : "disable",
3939 na->nm_intr(na, onoff);
3945 /*-------------------- driver support routines -------------------*/
3947 /* default notify callback */
3949 netmap_notify(struct netmap_kring *kring, int flags)
3951 struct netmap_adapter *na = kring->notify_na;
3952 enum txrx t = kring->tx;
3954 nm_os_selwakeup(&kring->si);
3955 /* optimization: avoid a wake up on the global
3956 * queue if nobody has registered for more
3959 if (na->si_users[t] > 0)
3960 nm_os_selwakeup(&na->si[t]);
3962 return NM_IRQ_COMPLETED;
3965 /* called by all routines that create netmap_adapters.
3966 * provide some defaults and get a reference to the
3970 netmap_attach_common(struct netmap_adapter *na)
3972 if (!na->rx_buf_maxsize) {
3973 /* Set a conservative default (larger is safer). */
3974 na->rx_buf_maxsize = PAGE_SIZE;
3978 if (na->na_flags & NAF_HOST_RINGS && na->ifp) {
3979 na->if_input = na->ifp->if_input; /* for netmap_send_up */
3981 na->pdev = na; /* make sure netmap_mem_map() is called */
3982 #endif /* __FreeBSD__ */
3983 if (na->na_flags & NAF_HOST_RINGS) {
3984 if (na->num_host_rx_rings == 0)
3985 na->num_host_rx_rings = 1;
3986 if (na->num_host_tx_rings == 0)
3987 na->num_host_tx_rings = 1;
3989 if (na->nm_krings_create == NULL) {
3990 /* we assume that we have been called by a driver,
3991 * since other port types all provide their own
3994 na->nm_krings_create = netmap_hw_krings_create;
3995 na->nm_krings_delete = netmap_hw_krings_delete;
3997 if (na->nm_notify == NULL)
3998 na->nm_notify = netmap_notify;
4001 if (na->nm_mem == NULL) {
4002 /* use iommu or global allocator */
4003 na->nm_mem = netmap_mem_get_iommu(na);
4005 if (na->nm_bdg_attach == NULL)
4006 /* no special nm_bdg_attach callback. On VALE
4007 * attach, we need to interpose a bwrap
4009 na->nm_bdg_attach = netmap_default_bdg_attach;
4014 /* Wrapper for the register callback provided by netmap-enabled
4016 * nm_iszombie(na) means that the driver module has been
4017 * unloaded, so we cannot call into it.
4018 * nm_os_ifnet_lock() must guarantee mutual exclusion with
4022 netmap_hw_reg(struct netmap_adapter *na, int onoff)
4024 struct netmap_hw_adapter *hwna =
4025 (struct netmap_hw_adapter*)na;
4030 if (nm_iszombie(na)) {
4033 } else if (na != NULL) {
4034 na->na_flags &= ~NAF_NETMAP_ON;
4039 error = hwna->nm_hw_register(na, onoff);
4042 nm_os_ifnet_unlock();
4048 netmap_hw_dtor(struct netmap_adapter *na)
4050 if (na->ifp == NULL)
4053 NM_DETACH_NA(na->ifp);
4058 * Allocate a netmap_adapter object, and initialize it from the
4059 * 'arg' passed by the driver on attach.
4060 * We allocate a block of memory of 'size' bytes, which has room
4061 * for struct netmap_adapter plus additional room private to
4063 * Return 0 on success, ENOMEM otherwise.
4066 netmap_attach_ext(struct netmap_adapter *arg, size_t size, int override_reg)
4068 struct netmap_hw_adapter *hwna = NULL;
4069 struct ifnet *ifp = NULL;
4071 if (size < sizeof(struct netmap_hw_adapter)) {
4072 if (netmap_debug & NM_DEBUG_ON)
4073 nm_prerr("Invalid netmap adapter size %d", (int)size);
4077 if (arg == NULL || arg->ifp == NULL) {
4078 if (netmap_debug & NM_DEBUG_ON)
4079 nm_prerr("either arg or arg->ifp is NULL");
4083 if (arg->num_tx_rings == 0 || arg->num_rx_rings == 0) {
4084 if (netmap_debug & NM_DEBUG_ON)
4085 nm_prerr("%s: invalid rings tx %d rx %d",
4086 arg->name, arg->num_tx_rings, arg->num_rx_rings);
4091 if (NM_NA_CLASH(ifp)) {
4092 /* If NA(ifp) is not null but there is no valid netmap
4093 * adapter it means that someone else is using the same
4094 * pointer (e.g. ax25_ptr on linux). This happens for
4095 * instance when also PF_RING is in use. */
4096 nm_prerr("Error: netmap adapter hook is busy");
4100 hwna = nm_os_malloc(size);
4104 hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
4105 strlcpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
4107 hwna->nm_hw_register = hwna->up.nm_register;
4108 hwna->up.nm_register = netmap_hw_reg;
4110 if (netmap_attach_common(&hwna->up)) {
4114 netmap_adapter_get(&hwna->up);
4116 NM_ATTACH_NA(ifp, &hwna->up);
4118 nm_os_onattach(ifp);
4120 if (arg->nm_dtor == NULL) {
4121 hwna->up.nm_dtor = netmap_hw_dtor;
4124 if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
4125 hwna->up.num_tx_rings, hwna->up.num_tx_desc,
4126 hwna->up.num_rx_rings, hwna->up.num_rx_desc);
4130 nm_prerr("fail, arg %p ifp %p na %p", arg, ifp, hwna);
4131 return (hwna ? EINVAL : ENOMEM);
4136 netmap_attach(struct netmap_adapter *arg)
4138 return netmap_attach_ext(arg, sizeof(struct netmap_hw_adapter),
4139 1 /* override nm_reg */);
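/*
 * Driver-side sketch (illustrative; sc and the foo_* callbacks are
 * placeholders for driver-specific state and code): the typical
 * attach-time setup performed by a netmap-enabled NIC driver.
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = sc->ifp;
 *	na.num_tx_desc = sc->num_tx_desc;
 *	na.num_rx_desc = sc->num_rx_desc;
 *	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	na.nm_register = foo_netmap_reg;
 *	netmap_attach(&na);
 */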
4144 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
4150 refcount_acquire(&na->na_refcount);
4154 /* returns 1 iff the netmap_adapter is destroyed */
4156 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
4161 if (!refcount_release(&na->na_refcount))
4167 if (na->tx_rings) { /* XXX should not happen */
4168 if (netmap_debug & NM_DEBUG_ON)
4169 nm_prerr("freeing leftover tx_rings");
4170 na->nm_krings_delete(na);
4172 netmap_pipe_dealloc(na);
4174 netmap_mem_put(na->nm_mem);
4175 bzero(na, sizeof(*na));
4181 /* nm_krings_create callback for all hardware native adapters */
4183 netmap_hw_krings_create(struct netmap_adapter *na)
4185 int ret = netmap_krings_create(na, 0);
4187 /* initialize the mbq for the sw rx ring */
4188 u_int lim = netmap_real_rings(na, NR_RX), i;
4189 for (i = na->num_rx_rings; i < lim; i++) {
4190 mbq_safe_init(&NMR(na, NR_RX)[i]->rx_queue);
4192 nm_prdis("initialized sw rx queue %d", na->num_rx_rings);
4200 * Called on module unload by the netmap-enabled drivers
4203 netmap_detach(struct ifnet *ifp)
4205 struct netmap_adapter *na;
4209 if (!NM_NA_VALID(ifp)) {
4215 netmap_set_all_rings(na, NM_KR_LOCKED);
4217 * if the netmap adapter is not native, somebody
4218 * changed it, so we cannot release it here.
4219 * The NAF_ZOMBIE flag will notify the new owner that
4220 * the driver is gone.
4222 if (!(na->na_flags & NAF_NATIVE) || !netmap_adapter_put(na)) {
4223 na->na_flags |= NAF_ZOMBIE;
4225 /* give active users a chance to notice that NAF_ZOMBIE has been
4226 * turned on, so that they can stop and return an error to userspace.
4227 * Note that this becomes a NOP if there are no active users and,
4228 * therefore, the put() above has deleted the na, since now NA(ifp) is
4231 netmap_enable_all_rings(ifp);
4237 * Intercept packets from the network stack and pass them
4238 * to netmap as incoming packets on the 'software' ring.
4240 * We only store packets in a bounded mbq and then copy them
4241 * in the relevant rxsync routine.
4243 * We rely on the OS to make sure that the ifp and na do not go
4244 * away (typically the caller checks for IFF_DRV_RUNNING or the like).
4245 * In nm_register() or whenever there is a reinitialization,
4246 * we make sure to make the mode change visible here.
4249 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
4251 struct netmap_adapter *na = NA(ifp);
4252 struct netmap_kring *kring, *tx_kring;
4253 u_int len = MBUF_LEN(m);
4254 u_int error = ENOBUFS;
4261 if (i >= na->num_host_rx_rings) {
4262 i = i % na->num_host_rx_rings;
4264 kring = NMR(na, NR_RX)[nma_get_nrings(na, NR_RX) + i];
4266 // XXX [Linux] we do not need this lock
4267 // if we follow the down/configure/up protocol -gl
4268 // mtx_lock(&na->core_lock);
4270 if (!nm_netmap_on(na)) {
4271 nm_prerr("%s not in netmap mode anymore", na->name);
4277 if (txr >= na->num_tx_rings) {
4278 txr %= na->num_tx_rings;
4280 tx_kring = NMR(na, NR_TX)[txr];
4282 if (tx_kring->nr_mode == NKR_NETMAP_OFF) {
4283 return MBUF_TRANSMIT(na, ifp, m);
4286 q = &kring->rx_queue;
4288 // XXX reconsider long packets if we handle fragments
4289 if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
4290 nm_prerr("%s from_host, drop packet size %d > %d", na->name,
4291 len, NETMAP_BUF_SIZE(na));
4295 if (!netmap_generic_hwcsum) {
4296 if (nm_os_mbuf_has_csum_offld(m)) {
4297 nm_prlim(1, "%s drop mbuf that needs checksum offload", na->name);
4302 if (nm_os_mbuf_has_seg_offld(m)) {
4303 nm_prlim(1, "%s drop mbuf that needs generic segmentation offload", na->name);
4308 ETHER_BPF_MTAP(ifp, m);
4309 #endif /* __FreeBSD__ */
4311 /* protect against netmap_rxsync_from_host(), netmap_sw_to_nic()
4312 * and maybe other instances of netmap_transmit (the latter
4313 * not possible on Linux).
4314 * We enqueue the mbuf only if we are sure there is going to be
4315 * enough room in the host RX ring, otherwise we drop it.
4319 busy = kring->nr_hwtail - kring->nr_hwcur;
4321 busy += kring->nkr_num_slots;
4322 if (busy + mbq_len(q) >= kring->nkr_num_slots - 1) {
4323 nm_prlim(2, "%s full hwcur %d hwtail %d qlen %d", na->name,
4324 kring->nr_hwcur, kring->nr_hwtail, mbq_len(q));
4327 nm_prdis(2, "%s %d bufs in queue", na->name, mbq_len(q));
4328 /* notify outside the lock */
4337 /* unconditionally wake up listeners */
4338 kring->nm_notify(kring, 0);
4339 /* this is normally netmap_notify(), but for nics
4340 * connected to a bridge it is netmap_bwrap_intr_notify(),
4341 * that possibly forwards the frames through the switch
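/*
 * Worked example (illustrative): with nkr_num_slots = 1024,
 * nr_hwcur = 1000 and nr_hwtail = 8, busy = 8 - 1000 + 1024 = 32
 * slots are still owned by userspace, so the mbuf above is queued
 * only if 32 + mbq_len(q) < 1023.
 */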
4349 * Reset function to be called by the driver routines when reinitializing
4350 * a hardware ring. The driver is in charge of locking to protect the kring
4351 * while this operation is being performed. This is normally achieved by
4352 * calling netmap_disable_all_rings() before triggering a reset.
4353 * If the kring is not in netmap mode, return NULL to inform the caller
4354 * that this is the case.
4355 * If the kring is in netmap mode, set hwofs so that the netmap indices
4356 * seen by userspace (head/cur/tail) do not change, although the internal
4357 * NIC indices have been reset to 0.
4358 * In any case, adjust kring->nr_mode.
4360 struct netmap_slot *
4361 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
4364 struct netmap_kring *kring;
4365 u_int new_hwtail, new_hwofs;
4367 if (!nm_native_on(na)) {
4368 nm_prdis("interface not in native netmap mode");
4369 return NULL; /* nothing to reinitialize */
4373 if (n >= na->num_tx_rings)
4375 kring = na->tx_rings[n];
4377 * Set hwofs to rhead, so that slots[rhead] is mapped to
4378 * the NIC internal slot 0, and thus the netmap buffer
4379 * at rhead is the next to be transmitted. Transmissions
4380 * that were pending before the reset are considered as
4381 * sent, so that we can have hwcur = rhead. All the slots
4382 * are now owned by the user, so we can also reinit hwtail.
4384 new_hwofs = kring->rhead;
4385 new_hwtail = nm_prev(kring->rhead, kring->nkr_num_slots - 1);
4387 if (n >= na->num_rx_rings)
4389 kring = na->rx_rings[n];
4391 * Set hwofs to hwtail, so that slots[hwtail] is mapped to
4392 * the NIC internal slot 0, and thus the netmap buffer
4393 * at hwtail is the next to be given to the NIC.
4394 * Unread slots (the ones in [rhead, hwtail)) are owned by
4395 * the user, and thus the caller cannot give them
4396 * to the NIC right now.
4398 new_hwofs = kring->nr_hwtail;
4399 new_hwtail = kring->nr_hwtail;
4401 if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
4402 kring->nr_mode = NKR_NETMAP_OFF;
4405 if (netmap_verbose) {
4406 nm_prinf("%s, hc %u->%u, ht %u->%u, ho %u->%u", kring->name,
4407 kring->nr_hwcur, kring->rhead,
4408 kring->nr_hwtail, new_hwtail,
4409 kring->nkr_hwofs, new_hwofs);
4411 kring->nr_hwcur = kring->rhead;
4412 kring->nr_hwtail = new_hwtail;
4413 kring->nkr_hwofs = new_hwofs;
4416 * Wakeup on the individual and global selwait
4417 * We do the wakeup here, but the ring is not yet reconfigured.
4418 * However, we are under lock so there are no races.
4420 kring->nr_mode = NKR_NETMAP_ON;
4421 kring->nm_notify(kring, 0);
4422 return kring->ring->slot;
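/*
 * Driver-side sketch (illustrative; ring_nr and the descriptor setup
 * are driver-specific placeholders): calling netmap_reset() while
 * reinitializing a TX ring.
 *
 *	struct netmap_slot *slot = netmap_reset(na, NR_TX, ring_nr, 0);
 *
 *	if (slot != NULL) {
 *		the ring is in netmap mode: program the NIC descriptors
 *		from the netmap buffers instead of mbufs
 *	}
 */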
/*
 * Dispatch rx/tx interrupts to the netmap rings.
 *
 * "work_done" is non-null on the RX path, NULL for the TX path.
 * We rely on the OS to make sure that there is only one active
 * instance per queue, and that there is appropriate locking.
 *
 * The 'notify' routine depends on what the ring is attached to.
 * - for a netmap file descriptor, do a selwakeup on the individual
 *   waitqueue, plus one on the global one if needed
 *   (see netmap_notify)
 * - for a nic connected to a switch, call the proper forwarding routine
 *   (see netmap_bwrap_intr_notify)
 */
int
netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
{
	struct netmap_kring *kring;
	enum txrx t = (work_done ? NR_RX : NR_TX);

	q &= NETMAP_RING_MASK;

	if (netmap_debug & (NM_DEBUG_RXINTR|NM_DEBUG_TXINTR)) {
		nm_prlim(5, "received %s queue %d", work_done ? "RX" : "TX", q);
	}

	if (q >= nma_get_nrings(na, t))
		return NM_IRQ_PASS; // not a physical queue

	kring = NMR(na, t)[q];

	if (kring->nr_mode == NKR_NETMAP_OFF) {
		return NM_IRQ_PASS;
	}

	if (t == NR_RX) {
		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
		*work_done = 1; /* do not fire napi again */
	}

	return kring->nm_notify(kring, 0);
}
/*
 * Default functions to handle rx/tx interrupts from a physical device.
 * "work_done" is non-null on the RX path, NULL for the TX path.
 *
 * If the card is not in netmap mode, simply return NM_IRQ_PASS,
 * so that the caller proceeds with regular processing.
 * Otherwise call netmap_common_irq().
 *
 * If the card is connected to a netmap file descriptor,
 * do a selwakeup on the individual queue, plus one on the global one
 * if needed (multiqueue card _and_ there are multiqueue listeners),
 * and return NM_IRQ_COMPLETED.
 *
 * Finally, if called on rx from an interface connected to a switch,
 * call the proper forwarding routine.
 */
int
netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
{
	struct netmap_adapter *na = NA(ifp);

	/*
	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
	 * we still use the regular driver even though the previous
	 * check fails. It is unclear whether we should use
	 * nm_native_on() here.
	 */
	if (!nm_netmap_on(na))
		return NM_IRQ_PASS;

	if (na->na_flags & NAF_SKIP_INTR) {
		nm_prdis("use regular interrupt");
		return NM_IRQ_PASS;
	}

	return netmap_common_irq(na, q, work_done);
}
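/*
 * Editor's usage sketch (assumed driver code, not from this file):
 * a NIC RX interrupt handler typically tries netmap first and falls
 * back to the regular datapath only if netmap passes on the event:
 *
 *	u_int work_done = 0;
 *
 *	if (netmap_rx_irq(ifp, ring_nr, &work_done) != NM_IRQ_PASS)
 *		return;		// netmap consumed the interrupt
 *	... regular driver RX processing ...
 */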
/* set/clear native flags and if_transmit/netdev_ops */
void
nm_set_native_flags(struct netmap_adapter *na)
{
	struct ifnet *ifp = na->ifp;

	/* We do the setup for intercepting packets only if we are the
	 * first user of this adapter. */
	if (na->active_fds > 0) {
		return;
	}

	na->na_flags |= NAF_NETMAP_ON;
	nm_os_onenter(ifp);
	netmap_update_hostrings_mode(na);
}
void
nm_clear_native_flags(struct netmap_adapter *na)
{
	struct ifnet *ifp = na->ifp;

	/* We undo the setup for intercepting packets only if we are the
	 * last user of this adapter. */
	if (na->active_fds > 0) {
		return;
	}
	netmap_update_hostrings_mode(na);
	nm_os_onexit(ifp);
	na->na_flags &= ~NAF_NETMAP_ON;
}
void
netmap_krings_mode_commit(struct netmap_adapter *na, int onoff)
{
	enum txrx t;
	int i;

	for_rx_tx(t) {
		for (i = 0; i < netmap_real_rings(na, t); i++) {
			struct netmap_kring *kring = NMR(na, t)[i];
			if (onoff && nm_kring_pending_on(kring))
				kring->nr_mode = NKR_NETMAP_ON;
			else if (!onoff && nm_kring_pending_off(kring))
				kring->nr_mode = NKR_NETMAP_OFF;
		}
	}
}
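/*
 * Editor's usage sketch (assumed driver code, not from this file):
 * a hardware adapter's nm_register() callback typically pairs this
 * helper with the native-flags helpers above, roughly as in
 *
 *	if (onoff) {
 *		netmap_krings_mode_commit(na, onoff);
 *		nm_set_native_flags(na);
 *	} else {
 *		nm_clear_native_flags(na);
 *		netmap_krings_mode_commit(na, onoff);
 *	}
 */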
/*
 * Module loader and unloader
 *
 * netmap_init() creates the /dev/netmap device and initializes
 * all global variables. Returns 0 on success, errno on failure
 * (failure is not expected in practice).
 *
 * netmap_fini() destroys everything.
 */
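/*
 * Editor's note (FreeBSD, assuming netmap is built as a module):
 * these entry points correspond to the usual module operations, e.g.
 *
 *	# kldload netmap	-> netmap_init()
 *	# kldunload netmap	-> netmap_fini()
 */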
static struct cdev *netmap_dev; /* /dev/netmap character device. */
extern struct cdevsw netmap_cdevsw;

void
netmap_fini(void)
{
	if (netmap_dev)
		destroy_dev(netmap_dev);
	/* we assume that there are no longer netmap users */
	nm_os_ifnet_fini();
	netmap_uninit_bridges();
	netmap_mem_fini();
	NMG_LOCK_DESTROY();
	nm_prinf("netmap: unloaded module.");
}
int
netmap_init(void)
{
	int error;

	NMG_LOCK_INIT();

	error = netmap_mem_init();
	if (error != 0)
		goto fail;
	/*
	 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
	 * when the module is compiled in.
	 * XXX could use make_dev_credv() to get error number
	 */
	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
		"netmap");
	if (!netmap_dev)
		goto fail;

	error = netmap_init_bridges();
	if (error)
		goto fail;

#ifdef __FreeBSD__
	nm_os_vi_init_index();
#endif

	error = nm_os_ifnet_init();
	if (error)
		goto fail;

#if !defined(__FreeBSD__) || defined(KLD_MODULE)
	nm_prinf("netmap: loaded module");
#endif
	return (0);
fail:
	netmap_fini();
	return (EINVAL); /* may be incorrect */
}