/*
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (C) 2011-2014 Matteo Landi
 * Copyright (C) 2011-2016 Luigi Rizzo
 * Copyright (C) 2011-2016 Giuseppe Lettieri
 * Copyright (C) 2011-2016 Vincenzo Maffione
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This module supports memory mapped access to network devices,
 * see netmap(4).
 *
 * The module uses a large memory pool allocated by the kernel
 * and accessible as mmapped memory by multiple userspace threads/processes.
 * The memory pool contains packet buffers and "netmap rings",
 * i.e. user-accessible copies of the interface's queues.
 *
 * Access to the network card works like this:
 * 1. a process/thread issues one or more open() on /dev/netmap, to create
 *    select()able file descriptors on which events are reported.
 * 2. on each descriptor, the process issues an ioctl() to identify
 *    the interface that should report events to the file descriptor.
 * 3. on each descriptor, the process issues an mmap() request to
 *    map the shared memory region within the process' address space.
 *    The list of interesting queues is indicated by a location in
 *    the shared memory region.
 * 4. using the functions in the netmap(4) userspace API, a process
 *    can look up the occupation state of a queue, access memory buffers,
 *    and retrieve received packets or enqueue packets to transmit.
 * 5. using some ioctl()s the process can synchronize the userspace view
 *    of the queue with the actual status in the kernel. This includes both
 *    receiving the notification of new packets, and transmitting new
 *    packets on the output interface.
 * 6. select() or poll() can be used to wait for events on individual
 *    transmit or receive queues (or all queues for a given interface).
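 *
 * As an illustrative sketch of steps 1-5 (not part of this file; it
 * assumes the legacy nmreq ABI from net/netmap_user.h, uses a
 * hypothetical interface name, and omits all error handling):
 *
 *	struct nmreq nmr = { .nr_version = NETMAP_API };
 *	struct netmap_if *nifp;
 *	int fd = open("/dev/netmap", O_RDWR);		// step 1
 *	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name));
 *	ioctl(fd, NIOCREGIF, &nmr);			// step 2
 *	void *mem = mmap(0, nmr.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);				// step 3
 *	nifp = NETMAP_IF(mem, nmr.nr_offset);
 *	struct netmap_ring *txring = NETMAP_TXRING(nifp, 0); // step 4
 *	ioctl(fd, NIOCTXSYNC, NULL);			// step 5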
 *

		SYNCHRONIZATION (USER)

The netmap rings and data structures may be shared among multiple
user threads or even independent processes.
Any synchronization among those threads/processes is delegated
to the threads themselves. Only one thread at a time can be in
a system call on the same netmap ring. The OS does not enforce
this and only guarantees against system crashes in case of
invalid usage.

Within the kernel, access to the netmap rings is protected as follows:

- a spinlock on each ring, to handle producer/consumer races on
  RX rings attached to the host stack (against multiple host
  threads writing from the host stack to the same ring),
  and on 'destination' rings attached to a VALE switch
  (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
  protecting multiple active senders for the same destination.

- an atomic variable to guarantee that there is at most one
  instance of *_*xsync() on the ring at any time.
  For rings connected to user file
  descriptors, an atomic_test_and_set() protects this, and the
  lock on the ring is not actually used.
  For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
  is also used to prevent multiple executions (the driver might indeed
  already guarantee this).
  For NIC TX rings connected to a VALE switch, the lock arbitrates
  access to the queue (both when allocating buffers and when pushing
  them out).

- *xsync() should be protected against initializations of the card.
  On FreeBSD most devices have the reset routine protected by
  a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
  the RING protection on rx_reset(), this should be added.

  On linux there is an external lock on the tx path, which probably
  also arbitrates access to the reset routine. XXX to be revised

- a per-interface core_lock protecting access from the host stack
  while interfaces may be detached from netmap mode.
  XXX there should be no need for this lock if we detach the interfaces
  only while they are down.

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When adding or deleting a port, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch.)

 */
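
/*
 * A pseudocode sketch of the forwarding-side pattern described above
 * (illustrative only; the lock names are hypothetical and the actual
 * code lives in netmap_vale.c):
 *
 *	rlock(switch);			// shared, sleepable switch lock
 *	lock(dst); reserve slots on the destination ring; unlock(dst);
 *	copy packets from the source to the reserved slots;
 *	lock(dst); advance the destination ring; unlock(dst);
 *	runlock(switch);
 */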
/* --- internals ----
 *
 * Roadmap to the code that implements the above.
 *
 * > 1. a process/thread issues one or more open() on /dev/netmap, to create
 * >    select()able file descriptor on which events are reported.
 *
 * Internally, we allocate a netmap_priv_d structure, that will be
 * initialized on ioctl(NIOCREGIF). There is one netmap_priv_d
 * structure for each open().
 *
 *     os-specific:
 *	    FreeBSD: see netmap_open() (netmap_freebsd.c)
 *	    linux:   see linux_netmap_open() (netmap_linux.c)
 *
 * > 2. on each descriptor, the process issues an ioctl() to identify
 * >    the interface that should report events to the file descriptor.
 *
 * Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
 * Most important things happen in netmap_get_na() and
 * netmap_do_regif(), called from there. Additional details can be
 * found in the comments above those functions.
 *
 * In all cases, this action creates/takes-a-reference-to a
 * netmap_*_adapter describing the port, and allocates a netmap_if
 * and all necessary netmap rings, filling them with netmap buffers.
 *
 * In this phase, the sync callbacks for each ring are set (these are used
 * in steps 5 and 6 below).  The callbacks depend on the type of adapter.
 * The adapter creation/initialization code puts them in the
 * netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
 * are copied from there to the netmap_kring's during netmap_do_regif(), by
 * the nm_krings_create() callback.  All the nm_krings_create callbacks
 * actually call netmap_krings_create() to perform this and the other
 * common stuff. netmap_krings_create() also takes care of the host rings,
 * if needed, by setting their sync callbacks appropriately.
 *
 * Additional actions depend on the kind of netmap_adapter that has been
 * registered:
 *
 * - netmap_hw_adapter:			[netmap.c]
 *	This is a system netdev/ifp with native netmap support.
 *	The ifp is detached from the host stack by redirecting:
 *	   - transmissions (from the network stack) to netmap_transmit()
 *	   - receive notifications to the nm_notify() callback for
 *	     this adapter. The callback is normally netmap_notify(), unless
 *	     the ifp is attached to a bridge using bwrap, in which case it
 *	     is netmap_bwrap_intr_notify().
 *
 * - netmap_generic_adapter:		[netmap_generic.c]
 *	A system netdev/ifp without native netmap support.
 *	(the decision about native/non-native support is taken in
 *	 netmap_get_hw_na(), called by netmap_get_na())
 *
 * - netmap_vp_adapter			[netmap_vale.c]
 *	Returned by netmap_get_bdg_na().
 *	This is a persistent or ephemeral VALE port. Ephemeral ports
 *	are created on the fly if they don't already exist, and are
 *	always attached to a bridge.
 *	Persistent VALE ports must be created separately, and
 *	then attached like normal NICs. The NIOCREGIF we are examining
 *	will find them only if they had previously been created and
 *	attached (see VALE_CTL below).
 *
 * - netmap_pipe_adapter		[netmap_pipe.c]
 *	Returned by netmap_get_pipe_na().
 *	Both pipe ends are created, if they didn't already exist.
 *
 * - netmap_monitor_adapter		[netmap_monitor.c]
 *	Returned by netmap_get_monitor_na().
 *	If successful, the nm_sync callbacks of the monitored adapter
 *	will be intercepted by the returned monitor.
 *
 * - netmap_bwrap_adapter		[netmap_vale.c]
 *	Cannot be obtained in this way, see VALE_CTL below
 *
 *     os-specific:
 *	    linux: we first go through linux_netmap_ioctl() to
 *	    adapt the FreeBSD interface to the linux one.
 *
 * > 3. on each descriptor, the process issues an mmap() request to
 * >    map the shared memory region within the process' address space.
 * >    The list of interesting queues is indicated by a location in
 * >    the shared memory region.
 *
 *     os-specific:
 *	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
 *	    linux:   linux_netmap_mmap (netmap_linux.c).
 *
 * > 4. using the functions in the netmap(4) userspace API, a process
 * >    can look up the occupation state of a queue, access memory buffers,
 * >    and retrieve received packets or enqueue packets to transmit.
 *
 *	These actions do not involve the kernel.
 *
 * > 5. using some ioctl()s the process can synchronize the userspace view
 * >    of the queue with the actual status in the kernel. This includes both
 * >    receiving the notification of new packets, and transmitting new
 * >    packets on the output interface.
 *
 * These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
 * cases. They invoke the nm_sync callbacks on the netmap_kring
 * structures, as initialized in step 2 and maybe later modified
 * by a monitor. Monitors, however, will always call the original
 * callback before doing anything else.
 *
 * > 6. select() or poll() can be used to wait for events on individual
 * >    transmit or receive queues (or all queues for a given interface).
 *
 * Implemented in netmap_poll(). This will call the same nm_sync()
 * callbacks as in step 5 above.
 *
 *     os-specific:
 *	    linux: we first go through linux_netmap_poll() to adapt
 *	    the FreeBSD interface to the linux one.
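 *
 * For instance, a user process typically blocks on the descriptor with
 * something like this (illustrative sketch, not part of this file):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	poll(&pfd, 1, 2000);	// wait up to 2s for new RX packets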
 *
 *	 ---- VALE_CTL -----
 *
 * VALE switches are controlled by issuing a NIOCREGIF with a non-null
 * nr_cmd in the nmreq structure. These subcommands are handled by
 * netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
 * and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
 * subcommands, respectively.
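 *
 * For example (illustrative sketch only; legacy nmreq ABI, no error
 * handling, and "persist0" is a hypothetical port name):
 *
 *	struct nmreq nmr = { .nr_version = NETMAP_API };
 *	strncpy(nmr.nr_name, "persist0", sizeof(nmr.nr_name));
 *	nmr.nr_cmd = NETMAP_BDG_NEWIF;
 *	ioctl(fd, NIOCREGIF, &nmr);	// create the persistent port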
 *
 * Any network interface known to the system (including a persistent VALE
 * port) can be attached to a VALE switch by issuing the
 * NETMAP_REQ_VALE_ATTACH command. After the attachment, persistent VALE ports
 * look exactly like ephemeral VALE ports (as created in step 2 above).  The
 * attachment of other interfaces, instead, requires the creation of a
 * netmap_bwrap_adapter.  Moreover, the attached interface must be put in
 * netmap mode. This may require the creation of a netmap_generic_adapter if
 * we have no native support for the interface, or if generic adapters have
 * been forced by sysctl.
 *
 * Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
 * called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
 * callback.  In the case of the bwrap, the callback creates the
 * netmap_bwrap_adapter.  The initialization of the bwrap is then
 * completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
 * callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
 * A generic adapter for the wrapped ifp will be created if needed, when
 * netmap_get_bdg_na() calls netmap_get_hw_na().
 *
 *
 *	 ---- DATAPATHS -----
 *
 *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
 *
 *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
 *
 *    - tx from netmap userspace:
 *	 concurrently:
 *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
 *                kring->nm_sync() == DEVICE_netmap_txsync()
 *           2) device interrupt handler
 *                na->nm_notify()  == netmap_notify()
 *    - rx from netmap userspace:
 *       concurrently:
 *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
 *                kring->nm_sync() == DEVICE_netmap_rxsync()
 *           2) device interrupt handler
 *                na->nm_notify()  == netmap_notify()
 *    - rx from host stack
 *       concurrently:
 *           1) host stack
 *                netmap_transmit()
 *                  na->nm_notify  == netmap_notify()
 *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
 *                kring->nm_sync() == netmap_rxsync_from_host
 *                  netmap_rxsync_from_host(na, NULL, NULL)
 *    - tx to host stack
 *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
 *             kring->nm_sync() == netmap_txsync_to_host
 *               netmap_txsync_to_host(na)
 *                 nm_os_send_up()
 *                   FreeBSD: na->if_input() == ether_input()
 *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
 *
 *
 *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
 *
 *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
 *
 *    - tx from netmap userspace:
 *       concurrently:
 *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
 *               kring->nm_sync() == generic_netmap_txsync()
 *                   nm_os_generic_xmit_frame()
 *                       linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
 *                           ifp->ndo_start_xmit == generic_ndo_start_xmit()
 *                               gna->save_start_xmit == orig. dev. start_xmit
 *                       FreeBSD: na->if_transmit() == orig. dev if_transmit
 *           2) generic_mbuf_destructor()
 *                   na->nm_notify() == netmap_notify()
 *    - rx from netmap userspace:
 *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
 *                   kring->nm_sync() == generic_netmap_rxsync()
 *           2) device driver
 *                   generic_rx_handler()
 *                       na->nm_notify() == netmap_notify()
 *    - rx from host stack
 *        FreeBSD: same as native
 *        Linux: same as native except:
 *           1) host stack
 *               dev_queue_xmit() without NM_MAGIC_PRIORITY_TX
 *                   ifp->ndo_start_xmit == generic_ndo_start_xmit()
 *                       netmap_transmit()
 *                           na->nm_notify() == netmap_notify()
 *    - tx to host stack (same as native):
 *
 *
 *                           -= VALE =-
 *
 *   INCOMING:
 *
 *      - VALE ports:
 *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
 *               kring->nm_sync() == netmap_vp_txsync()
 *
 *      - system device with native support:
 *         from cable:
 *             interrupt
 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
 *                     kring->nm_sync() == DEVICE_netmap_rxsync()
 *                     kring->nm_sync() == DEVICE_netmap_rxsync()
 *         from host stack:
 *             netmap_transmit()
 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
 *                     kring->nm_sync() == netmap_rxsync_from_host()
 *
 *      - system device with generic support:
 *         from device driver:
 *             generic_rx_handler()
 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
 *                     kring->nm_sync() == generic_netmap_rxsync()
 *                     kring->nm_sync() == generic_netmap_rxsync()
 *         from host stack:
 *             netmap_transmit()
 *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
 *                     kring->nm_sync() == netmap_rxsync_from_host()
 *
 *      (all cases) --> nm_bdg_flush()
 *                         dest_na->nm_notify() == (see below)
 *
 *   OUTGOING:
 *
 *      - VALE ports:
 *         concurrently:
 *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
 *                    kring->nm_sync() == netmap_vp_rxsync()
 *             2) from nm_bdg_flush()
 *                    na->nm_notify() == netmap_notify()
 *
 *      - system device with native support:
 *          to cable:
 *             na->nm_notify() == netmap_bwrap_notify()
 *                 kring->nm_sync() == DEVICE_netmap_txsync()
 *          to host stack:
 *                 kring->nm_sync() == netmap_txsync_to_host
 *                 netmap_vp_rxsync_locked()
 *
 *      - system device with generic adapter:
 *          to device driver:
 *             na->nm_notify() == netmap_bwrap_notify()
 *                 kring->nm_sync() == generic_netmap_txsync()
 *          to host stack:
 *                 kring->nm_sync() == netmap_txsync_to_host
 *                 netmap_vp_rxsync_locked()
 *
 */

/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h>		/* prerequisite */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>		/* defines used in kernel.h */
#include <sys/kernel.h>		/* types used in module initialization */
#include <sys/conf.h>		/* cdevsw struct, UID, GID */
#include <sys/filio.h>		/* FIONBIO */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h>		/* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <sys/jail.h>
#include <net/vnet.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>

#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#elif defined (_WIN32)

#include "win_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>

/* user-controlled variables */
int netmap_verbose;

static int netmap_no_timestamp; /* don't timestamp on rxsync */
int netmap_no_pendintr = 1;
int netmap_txsync_retry = 2;
static int netmap_fwd = 0;	/* force transparent forwarding */

/*
 * netmap_admode selects the netmap mode to use.
 * Invalid values are reset to NETMAP_ADMODE_BEST
 */
enum {	NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
	NETMAP_ADMODE_NATIVE,	/* either native or none */
	NETMAP_ADMODE_GENERIC,	/* force generic */
	NETMAP_ADMODE_LAST };
static int netmap_admode = NETMAP_ADMODE_BEST;

/* netmap_generic_mit controls mitigation of RX notifications for
 * the generic netmap adapter. The value is a time interval in
 * nanoseconds. */
int netmap_generic_mit = 100*1000;

/* We use by default netmap-aware qdiscs with generic netmap adapters,
 * even if there can be a little performance hit with hardware NICs.
 * However, using the qdisc is the safer approach, for two reasons:
 * 1) it prevents non-fifo qdiscs from breaking the TX notification
 *    scheme, which is based on mbuf destructors when txqdisc is
 *    not used.
 * 2) it makes it possible to transmit over software devices that
 *    change skb->dev, like bridge, veth, ...
 *
 * Anyway users looking for the best performance should
 * use native adapters.
 */
int netmap_generic_txqdisc = 1;

/* Default number of slots and queues for generic adapters. */
int netmap_generic_ringsize = 1024;
int netmap_generic_rings = 1;

/* Non-zero to enable checksum offloading in NIC drivers */
int netmap_generic_hwcsum = 0;

/* Non-zero if ptnet devices are allowed to use virtio-net headers. */
int ptnet_vnet_hdr = 1;

/* 0 if ptnetmap should not use worker threads for TX processing */
int ptnetmap_tx_workers = 1;

/*
 * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated
 * in some other operating systems
 */
SYSBEGIN(main_init);

SYSCTL_DECL(_dev_netmap);
SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
		CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
		CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr,
		0, "Always look for new received packets.");
SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
		&netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");

SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0,
		"Force NR_FORWARD mode");
SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0,
		"Adapter mode. 0 selects the best option available, "
		"1 forces native adapter, 2 forces emulated adapter");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_hwcsum, CTLFLAG_RW, &netmap_generic_hwcsum,
		0, "Hardware checksums. 0 to disable checksum generation by the NIC (default), "
		"1 to enable checksum generation by the NIC");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit,
		0, "RX notification interval in nanoseconds");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW,
		&netmap_generic_ringsize, 0,
		"Number of per-ring slots for emulated netmap mode");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW,
		&netmap_generic_rings, 0,
		"Number of TX/RX queues for emulated netmap adapters");
SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW,
		&netmap_generic_txqdisc, 0, "Use qdisc for generic adapters");
SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr,
		0, "Allow ptnet devices to use virtio-net headers");
SYSCTL_INT(_dev_netmap, OID_AUTO, ptnetmap_tx_workers, CTLFLAG_RW,
		&ptnetmap_tx_workers, 0, "Use worker threads for ptnetmap TX processing");

SYSEND;

NMG_LOCK_T	netmap_global_lock;

/*
 * mark the ring as stopped, and run through the locks
 * to make sure other users get to see it.
 * stopped must be either NM_KR_STOPPED (for unbounded stop)
 * or NM_KR_LOCKED (brief stop for mutual exclusion purposes)
 */
static void
netmap_disable_ring(struct netmap_kring *kr, int stopped)
{
	nm_kr_stop(kr, stopped);
	// XXX check if nm_kr_stop is sufficient
	mtx_lock(&kr->q_lock);
	mtx_unlock(&kr->q_lock);
	nm_kr_put(kr);
}

/* stop or enable a single ring */
void
netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
{
	if (stopped)
		netmap_disable_ring(NMR(na, t)[ring_id], stopped);
	else
		NMR(na, t)[ring_id]->nkr_stopped = 0;
}

/* stop or enable all the rings of na */
void
netmap_set_all_rings(struct netmap_adapter *na, int stopped)
{
	int i;
	enum txrx t;

	if (!nm_netmap_on(na))
		return;

	for_rx_tx(t) {
		for (i = 0; i < netmap_real_rings(na, t); i++) {
			netmap_set_ring(na, i, t, stopped);
		}
	}
}

/*
 * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
 * to finish and prevents any new one from starting.  Call this before turning
 * netmap mode off, or before removing the hardware rings (e.g., on module
 * unload).
 */
void
netmap_disable_all_rings(struct ifnet *ifp)
{
	if (NM_NA_VALID(ifp)) {
		netmap_set_all_rings(NA(ifp), NM_KR_STOPPED);
	}
}

/*
 * Convenience function used in drivers.  Re-enables rxsync and txsync on the
 * adapter's rings.  In linux drivers, this should be placed near each
 * napi_enable().
 */
void
netmap_enable_all_rings(struct ifnet *ifp)
{
	if (NM_NA_VALID(ifp)) {
		netmap_set_all_rings(NA(ifp), 0 /* enabled */);
	}
}

void
netmap_make_zombie(struct ifnet *ifp)
{
	if (NM_NA_VALID(ifp)) {
		struct netmap_adapter *na = NA(ifp);
		netmap_set_all_rings(na, NM_KR_LOCKED);
		na->na_flags |= NAF_ZOMBIE;
		netmap_set_all_rings(na, 0);
	}
}

void
netmap_undo_zombie(struct ifnet *ifp)
{
	if (NM_NA_VALID(ifp)) {
		struct netmap_adapter *na = NA(ifp);
		if (na->na_flags & NAF_ZOMBIE) {
			netmap_set_all_rings(na, NM_KR_LOCKED);
			na->na_flags &= ~NAF_ZOMBIE;
			netmap_set_all_rings(na, 0);
		}
	}
}

/*
 * generic bound_checking function
 */
u_int
nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
{
	u_int oldv = *v;
	const char *op = NULL;

	if (dflt < lo)
		dflt = lo;
	if (dflt > hi)
		dflt = hi;
	if (oldv < lo) {
		*v = dflt;
		op = "Bump";
	} else if (oldv > hi) {
		*v = dflt;
		op = "Clamp";
	}
	if (op && msg)
		nm_prinf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
	return *v;
}

/*
 * packet-dump function, user-supplied or static buffer.
 * The destination buffer must be at least 30+4*len
 */
const char *
nm_dump_buf(char *p, int len, int lim, char *dst)
{
	static char _dst[8192];
	int i, j, i0;
	static char hex[] ="0123456789abcdef";
	char *o;	/* output position */

#define P_HI(x)	hex[((x) & 0xf0)>>4]
#define P_LO(x)	hex[((x) & 0xf)]
#define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
	if (!dst)
		dst = _dst;
	if (lim <= 0 || lim > len)
		lim = len;
	o = dst;
	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
	o += strlen(o);
	/* hexdump routine */
	for (i = 0; i < lim; ) {
		sprintf(o, "%5d: ", i);
		o += strlen(o);
		memset(o, ' ', 48);
		i0 = i;
		for (j=0; j < 16 && i < lim; i++, j++) {
			o[j*3] = P_HI(p[i]);
			o[j*3+1] = P_LO(p[i]);
		}
		i = i0;
		for (j=0; j < 16 && i < lim; i++, j++)
			o[j + 48] = P_C(p[i]);
		o[j+48] = '\n';
		o += j+49;
	}
	*o = '\0';
#undef P_HI
#undef P_LO
#undef P_C
	return dst;
}

/*
 * Fetch configuration from the device, to cope with dynamic
 * reconfigurations after loading the module.
 */
/* call with NMG_LOCK held */
int
netmap_update_config(struct netmap_adapter *na)
{
	struct nm_config_info info;

	bzero(&info, sizeof(info));
	if (na->nm_config == NULL ||
	    na->nm_config(na, &info)) {
		/* take whatever we had at init time */
		info.num_tx_rings = na->num_tx_rings;
		info.num_tx_descs = na->num_tx_desc;
		info.num_rx_rings = na->num_rx_rings;
		info.num_rx_descs = na->num_rx_desc;
		info.rx_buf_maxsize = na->rx_buf_maxsize;
	}

	if (na->num_tx_rings == info.num_tx_rings &&
	    na->num_tx_desc == info.num_tx_descs &&
	    na->num_rx_rings == info.num_rx_rings &&
	    na->num_rx_desc == info.num_rx_descs &&
	    na->rx_buf_maxsize == info.rx_buf_maxsize)
		return 0; /* nothing changed */
	if (na->active_fds == 0) {
		na->num_tx_rings = info.num_tx_rings;
		na->num_tx_desc = info.num_tx_descs;
		na->num_rx_rings = info.num_rx_rings;
		na->num_rx_desc = info.num_rx_descs;
		na->rx_buf_maxsize = info.rx_buf_maxsize;
		D("configuration changed for %s: txring %d x %d, "
			"rxring %d x %d, rxbufsz %d",
			na->name, na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc, na->rx_buf_maxsize);
		return 0;
	}
	D("WARNING: configuration changed for %s while active: "
		"txring %d x %d, rxring %d x %d, rxbufsz %d",
		na->name, info.num_tx_rings, info.num_tx_descs,
		info.num_rx_rings, info.num_rx_descs,
		info.rx_buf_maxsize);
	return 1;
}

/* nm_sync callbacks for the host rings */
static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);

/* create the krings array and initialize the fields common to all adapters.
 * The array layout is this:
 *
 *                    +----------+
 * na->tx_rings ----->|          | \
 *                    |          |  } na->num_tx_rings
 *                    |          | /
 *                    +----------+
 *                    |          |    host tx kring
 * na->rx_rings ----> +----------+
 *                    |          | \
 *                    |          |  } na->num_rx_rings
 *                    |          | /
 *                    +----------+
 *                    |          |    host rx kring
 *                    +----------+
 * na->tailroom ----->|          | \
 *                    |          |  } tailroom bytes
 *                    |          | /
 *                    +----------+
 *
 * Note: for compatibility, host krings are created even when not needed.
 * The tailroom space is currently used by vale ports for allocating leases.
 */
/* call with NMG_LOCK held */
int
netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
{
	u_int i, len, ndesc;
	struct netmap_kring *kring;
	u_int n[NR_TXRX];
	enum txrx t;

	if (na->tx_rings != NULL) {
		D("warning: krings were already created");
		return 0;
	}

	/* account for the (possibly fake) host rings */
	n[NR_TX] = netmap_all_rings(na, NR_TX);
	n[NR_RX] = netmap_all_rings(na, NR_RX);

	len = (n[NR_TX] + n[NR_RX]) *
		(sizeof(struct netmap_kring) + sizeof(struct netmap_kring *))
		+ tailroom;

	na->tx_rings = nm_os_malloc((size_t)len);
	if (na->tx_rings == NULL) {
		D("Cannot allocate krings");
		return ENOMEM;
	}
	na->rx_rings = na->tx_rings + n[NR_TX];
	na->tailroom = na->rx_rings + n[NR_RX];

	/* link the krings in the krings array */
	kring = (struct netmap_kring *)((char *)na->tailroom + tailroom);
	for (i = 0; i < n[NR_TX] + n[NR_RX]; i++) {
		na->tx_rings[i] = kring;
		kring++;
	}

	/*
	 * All fields in krings are 0 except the one initialized below.
	 * but better be explicit on important kring fields.
	 */
	for_rx_tx(t) {
		ndesc = nma_get_ndesc(na, t);
		for (i = 0; i < n[t]; i++) {
			kring = NMR(na, t)[i];
			bzero(kring, sizeof(*kring));
			kring->na = na;
			kring->notify_na = na;
			kring->ring_id = i;
			kring->tx = t;
			kring->nkr_num_slots = ndesc;
			kring->nr_mode = NKR_NETMAP_OFF;
			kring->nr_pending_mode = NKR_NETMAP_OFF;
			if (i < nma_get_nrings(na, t)) {
				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
			} else {
				if (!(na->na_flags & NAF_HOST_RINGS))
					kring->nr_kflags |= NKR_FAKERING;
				kring->nm_sync = (t == NR_TX ?
						netmap_txsync_to_host:
						netmap_rxsync_from_host);
			}
			kring->nm_notify = na->nm_notify;
			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
			/*
			 * IMPORTANT: Always keep one slot empty.
			 */
			kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
			snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
					nm_txrx2str(t), i);
			ND("ktx %s h %d c %d t %d",
				kring->name, kring->rhead, kring->rcur, kring->rtail);
			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
			nm_os_selinfo_init(&kring->si);
		}
		nm_os_selinfo_init(&na->si[t]);
	}

	return 0;
}

/* undo the actions performed by netmap_krings_create */
/* call with NMG_LOCK held */
void
netmap_krings_delete(struct netmap_adapter *na)
{
	struct netmap_kring **kring = na->tx_rings;
	enum txrx t;

	if (na->tx_rings == NULL) {
		D("warning: krings were already deleted");
		return;
	}

	for_rx_tx(t)
		nm_os_selinfo_uninit(&na->si[t]);

	/* we rely on the krings layout described above */
	for ( ; kring != na->tailroom; kring++) {
		mtx_destroy(&(*kring)->q_lock);
		nm_os_selinfo_uninit(&(*kring)->si);
	}
	nm_os_free(na->tx_rings);
	na->tx_rings = na->rx_rings = na->tailroom = NULL;
}

/*
 * Destructor for NIC ports. They also have an mbuf queue
 * on the rings connected to the host so we need to purge
 * them first.
 */
/* call with NMG_LOCK held */
void
netmap_hw_krings_delete(struct netmap_adapter *na)
{
	u_int lim = netmap_real_rings(na, NR_RX), i;

	for (i = nma_get_nrings(na, NR_RX); i < lim; i++) {
		struct mbq *q = &NMR(na, NR_RX)[i]->rx_queue;
		ND("destroy sw mbq with len %d", mbq_len(q));
		mbq_purge(q);
		mbq_safe_fini(q);
	}
	netmap_krings_delete(na);
}

static void
netmap_mem_drop(struct netmap_adapter *na)
{
	int last = netmap_mem_deref(na->nm_mem, na);
	/* if the native allocator had been overridden on regif,
	 * restore it now and drop the temporary one
	 */
	if (last && na->nm_mem_prev) {
		netmap_mem_put(na->nm_mem);
		na->nm_mem = na->nm_mem_prev;
		na->nm_mem_prev = NULL;
	}
}

/*
 * Undo everything that was done in netmap_do_regif(). In particular,
 * call nm_register(ifp,0) to stop netmap mode on the interface and
 * revert to normal operation.
 */
/* call with NMG_LOCK held */
static void netmap_unset_ringid(struct netmap_priv_d *);
static void netmap_krings_put(struct netmap_priv_d *);
void
netmap_do_unregif(struct netmap_priv_d *priv)
{
	struct netmap_adapter *na = priv->np_na;

	NMG_LOCK_ASSERT();
	na->active_fds--;
	/* unset nr_pending_mode and possibly release exclusive mode */
	netmap_krings_put(priv);

	/* XXX check whether we have to do something with monitor
	 * when rings change nr_mode. */
	if (na->active_fds <= 0) {
		/* walk through all the rings and tell any monitor
		 * that the port is going to exit netmap mode
		 */
		netmap_monitor_stop(na);
	}

	if (na->active_fds <= 0 || nm_kring_pending(priv)) {
		na->nm_register(na, 0);
	}

	/* delete rings and buffers that are no longer needed */
	netmap_mem_rings_delete(na);

	if (na->active_fds <= 0) {	/* last instance */
		/*
		 * (TO CHECK) We enter here
		 * when the last reference to this file descriptor goes
		 * away. This means we cannot have any pending poll()
		 * or interrupt routine operating on the structure.
		 * XXX The file may be closed in a thread while
		 * another thread is using it.
		 * Linux keeps the file opened until the last reference
		 * by any outstanding ioctl/poll or mmap is gone.
		 * FreeBSD does not track mmap()s (but we do) and
		 * wakes up any sleeping poll(). Need to check what
		 * happens if the close() occurs while a concurrent
		 * syscall is running.
		 */
		if (netmap_verbose)
			D("deleting last instance for %s", na->name);

		if (nm_netmap_on(na)) {
			D("BUG: netmap on while going to delete the krings");
		}

		na->nm_krings_delete(na);
	}

	/* possibly decrement counter of tx_si/rx_si users */
	netmap_unset_ringid(priv);
	/* delete the nifp */
	netmap_mem_if_delete(na, priv->np_nifp);
	/* drop the allocator */
	netmap_mem_drop(na);
	/* mark the priv as unregistered */
	priv->np_na = NULL;
	priv->np_nifp = NULL;
}

/* call with NMG_LOCK held */
static __inline int
nm_si_user(struct netmap_priv_d *priv, enum txrx t)
{
	return (priv->np_na != NULL &&
		(priv->np_qlast[t] - priv->np_qfirst[t] > 1));
}

struct netmap_priv_d*
netmap_priv_new(void)
{
	struct netmap_priv_d *priv;

	priv = nm_os_malloc(sizeof(struct netmap_priv_d));
	if (priv == NULL)
		return NULL;
	priv->np_refs = 1;
	nm_os_get_module();
	return priv;
}

/*
 * Destructor of the netmap_priv_d, called when the fd is closed
 * Action: undo all the things done by NIOCREGIF,
 * On FreeBSD we need to track whether there are active mmap()s,
 * and we use np_active_mmaps for that. On linux, the field is always 0.
 * Return: 1 if we can free priv, 0 otherwise.
 */
/* call with NMG_LOCK held */
int
netmap_priv_delete(struct netmap_priv_d *priv)
{
	struct netmap_adapter *na = priv->np_na;

	/* number of active references to this fd */
	if (--priv->np_refs > 0) {
		return 0;
	}
	nm_os_put_module();
	if (na) {
		netmap_do_unregif(priv);
	}
	netmap_unget_na(na, priv->np_ifp);
	bzero(priv, sizeof(*priv));	/* for safety */
	nm_os_free(priv);
	return 1;
}

/* call with NMG_LOCK *not* held */
void
netmap_dtor(void *data)
{
	struct netmap_priv_d *priv = data;

	NMG_LOCK();
	netmap_priv_delete(priv);
	NMG_UNLOCK();
}

/*
 * Handlers for synchronization of the rings from/to the host stack.
 * These are associated to a network interface and are just another
 * ring pair managed by userspace.
 *
 * Netmap also supports transparent forwarding (NS_FORWARD and NR_FORWARD
 * flags):
 *
 * - Before releasing buffers on hw RX rings, the application can mark
 *   them with the NS_FORWARD flag. During the next RXSYNC or poll(), they
 *   will be forwarded to the host stack, similarly to what happens if
 *   the application moved them to the host TX ring.
 *
 * - Before releasing buffers on the host RX ring, the application can
 *   mark them with the NS_FORWARD flag. During the next RXSYNC or poll(),
 *   they will be forwarded to the hw TX rings, saving the application
 *   from doing the same task in user-space.
 *
 * Transparent forwarding can be enabled per-ring, by setting the NR_FORWARD
 * flag, or globally with the netmap_fwd sysctl.
 *
 * The transfer NIC --> host is relatively easy, just encapsulate
 * into mbufs and we are done. The host --> NIC side is slightly
 * harder because there might not be room in the tx ring so it
 * might take a while before releasing the buffer.
 */
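
/*
 * For example, a receiver could mark a slot before advancing head
 * (illustrative userspace sketch, assuming a ring obtained as described
 * in netmap(4); not part of this file):
 *
 *	struct netmap_slot *slot = &ring->slot[ring->cur];
 *	slot->flags |= NS_FORWARD;	// copy to the host stack on next sync
 *	ring->head = ring->cur = nm_ring_next(ring, ring->cur);
 */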

/*
 * Pass a whole queue of mbufs to the host stack as coming from 'dst'
 * We do not need to lock because the queue is private.
 * After this call the queue is empty.
 */
static void
netmap_send_up(struct ifnet *dst, struct mbq *q)
{
	struct mbuf *m;
	struct mbuf *head = NULL, *prev = NULL;

	/* Send packets up, outside the lock; head/prev machinery
	 * is only useful for Windows. */
	while ((m = mbq_dequeue(q)) != NULL) {
		if (netmap_verbose & NM_VERB_HOST)
			D("sending up pkt %p size %d", m, MBUF_LEN(m));
		prev = nm_os_send_up(dst, m, prev);
		if (head == NULL)
			head = prev;
	}
	if (head)
		nm_os_send_up(dst, NULL, head);
	mbq_fini(q);
}

/*
 * Scan the buffers from hwcur to ring->head, and put a copy of those
 * marked NS_FORWARD (or all of them if forced) into a queue of mbufs.
 * Drop remaining packets in the unlikely event
 * of an mbuf shortage.
 */
static void
netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
{
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;
	u_int n;
	struct netmap_adapter *na = kring->na;

	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
		struct mbuf *m;
		struct netmap_slot *slot = &kring->ring->slot[n];

		if ((slot->flags & NS_FORWARD) == 0 && !force)
			continue;
		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
			RD(5, "bad pkt at %d len %d", n, slot->len);
			continue;
		}
		slot->flags &= ~NS_FORWARD; // XXX needed ?
		/* XXX TODO: adapt to the case of a multisegment packet */
		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);

		if (m == NULL)
			break;
		mbq_enqueue(q, m);
	}
}

static inline int
_nm_may_forward(struct netmap_kring *kring)
{
	return	((netmap_fwd || kring->ring->flags & NR_FORWARD) &&
		 kring->na->na_flags & NAF_HOST_RINGS &&
		 kring->tx == NR_RX);
}

static inline int
nm_may_forward_up(struct netmap_kring *kring)
{
	return	_nm_may_forward(kring) &&
		 kring->ring_id != kring->na->num_rx_rings;
}

static inline int
nm_may_forward_down(struct netmap_kring *kring, int sync_flags)
{
	return	_nm_may_forward(kring) &&
		 (sync_flags & NAF_CAN_FORWARD_DOWN) &&
		 kring->ring_id == kring->na->num_rx_rings;
}

/*
 * Send to the NIC rings packets marked NS_FORWARD between
 * kring->nr_hwcur and kring->rhead.
 * Called under kring->rx_queue.lock on the sw rx ring.
 *
 * It can only be called if the user opened all the TX hw rings,
 * see NAF_CAN_FORWARD_DOWN flag.
 * We can touch the TX netmap rings (slots, head and cur) since
 * we are in poll/ioctl system call context, and the application
 * is not supposed to touch the ring (using a different thread)
 * during the execution of the system call.
 */
static u_int
netmap_sw_to_nic(struct netmap_adapter *na)
{
	struct netmap_kring *kring = na->rx_rings[na->num_rx_rings];
	struct netmap_slot *rxslot = kring->ring->slot;
	u_int i, rxcur = kring->nr_hwcur;
	u_int const head = kring->rhead;
	u_int const src_lim = kring->nkr_num_slots - 1;
	u_int sent = 0;

	/* scan rings to find space, then fill as much as possible */
	for (i = 0; i < na->num_tx_rings; i++) {
		struct netmap_kring *kdst = na->tx_rings[i];
		struct netmap_ring *rdst = kdst->ring;
		u_int const dst_lim = kdst->nkr_num_slots - 1;

		/* XXX do we trust ring or kring->rcur,rtail ? */
		for (; rxcur != head && !nm_ring_empty(rdst);
		     rxcur = nm_next(rxcur, src_lim) ) {
			struct netmap_slot *src, *dst, tmp;
			u_int dst_head = rdst->head;

			src = &rxslot[rxcur];
			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
				continue;

			sent++;

			dst = &rdst->slot[dst_head];

			tmp = *src;

			src->buf_idx = dst->buf_idx;
			src->flags = NS_BUF_CHANGED;

			dst->buf_idx = tmp.buf_idx;
			dst->len = tmp.len;
			dst->flags = NS_BUF_CHANGED;

			rdst->head = rdst->cur = nm_next(dst_head, dst_lim);
		}
		/* if (sent) XXX txsync ? it would be just an optimization */
	}
	return sent;
}

/*
 * netmap_txsync_to_host() passes packets up. We are called from a
 * system call in user process context, and the only contention
 * can be among multiple user threads erroneously calling
 * this routine concurrently.
 */
static int
netmap_txsync_to_host(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;
	struct mbq q;

	/* Take packets from hwcur to head and pass them up.
	 * Force hwcur = head since netmap_grab_packets() stops at head
	 */
	mbq_init(&q);
	netmap_grab_packets(kring, &q, 1 /* force */);
	ND("have %d pkts in queue", mbq_len(&q));
	kring->nr_hwcur = head;
	kring->nr_hwtail = head + lim;
	if (kring->nr_hwtail > lim)
		kring->nr_hwtail -= lim + 1;	/* i.e. hwtail = head - 1 mod ring size */

	netmap_send_up(na->ifp, &q);
	return 0;
}

/*
 * rxsync backend for packets coming from the host stack.
 * They have been put in kring->rx_queue by netmap_transmit().
 * We protect access to the kring using kring->rx_queue.lock
 *
 * also moves to the nic hw rings any packet the user has marked
 * for transparent-mode forwarding, then sets the NR_FORWARD
 * flag in the kring to let the caller push them out
 */
static int
netmap_rxsync_from_host(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_ring *ring = kring->ring;
	u_int nm_i, n;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const head = kring->rhead;
	int ret = 0;
	struct mbq *q = &kring->rx_queue, fq;

	mbq_init(&fq); /* fq holds packets to be freed */

	mbq_lock(q);

	/* First part: import newly received packets */
	n = mbq_len(q);
	if (n) { /* grab packets from the queue */
		struct mbuf *m;
		uint32_t stop_i;

		nm_i = kring->nr_hwtail;
		stop_i = nm_prev(kring->nr_hwcur, lim);
		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
			int len = MBUF_LEN(m);
			struct netmap_slot *slot = &ring->slot[nm_i];

			m_copydata(m, 0, len, NMB(na, slot));
			ND("nm %d len %d", nm_i, len);
			if (netmap_verbose)
				D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));

			slot->len = len;
			slot->flags = 0;
			nm_i = nm_next(nm_i, lim);
			mbq_enqueue(&fq, m);
		}
		kring->nr_hwtail = nm_i;
	}

	/*
	 * Second part: skip past packets that userspace has released.
	 */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) { /* something was released */
		if (nm_may_forward_down(kring, flags)) {
			ret = netmap_sw_to_nic(na);
			if (ret > 0) {
				kring->nr_kflags |= NR_FORWARD;
				ret = 0;
			}
		}
		kring->nr_hwcur = head;
	}

	mbq_unlock(q);

	mbq_purge(&fq);
	mbq_fini(&fq);

	return ret;
}

/* Get a netmap adapter for the port.
 *
 * If it is possible to satisfy the request, return 0
 * with *na containing the netmap adapter found.
 * Otherwise return an error code, with *na containing NULL.
 *
 * When the port is attached to a bridge, we always return
 * EBUSY.
 * Otherwise, if the port is already bound to a file descriptor,
 * then we unconditionally return the existing adapter into *na.
 * In all the other cases, we return (into *na) either native,
 * generic or NULL, according to the following table:
 *
 *					native_support
 * active_fds   dev.netmap.admode         YES     NO
 * -------------------------------------------------------
 *    >0              *                 NA(ifp) NA(ifp)
 *
 *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
 *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
 *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
 *
 */
static void netmap_hw_dtor(struct netmap_adapter *);	/* needed by NM_IS_NATIVE() */
int
netmap_get_hw_na(struct ifnet *ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na)
{
	/* generic support */
	int i = netmap_admode;	/* Take a snapshot. */
	struct netmap_adapter *prev_na;
	int error = 0;

	*na = NULL; /* default */

	/* reset in case of invalid value */
	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
		i = netmap_admode = NETMAP_ADMODE_BEST;

	if (NM_NA_VALID(ifp)) {
		prev_na = NA(ifp);
		/* If an adapter already exists, return it if
		 * there are active file descriptors or if
		 * netmap is not forced to use generic
		 * adapters.
		 */
		if (NETMAP_OWNED_BY_ANY(prev_na)
			|| i != NETMAP_ADMODE_GENERIC
			|| prev_na->na_flags & NAF_FORCE_NATIVE
#ifdef WITH_PIPES
			/* ugly, but we cannot allow an adapter switch
			 * if some pipe is referring to this one
			 */
			|| prev_na->na_next_pipe > 0
#endif
		) {
			*na = prev_na;
			goto assign_mem;
		}
	}

	/* If there isn't native support and netmap is not allowed
	 * to use generic adapters, we cannot satisfy the request.
	 */
	if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE)
		return EOPNOTSUPP;

	/* Otherwise, create a generic adapter and return it,
	 * saving the previously used netmap adapter, if any.
	 *
	 * Note that here 'prev_na', if not NULL, MUST be a
	 * native adapter, and CANNOT be a generic one. This is
	 * true because generic adapters are created on demand, and
	 * destroyed when not used anymore. Therefore, if the adapter
	 * currently attached to an interface 'ifp' is generic, it
	 * must be that
	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
	 * Consequently, if NA(ifp) is generic, we will enter one of
	 * the branches above. This ensures that we never override
	 * a generic adapter with another generic adapter.
	 */
	error = generic_netmap_attach(ifp);
	if (error)
		return error;

	*na = NA(ifp);

assign_mem:
	if (nmd != NULL && !((*na)->na_flags & NAF_MEM_OWNER) &&
	    (*na)->active_fds == 0 && ((*na)->nm_mem != nmd)) {
		(*na)->nm_mem_prev = (*na)->nm_mem;
		(*na)->nm_mem = netmap_mem_get(nmd);
	}

	return 0;
}

/*
 * MUST BE CALLED UNDER NMG_LOCK()
 *
 * Get a refcounted reference to a netmap adapter attached
 * to the interface specified by req.
 * This is always called in the execution of an ioctl().
 *
 * Return ENXIO if the interface specified by the request does
 * not exist, ENOTSUP if netmap is not supported by the interface,
 * EBUSY if the interface is already attached to a bridge,
 * EINVAL if parameters are invalid, ENOMEM if needed resources
 * could not be allocated.
 * If successful, hold a reference to the netmap adapter.
 *
 * If the interface specified by req is a system one, also keep
 * a reference to it and return a valid *ifp.
 */
int
netmap_get_na(struct nmreq_header *hdr,
	      struct netmap_adapter **na, struct ifnet **ifp,
	      struct netmap_mem_d *nmd, int create)
{
	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
	int error = 0;
	struct netmap_adapter *ret = NULL;
	int nmd_ref = 0;

	*na = NULL;     /* default return value */
	*ifp = NULL;

	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
		return EINVAL;
	}

	if (req->nr_mode == NR_REG_PIPE_MASTER ||
			req->nr_mode == NR_REG_PIPE_SLAVE) {
		/* Do not accept deprecated pipe modes. */
		D("Deprecated pipe nr_mode, use xx{yy or xx}yy syntax");
		return EINVAL;
	}

	NMG_LOCK_ASSERT();

	/* if the request contain a memid, try to find the
	 * corresponding memory region
	 */
	if (nmd == NULL && req->nr_mem_id) {
		nmd = netmap_mem_find(req->nr_mem_id);
		if (nmd == NULL)
			return EINVAL;
		/* keep the reference */
		nmd_ref = 1;
	}

	/* We cascade through all possible types of netmap adapter.
	 * All netmap_get_*_na() functions return an error and an na,
	 * with the following combinations:
	 *
	 * error    na
	 *   0	   NULL		type doesn't match
	 *  !0	   NULL		type matches, but na creation/lookup failed
	 *   0	  !NULL		type matches and na created/found
	 *  !0    !NULL		impossible
	 */

	/* try to see if this is a ptnetmap port */
	error = netmap_get_pt_host_na(hdr, na, nmd, create);
	if (error || *na != NULL)
		goto out;

	/* try to see if this is a monitor port */
	error = netmap_get_monitor_na(hdr, na, nmd, create);
	if (error || *na != NULL)
		goto out;

	/* try to see if this is a pipe port */
	error = netmap_get_pipe_na(hdr, na, nmd, create);
	if (error || *na != NULL)
		goto out;

	/* try to see if this is a bridge port */
	error = netmap_get_vale_na(hdr, na, nmd, create);
	if (error)
		goto out;

	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
		goto out;

	/*
	 * This must be a hardware na, lookup the name in the system.
	 * Note that by hardware we actually mean "it shows up in ifconfig".
	 * This may still be a tap, a veth/epair, or even a
	 * persistent VALE port.
	 */
	*ifp = ifunit_ref(hdr->nr_name);
	if (*ifp == NULL) {
		error = ENXIO;
		goto out;
	}

	error = netmap_get_hw_na(*ifp, nmd, &ret);
	if (error)
		goto out;

	*na = ret;
	netmap_adapter_get(ret);

out:
	if (error) {
		if (ret)
			netmap_adapter_put(ret);
		if (*ifp) {
			if_rele(*ifp);
			*ifp = NULL;
		}
	}
	if (nmd_ref)
		netmap_mem_put(nmd);

	return error;
}

/* undo netmap_get_na() */
void
netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp)
{
	if (ifp)
		if_rele(ifp);
	if (na)
		netmap_adapter_put(na);
}

#define NM_FAIL_ON(t) do {						\
	if (unlikely(t)) {						\
		RD(5, "%s: fail '" #t "' "				\
			"h %d c %d t %d "				\
			"rh %d rc %d rt %d "				\
			"hc %d ht %d",					\
			kring->name,					\
			head, cur, ring->tail,				\
			kring->rhead, kring->rcur, kring->rtail,	\
			kring->nr_hwcur, kring->nr_hwtail);		\
		return kring->nkr_num_slots;				\
	}								\
} while (0)

/*
 * validate parameters on entry for *_txsync()
 * Returns ring->cur if ok, or something >= kring->nkr_num_slots
 * in case of error.
 *
 * rhead, rcur and rtail=hwtail are stored from previous round.
 * hwcur is the next packet to send to the ring.
 *
 * We want
 *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
 *
 * hwcur, rhead, rtail and hwtail are reliable
 */
u_int
nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
{
	u_int head = ring->head; /* read only once */
	u_int cur = ring->cur; /* read only once */
	u_int n = kring->nkr_num_slots;

	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
		kring->name,
		kring->nr_hwcur, kring->nr_hwtail,
		ring->head, ring->cur, ring->tail);
#if 1 /* kernel sanity checks; but we can trust the kring. */
	NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n ||
	    kring->rtail >= n ||  kring->nr_hwtail >= n);
#endif /* kernel sanity checks */
	/*
	 * user sanity checks. We only use head,
	 * A, B, ... are possible positions for head:
	 *
	 *  0    A  rhead   B  rtail   C  n-1
	 *  0    D  rtail   E  rhead   F  n-1
	 *
	 * B, F, D are valid. A, C, E are wrong
	 */
	if (kring->rtail >= kring->rhead) {
		/* want rhead <= head <= rtail */
		NM_FAIL_ON(head < kring->rhead || head > kring->rtail);
		/* and also head <= cur <= rtail */
		NM_FAIL_ON(cur < head || cur > kring->rtail);
	} else { /* here rtail < rhead */
		/* we need head outside rtail .. rhead */
		NM_FAIL_ON(head > kring->rtail && head < kring->rhead);

		/* two cases now: head <= rtail or head >= rhead  */
		if (head <= kring->rtail) {
			/* want head <= cur <= rtail */
			NM_FAIL_ON(cur < head || cur > kring->rtail);
		} else { /* head >= rhead */
			/* cur must be outside rtail..head */
			NM_FAIL_ON(cur > kring->rtail && cur < head);
		}
	}
	if (ring->tail != kring->rtail) {
		RD(5, "%s tail overwritten was %d need %d", kring->name,
			ring->tail, kring->rtail);
		ring->tail = kring->rtail;
	}
	kring->rhead = head;
	kring->rcur = cur;
	return head;
}

/*
 * validate parameters on entry for *_rxsync()
 * Returns ring->head if ok, kring->nkr_num_slots on error.
 *
 * For a valid configuration,
 * hwcur <= head <= cur <= tail <= hwtail
 *
 * We only consider head and cur.
 * hwcur and hwtail are reliable.
 */
u_int
nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
{
	uint32_t const n = kring->nkr_num_slots;
	uint32_t head, cur;

	ND(5,"%s kc %d kt %d h %d c %d t %d",
		kring->name,
		kring->nr_hwcur, kring->nr_hwtail,
		ring->head, ring->cur, ring->tail);
	/*
	 * Before storing the new values, we should check they do not
	 * move backwards. However:
	 * - head is not an issue because the previous value is hwcur;
	 * - cur could in principle go back, however it does not matter
	 *   because we are processing a brand new rxsync()
	 */
	cur = kring->rcur = ring->cur;	/* read only once */
	head = kring->rhead = ring->head;	/* read only once */
#if 1 /* kernel sanity checks */
	NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n);
#endif /* kernel sanity checks */
	/* user sanity checks */
	if (kring->nr_hwtail >= kring->nr_hwcur) {
		/* want hwcur <= rhead <= hwtail */
		NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail);
		/* and also rhead <= rcur <= hwtail */
		NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
	} else {
		/* we need rhead outside hwtail..hwcur */
		NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail);
		/* two cases now: head <= hwtail or head >= hwcur  */
		if (head <= kring->nr_hwtail) {
			/* want head <= cur <= hwtail */
			NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
		} else {
			/* cur must be outside hwtail..head */
			NM_FAIL_ON(cur < head && cur > kring->nr_hwtail);
		}
	}
	if (ring->tail != kring->rtail) {
		RD(5, "%s tail overwritten was %d need %d",
			kring->name,
			ring->tail, kring->rtail);
		ring->tail = kring->rtail;
	}
	return head;
}

/*
 * Error routine called when txsync/rxsync detects an error.
 * Can't do much more than resetting head = cur = hwcur, tail = hwtail
 * Return 1 on reinit.
 *
 * This routine is only called by the upper half of the kernel.
 * It only reads hwcur (which is changed only by the upper half, too)
 * and hwtail (which may be changed by the lower half, but only on
 * a tx ring and only to increase it, so any error will be recovered
 * on the next call). For the above, we don't strictly need to call
 * it under lock.
 */
int
netmap_ring_reinit(struct netmap_kring *kring)
{
	struct netmap_ring *ring = kring->ring;
	u_int i, lim = kring->nkr_num_slots - 1;
	int errors = 0;

	// XXX KASSERT nm_kr_tryget
	RD(10, "called for %s", kring->name);
	// XXX probably wrong to trust userspace
	kring->rhead = ring->head;
	kring->rcur  = ring->cur;
	kring->rtail = ring->tail;

	if (ring->cur > lim)
		errors++;
	if (ring->head > lim)
		errors++;
	if (ring->tail > lim)
		errors++;
	for (i = 0; i <= lim; i++) {
		u_int idx = ring->slot[i].buf_idx;
		u_int len = ring->slot[i].len;
		if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
			ring->slot[i].buf_idx = 0;
			ring->slot[i].len = 0;
		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
			ring->slot[i].len = 0;
			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
		}
	}
	if (errors) {
		RD(10, "total %d errors", errors);
		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
			kring->name,
			ring->cur, kring->nr_hwcur,
			ring->tail, kring->nr_hwtail);
		ring->head = kring->rhead = kring->nr_hwcur;
		ring->cur  = kring->rcur  = kring->nr_hwcur;
		ring->tail = kring->rtail = kring->nr_hwtail;
	}
	return (errors ? 1 : 0);
}

/* interpret the ringid and flags fields of an nmreq, by translating them
 * into a pair of intervals of ring indices:
 *
 * [priv->np_txqfirst, priv->np_txqlast) and
 * [priv->np_rxqfirst, priv->np_rxqlast)
 *
 */
int
netmap_interp_ringid(struct netmap_priv_d *priv, uint32_t nr_mode,
			uint16_t nr_ringid, uint64_t nr_flags)
{
	struct netmap_adapter *na = priv->np_na;
	int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY };
	enum txrx t;
	u_int j;

	if ((nr_flags & NR_PTNETMAP_HOST) && ((nr_mode != NR_REG_ALL_NIC) ||
			nr_flags & (NR_RX_RINGS_ONLY|NR_TX_RINGS_ONLY))) {
		D("Error: only NR_REG_ALL_NIC supported with netmap passthrough");
		return EINVAL;
	}

	for_rx_tx(t) {
		if (nr_flags & excluded_direction[t]) {
			priv->np_qfirst[t] = priv->np_qlast[t] = 0;
			continue;
		}
		switch (nr_mode) {
		case NR_REG_ALL_NIC:
			priv->np_qfirst[t] = 0;
			priv->np_qlast[t] = nma_get_nrings(na, t);
			ND("ALL/PIPE: %s %d %d", nm_txrx2str(t),
				priv->np_qfirst[t], priv->np_qlast[t]);
			break;
		case NR_REG_SW:
		case NR_REG_NIC_SW:
			if (!(na->na_flags & NAF_HOST_RINGS)) {
				D("host rings not supported");
				return EINVAL;
			}
			priv->np_qfirst[t] = (nr_mode == NR_REG_SW ?
				nma_get_nrings(na, t) : 0);
			priv->np_qlast[t] = netmap_all_rings(na, t);
			ND("%s: %s %d %d", nr_mode == NR_REG_SW ? "SW" : "NIC+SW",
				nm_txrx2str(t),
				priv->np_qfirst[t], priv->np_qlast[t]);
			break;
		case NR_REG_ONE_NIC:
			if (nr_ringid >= na->num_tx_rings &&
					nr_ringid >= na->num_rx_rings) {
				D("invalid ring id %d", nr_ringid);
				return EINVAL;
			}
			/* if not enough rings, use the first one */
			j = nr_ringid;
			if (j >= nma_get_nrings(na, t))
				j = 0;
			priv->np_qfirst[t] = j;
			priv->np_qlast[t] = j + 1;
			ND("ONE_NIC: %s %d %d", nm_txrx2str(t),
				priv->np_qfirst[t], priv->np_qlast[t]);
			break;
		default:
			D("invalid regif type %d", nr_mode);
			return EINVAL;
		}
	}
	priv->np_flags = nr_flags | nr_mode; // TODO

	/* Allow transparent forwarding mode in the host --> nic
	 * direction only if all the TX hw rings have been opened. */
	if (priv->np_qfirst[NR_TX] == 0 &&
			priv->np_qlast[NR_TX] >= na->num_tx_rings) {
		priv->np_sync_flags |= NAF_CAN_FORWARD_DOWN;
	}

	if (netmap_verbose) {
		D("%s: tx [%d,%d) rx [%d,%d) id %d",
			na->name,
			priv->np_qfirst[NR_TX],
			priv->np_qlast[NR_TX],
			priv->np_qfirst[NR_RX],
			priv->np_qlast[NR_RX],
			nr_ringid);
	}
	return 0;
}
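
/*
 * Worked example of the translation above: on a NIC with 4 tx and 4 rx
 * rings, NR_REG_ONE_NIC with nr_ringid == 2 selects [2, 3) for both
 * directions, while NR_REG_NIC_SW selects [0, 5), i.e. all hw rings
 * plus the host ring.
 */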

/*
 * Set the ring ID. For devices with a single queue, a request
 * for all rings is the same as a single ring.
 */
static int
netmap_set_ringid(struct netmap_priv_d *priv, uint32_t nr_mode,
		uint16_t nr_ringid, uint64_t nr_flags)
{
	struct netmap_adapter *na = priv->np_na;
	int error;
	enum txrx t;

	error = netmap_interp_ringid(priv, nr_mode, nr_ringid, nr_flags);
	if (error) {
		return error;
	}

	priv->np_txpoll = (nr_flags & NR_NO_TX_POLL) ? 0 : 1;

	/* optimization: count the users registered for more than
	 * one ring, which are the ones sleeping on the global queue.
	 * The default netmap_notify() callback will then
	 * avoid signaling the global queue if nobody is using it
	 */
	for_rx_tx(t) {
		if (nm_si_user(priv, t))
			na->si_users[t]++;
	}
	return 0;
}

static void
netmap_unset_ringid(struct netmap_priv_d *priv)
{
	struct netmap_adapter *na = priv->np_na;
	enum txrx t;

	for_rx_tx(t) {
		if (nm_si_user(priv, t))
			na->si_users[t]--;
		priv->np_qfirst[t] = priv->np_qlast[t] = 0;
	}
	priv->np_flags = 0;
	priv->np_txpoll = 0;
}

/* Set the nr_pending_mode for the requested rings.
 * If requested, also try to get exclusive access to the rings, provided
 * the rings we want to bind are not exclusively owned by a previous bind.
 */
static int
netmap_krings_get(struct netmap_priv_d *priv)
{
	struct netmap_adapter *na = priv->np_na;
	u_int i;
	struct netmap_kring *kring;
	int excl = (priv->np_flags & NR_EXCLUSIVE);
	enum txrx t;

	if (netmap_verbose)
		D("%s: grabbing tx [%d, %d) rx [%d, %d)",
			na->name,
			priv->np_qfirst[NR_TX],
			priv->np_qlast[NR_TX],
			priv->np_qfirst[NR_RX],
			priv->np_qlast[NR_RX]);

	/* first round: check that all the requested rings
	 * are neither already exclusively owned, nor we
	 * want exclusive ownership when they are already in use
	 */
	for_rx_tx(t) {
		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
			kring = NMR(na, t)[i];
			if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
			    (kring->users && excl)) {
				ND("ring %s busy", kring->name);
				return EBUSY;
			}
		}
	}

	/* second round: increment usage count (possibly marking them
	 * as exclusive) and set the nr_pending_mode
	 */
	for_rx_tx(t) {
		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
			kring = NMR(na, t)[i];
			kring->users++;
			if (excl)
				kring->nr_kflags |= NKR_EXCLUSIVE;
			kring->nr_pending_mode = NKR_NETMAP_ON;
		}
	}

	return 0;
}
1987 /* Undo netmap_krings_get(). This is done by clearing the exclusive mode
1988 * if was asked on regif, and unset the nr_pending_mode if we are the
1989 * last users of the involved rings. */
1991 netmap_krings_put(struct netmap_priv_d *priv)
1993 struct netmap_adapter *na = priv->np_na;
1995 struct netmap_kring *kring;
1996 int excl = (priv->np_flags & NR_EXCLUSIVE);
1999 ND("%s: releasing tx [%d, %d) rx [%d, %d)",
2000 na->name,
2001 priv->np_qfirst[NR_TX],
2002 priv->np_qlast[NR_TX],
2003 priv->np_qfirst[NR_RX],
2004 priv->np_qlast[NR_RX]);
2007 for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
2008 kring = NMR(na, t)[i];
2010 kring->nr_kflags &= ~NKR_EXCLUSIVE;
2012 if (kring->users == 0)
2013 kring->nr_pending_mode = NKR_NETMAP_OFF;
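/*
 * Example (userspace sketch, illustrative): request exclusive
 * ownership of a single ring at bind time, so that a later bind to
 * the same ring by another descriptor fails with EBUSY:
 *
 *	struct nmreq_register req;
 *
 *	bzero(&req, sizeof(req));
 *	req.nr_mode = NR_REG_ONE_NIC;
 *	req.nr_ringid = 0;
 *	req.nr_flags = NR_EXCLUSIVE;
 *	... issue NIOCCTRL with a NETMAP_REQ_REGISTER header whose
 *	    nr_body points to req (see the example before
 *	    netmap_ioctl() below) ...
 */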
2019 nm_priv_rx_enabled(struct netmap_priv_d *priv)
2021 return (priv->np_qfirst[NR_RX] != priv->np_qlast[NR_RX]);
2025 * possibly move the interface to netmap-mode.
2026 * On success it returns a pointer to the netmap_if, otherwise NULL.
2027 * This must be called with NMG_LOCK held.
2029 * The following na callbacks are called in the process:
2031 * na->nm_config() [by netmap_update_config]
2032 * (get current number and size of rings)
2034 * We have a generic one for linux (netmap_linux_config).
2035 * The bwrap has to override this, since it has to forward
2036 * the request to the wrapped adapter (netmap_bwrap_config).
2039 * na->nm_krings_create()
2040 * (create and init the krings array)
2042 * One of the following:
2044 * * netmap_hw_krings_create, (hw ports)
2045 * creates the standard layout for the krings
2046 * and adds the mbq (used for the host rings).
2048 * * netmap_vp_krings_create (VALE ports)
2049 * add leases and scratchpads
2051 * * netmap_pipe_krings_create (pipes)
2052 * create the krings and rings of both ends and
2053 * cross-link them
2055 * * netmap_monitor_krings_create (monitors)
2056 * avoid allocating the mbq
2058 * * netmap_bwrap_krings_create (bwraps)
2059 * create both the bwrap krings array,
2060 * the krings array of the wrapped adapter, and
2061 * (if needed) the fake array for the host adapter
2063 * na->nm_register(, 1)
2064 * (put the adapter in netmap mode)
2066 * This may be one of the following:
2068 * * netmap_hw_reg (hw ports)
2069 * checks that the ifp is still there, then calls
2070 * the hardware specific callback;
2072 * * netmap_vp_reg (VALE ports)
2073 * If the port is connected to a bridge,
2074 * set the NAF_NETMAP_ON flag under the
2075 * bridge write lock.
2077 * * netmap_pipe_reg (pipes)
2078 * inform the other pipe end that it is no
2079 * longer responsible for the lifetime of this
2080 * pipe end
2082 * * netmap_monitor_reg (monitors)
2083 * intercept the sync callbacks of the monitored
2084 * rings
2086 * * netmap_bwrap_reg (bwraps)
2087 * cross-link the bwrap and hwna rings,
2088 * forward the request to the hwna, override
2089 * the hwna notify callback (to get the frames
2090 * coming from outside go through the bridge).
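/*
 * Example (minimal hardware nm_register callback, illustrative
 * sketch; the foo_* names and softc layout are hypothetical):
 *
 *	static int
 *	foo_netmap_reg(struct netmap_adapter *na, int onoff)
 *	{
 *		struct foo_softc *sc = na->ifp->if_softc;
 *
 *		foo_stop(sc);		// quiesce the hardware
 *		if (onoff)
 *			nm_set_native_flags(na);
 *		else
 *			nm_clear_native_flags(na);
 *		foo_init(sc);		// restart, honoring netmap mode
 *		return (0);
 *	}
 */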
2095 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
2096 uint32_t nr_mode, uint16_t nr_ringid, uint64_t nr_flags)
2098 struct netmap_if *nifp = NULL;
2102 priv->np_na = na; /* store the reference */
2103 error = netmap_mem_finalize(na->nm_mem, na);
2107 if (na->active_fds == 0) {
2109 /* cache the allocator info in the na */
2110 error = netmap_mem_get_lut(na->nm_mem, &na->na_lut);
2113 ND("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal,
2114 na->na_lut.objsize);
2116 /* ring configuration may have changed, fetch from the card */
2117 netmap_update_config(na);
2120 /* compute the range of tx and rx rings to monitor */
2121 error = netmap_set_ringid(priv, nr_mode, nr_ringid, nr_flags);
2125 if (na->active_fds == 0) {
2127 * If this is the first registration of the adapter,
2128 * perform sanity checks and create the in-kernel view
2129 * of the netmap rings (the netmap krings).
2131 if (na->ifp && nm_priv_rx_enabled(priv)) {
2132 /* This netmap adapter is attached to an ifnet. */
2133 unsigned nbs = NETMAP_BUF_SIZE(na);
2134 unsigned mtu = nm_os_ifnet_mtu(na->ifp);
2136 ND("%s: mtu %d rx_buf_maxsize %d netmap_buf_size %d",
2137 na->name, mtu, na->rx_buf_maxsize, nbs);
2139 if (na->rx_buf_maxsize == 0) {
2140 D("%s: error: rx_buf_maxsize == 0", na->name);
2145 if (mtu <= na->rx_buf_maxsize) {
2146 /* The MTU fits a single NIC slot. We only
2147 * need to check that netmap buffers are
2148 * large enough to hold an MTU. NS_MOREFRAG
2149 * cannot be used in this case. */
2151 nm_prerr("error: netmap buf size (%u) "
2152 "< device MTU (%u)\n", nbs, mtu);
2157 /* More NIC slots may be needed to receive
2158 * or transmit a single packet. Check that
2159 * the adapter supports NS_MOREFRAG and that
2160 * netmap buffers are large enough to hold
2161 * the maximum per-slot size. */
2162 if (!(na->na_flags & NAF_MOREFRAG)) {
2163 nm_prerr("error: large MTU (%d) needed "
2164 "but %s does not support "
2165 "NS_MOREFRAG\n", mtu,
2169 } else if (nbs < na->rx_buf_maxsize) {
2170 nm_prerr("error: using NS_MOREFRAG on "
2171 "%s requires netmap buf size "
2172 ">= %u\n", na->ifp->if_xname,
2173 na->rx_buf_maxsize);
2177 nm_prinf("info: netmap application on "
2178 "%s needs to support "
2180 "(MTU=%u,netmap_buf_size=%u)\n",
2181 na->ifp->if_xname, mtu, nbs);
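/* Example (illustrative, assuming rx_buf_maxsize equals the
 * 2048-byte netmap buffer size): a 1500-byte MTU fits in a
 * single slot and NS_MOREFRAG is not needed, while a 9000-byte
 * MTU requires NAF_MOREFRAG and a packet may span up to
 * ceil(9000/2048) = 5 slots. */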
2187 * Depending on the adapter, this may also create
2188 * the netmap rings themselves
2190 error = na->nm_krings_create(na);
2196 /* now the krings must exist and we can check whether some
2197 * previous bind has exclusive ownership on them, and set
2198 * the nr_pending_mode.
2200 error = netmap_krings_get(priv);
2202 goto err_del_krings;
2204 /* create all needed missing netmap rings */
2205 error = netmap_mem_rings_create(na);
2209 /* in all cases, create a new netmap if */
2210 nifp = netmap_mem_if_new(na, priv);
2216 if (nm_kring_pending(priv)) {
2217 /* Some kring is switching mode, tell the adapter to
2218 * react on this. */
2219 error = na->nm_register(na, 1);
2224 /* Commit the reference. */
2228 * advertise that the interface is ready by setting np_nifp.
2229 * The barrier is needed because readers (poll, *SYNC and mmap)
2230 * check for priv->np_nifp != NULL without locking
2232 mb(); /* make sure previous writes are visible to all CPUs */
2233 priv->np_nifp = nifp;
2238 netmap_mem_if_delete(na, nifp);
2240 netmap_krings_put(priv);
2241 netmap_mem_rings_delete(na);
2243 if (na->active_fds == 0)
2244 na->nm_krings_delete(na);
2246 if (na->active_fds == 0)
2247 memset(&na->na_lut, 0, sizeof(na->na_lut));
2249 netmap_mem_drop(na);
2257 * update kring and ring at the end of rxsync/txsync.
2260 nm_sync_finalize(struct netmap_kring *kring)
2263 * Update ring tail to what the kernel knows
2264 * After txsync: head/rhead/hwcur might be behind cur/rcur
2265 * if no carrier.
2267 kring->ring->tail = kring->rtail = kring->nr_hwtail;
2269 ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2270 kring->name, kring->nr_hwcur, kring->nr_hwtail,
2271 kring->rhead, kring->rcur, kring->rtail);
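/*
 * Example (userspace TX sketch, illustrative; fill_packet() is a
 * hypothetical helper returning the packet length):
 *
 *	struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);
 *
 *	while (nm_ring_space(ring) > 0) {
 *		struct netmap_slot *slot = &ring->slot[ring->cur];
 *		char *buf = NETMAP_BUF(ring, slot->buf_idx);
 *
 *		slot->len = fill_packet(buf);
 *		ring->head = ring->cur = nm_ring_next(ring, ring->cur);
 *	}
 *	ioctl(fd, NIOCTXSYNC, NULL);
 *	// on return, ring->tail reflects the kernel state as
 *	// published by nm_sync_finalize()
 */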
2274 /* set ring timestamp */
2276 ring_timestamp_set(struct netmap_ring *ring)
2278 if (netmap_no_timestamp == 0 || ring->flags & NR_TIMESTAMP) {
2279 microtime(&ring->ts);
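/*
 * Example (userspace, illustrative): request per-sync timestamps on
 * a ring and read them back after a sync:
 *
 *	ring->flags |= NR_TIMESTAMP;
 *	ioctl(fd, NIOCRXSYNC, NULL);
 *	printf("synced at %lld.%06ld\n",
 *	    (long long)ring->ts.tv_sec, (long)ring->ts.tv_usec);
 */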
2283 static int nmreq_copyin(struct nmreq_header *, int);
2284 static int nmreq_copyout(struct nmreq_header *, int);
2285 static int nmreq_checkoptions(struct nmreq_header *);
2288 * ioctl(2) support for the "netmap" device.
2290 * The following commands are accepted:
2291 * - NIOCCTRL device control API
2292 * - NIOCTXSYNC sync TX rings
2293 * - NIOCRXSYNC sync RX rings
2294 * - SIOCGIFADDR just for convenience
2295 * - NIOCGINFO deprecated (legacy API)
2296 * - NIOCREGIF deprecated (legacy API)
2298 * Return 0 on success, errno otherwise.
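/*
 * Example (userspace, illustrative; assumes fd was obtained with
 * open("/dev/netmap", O_RDWR)): bind the descriptor to all hw rings
 * of "em0" through NIOCCTRL:
 *
 *	struct nmreq_header hdr;
 *	struct nmreq_register reg;
 *
 *	bzero(&hdr, sizeof(hdr));
 *	bzero(&reg, sizeof(reg));
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_REGISTER;
 *	strlcpy(hdr.nr_name, "em0", sizeof(hdr.nr_name));
 *	hdr.nr_body = (uintptr_t)&reg;
 *	reg.nr_mode = NR_REG_ALL_NIC;
 *	if (ioctl(fd, NIOCCTRL, &hdr) < 0)
 *		err(1, "NIOCCTRL");
 */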
2301 netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
2302 struct thread *td, int nr_body_is_user)
2304 struct mbq q; /* packets from RX hw queues to host stack */
2305 struct netmap_adapter *na = NULL;
2306 struct netmap_mem_d *nmd = NULL;
2307 struct ifnet *ifp = NULL;
2309 u_int i, qfirst, qlast;
2310 struct netmap_if *nifp;
2311 struct netmap_kring **krings;
2317 struct nmreq_header *hdr = (struct nmreq_header *)data;
2319 if (hdr->nr_version != NETMAP_API) {
2320 D("API mismatch for reqtype %d: got %d need %d",
2321 hdr->nr_reqtype,
2322 hdr->nr_version, NETMAP_API);
2323 hdr->nr_version = NETMAP_API;
2325 if (hdr->nr_version < NETMAP_MIN_API ||
2326 hdr->nr_version > NETMAP_MAX_API) {
2330 /* Make a kernel-space copy of the user-space nr_body.
2331 * For convenience, the nr_body pointer and the pointers
2332 * in the options list will be replaced with their
2333 * kernel-space counterparts. The original pointers are
2334 * saved internally and later restored by nmreq_copyout
2336 error = nmreq_copyin(hdr, nr_body_is_user);
2341 /* Sanitize hdr->nr_name. */
2342 hdr->nr_name[sizeof(hdr->nr_name) - 1] = '\0';
2344 switch (hdr->nr_reqtype) {
2345 case NETMAP_REQ_REGISTER: {
2346 struct nmreq_register *req =
2347 (struct nmreq_register *)(uintptr_t)hdr->nr_body;
2348 /* Protect access to priv from concurrent requests. */
2353 struct nmreq_option *opt;
2354 #endif /* WITH_EXTMEM */
2356 if (priv->np_nifp != NULL) { /* thread already registered */
2362 opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options,
2363 NETMAP_REQ_OPT_EXTMEM);
2365 struct nmreq_opt_extmem *e =
2366 (struct nmreq_opt_extmem *)opt;
2368 error = nmreq_checkduplicate(opt);
2370 opt->nro_status = error;
2373 nmd = netmap_mem_ext_create(e->nro_usrptr,
2374 &e->nro_info, &error);
2375 opt->nro_status = error;
2379 #endif /* WITH_EXTMEM */
2381 if (nmd == NULL && req->nr_mem_id) {
2382 /* find the allocator and get a reference */
2383 nmd = netmap_mem_find(req->nr_mem_id);
2389 /* find the interface and a reference */
2390 error = netmap_get_na(hdr, &na, &ifp, nmd,
2391 1 /* create */); /* keep reference */
2394 if (NETMAP_OWNED_BY_KERN(na)) {
2399 if (na->virt_hdr_len && !(req->nr_flags & NR_ACCEPT_VNET_HDR)) {
2404 error = netmap_do_regif(priv, na, req->nr_mode,
2405 req->nr_ringid, req->nr_flags);
2406 if (error) { /* reg. failed, release priv and ref */
2409 nifp = priv->np_nifp;
2410 priv->np_td = td; /* for debugging purposes */
2412 /* return the offset of the netmap_if object */
2413 req->nr_rx_rings = na->num_rx_rings;
2414 req->nr_tx_rings = na->num_tx_rings;
2415 req->nr_rx_slots = na->num_rx_desc;
2416 req->nr_tx_slots = na->num_tx_desc;
2417 error = netmap_mem_get_info(na->nm_mem, &req->nr_memsize, &memflags,
2420 netmap_do_unregif(priv);
2423 if (memflags & NETMAP_MEM_PRIVATE) {
2424 *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2427 priv->np_si[t] = nm_si_user(priv, t) ?
2428 &na->si[t] : &NMR(na, t)[priv->np_qfirst[t]]->si;
2431 if (req->nr_extra_bufs) {
2433 D("requested %d extra buffers",
2434 req->nr_extra_bufs);
2435 req->nr_extra_bufs = netmap_extra_alloc(na,
2436 &nifp->ni_bufs_head, req->nr_extra_bufs);
2438 D("got %d extra buffers", req->nr_extra_bufs);
2440 req->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2442 error = nmreq_checkoptions(hdr);
2444 netmap_do_unregif(priv);
2448 /* store ifp reference so that priv destructor may release it */
2452 netmap_unget_na(na, ifp);
2454 /* release the reference from netmap_mem_find() or
2455 * netmap_mem_ext_create()
2458 netmap_mem_put(nmd);
2463 case NETMAP_REQ_PORT_INFO_GET: {
2464 struct nmreq_port_info_get *req =
2465 (struct nmreq_port_info_get *)(uintptr_t)hdr->nr_body;
2471 if (hdr->nr_name[0] != '\0') {
2472 /* Build a nmreq_register out of the nmreq_port_info_get,
2473 * so that we can call netmap_get_na(). */
2474 struct nmreq_register regreq;
2475 bzero(&regreq, sizeof(regreq));
2476 regreq.nr_tx_slots = req->nr_tx_slots;
2477 regreq.nr_rx_slots = req->nr_rx_slots;
2478 regreq.nr_tx_rings = req->nr_tx_rings;
2479 regreq.nr_rx_rings = req->nr_rx_rings;
2480 regreq.nr_mem_id = req->nr_mem_id;
2482 /* get a refcount */
2483 hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2484 hdr->nr_body = (uintptr_t)&regreq;
2485 error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2486 hdr->nr_reqtype = NETMAP_REQ_PORT_INFO_GET; /* reset type */
2487 hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2493 nmd = na->nm_mem; /* get memory allocator */
2495 nmd = netmap_mem_find(req->nr_mem_id ? req->nr_mem_id : 1);
2502 error = netmap_mem_get_info(nmd, &req->nr_memsize, &memflags,
2506 if (na == NULL) /* only memory info */
2509 req->nr_rx_slots = req->nr_tx_slots = 0;
2510 netmap_update_config(na);
2511 req->nr_rx_rings = na->num_rx_rings;
2512 req->nr_tx_rings = na->num_tx_rings;
2513 req->nr_rx_slots = na->num_rx_desc;
2514 req->nr_tx_slots = na->num_tx_desc;
2516 netmap_unget_na(na, ifp);
2521 case NETMAP_REQ_VALE_ATTACH: {
2522 error = nm_bdg_ctl_attach(hdr, NULL /* userspace request */);
2526 case NETMAP_REQ_VALE_DETACH: {
2527 error = nm_bdg_ctl_detach(hdr, NULL /* userspace request */);
2531 case NETMAP_REQ_VALE_LIST: {
2532 error = netmap_bdg_list(hdr);
2536 case NETMAP_REQ_PORT_HDR_SET: {
2537 struct nmreq_port_hdr *req =
2538 (struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2539 /* Build a nmreq_register out of the nmreq_port_hdr,
2540 * so that we can call netmap_get_bdg_na(). */
2541 struct nmreq_register regreq;
2542 bzero(&regreq, sizeof(regreq));
2543 /* For now we only support virtio-net headers, and only for
2544 * VALE ports, but this may change in the future. Valid lengths
2545 * for the virtio-net header are 0 (no header), 10 and 12. */
2546 if (req->nr_hdr_len != 0 &&
2547 req->nr_hdr_len != sizeof(struct nm_vnet_hdr) &&
2548 req->nr_hdr_len != 12) {
2553 hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2554 hdr->nr_body = (uintptr_t)&regreq;
2555 error = netmap_get_vale_na(hdr, &na, NULL, 0);
2556 hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_SET;
2557 hdr->nr_body = (uintptr_t)req;
2559 struct netmap_vp_adapter *vpna =
2560 (struct netmap_vp_adapter *)na;
2561 na->virt_hdr_len = req->nr_hdr_len;
2562 if (na->virt_hdr_len) {
2563 vpna->mfs = NETMAP_BUF_SIZE(na);
2565 D("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
2566 netmap_adapter_put(na);
2574 case NETMAP_REQ_PORT_HDR_GET: {
2575 /* Get vnet-header length for this netmap port */
2576 struct nmreq_port_hdr *req =
2577 (struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2578 /* Build a nmreq_register out of the nmreq_port_hdr,
2579 * so that we can call netmap_get_bdg_na(). */
2580 struct nmreq_register regreq;
2583 bzero(&regreq, sizeof(regreq));
2585 hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2586 hdr->nr_body = (uintptr_t)&regreq;
2587 error = netmap_get_na(hdr, &na, &ifp, NULL, 0);
2588 hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_GET;
2589 hdr->nr_body = (uintptr_t)req;
2591 req->nr_hdr_len = na->virt_hdr_len;
2593 netmap_unget_na(na, ifp);
2598 case NETMAP_REQ_VALE_NEWIF: {
2599 error = nm_vi_create(hdr);
2603 case NETMAP_REQ_VALE_DELIF: {
2604 error = nm_vi_destroy(hdr->nr_name);
2608 case NETMAP_REQ_VALE_POLLING_ENABLE:
2609 case NETMAP_REQ_VALE_POLLING_DISABLE: {
2610 error = nm_bdg_polling(hdr);
2613 #endif /* WITH_VALE */
2614 case NETMAP_REQ_POOLS_INFO_GET: {
2615 struct nmreq_pools_info *req =
2616 (struct nmreq_pools_info *)(uintptr_t)hdr->nr_body;
2617 /* Get information from the memory allocator. This
2618 * netmap device must already be bound to a port.
2619 * Note that hdr->nr_name is ignored. */
2621 if (priv->np_na && priv->np_na->nm_mem) {
2622 struct netmap_mem_d *nmd = priv->np_na->nm_mem;
2623 error = netmap_mem_pools_info_get(req, nmd);
2636 /* Write back request body to userspace and reset the
2637 * user-space pointer. */
2638 error = nmreq_copyout(hdr, error);
2644 nifp = priv->np_nifp;
2650 mb(); /* make sure following reads are not from cache */
2652 na = priv->np_na; /* we have a reference */
2655 D("Internal error: nifp != NULL && na == NULL");
2661 t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
2662 krings = NMR(na, t);
2663 qfirst = priv->np_qfirst[t];
2664 qlast = priv->np_qlast[t];
2665 sync_flags = priv->np_sync_flags;
2667 for (i = qfirst; i < qlast; i++) {
2668 struct netmap_kring *kring = krings[i];
2669 struct netmap_ring *ring = kring->ring;
2671 if (unlikely(nm_kr_tryget(kring, 1, &error))) {
2672 error = (error ? EIO : 0);
2676 if (cmd == NIOCTXSYNC) {
2677 if (netmap_verbose & NM_VERB_TXSYNC)
2678 D("pre txsync ring %d cur %d hwcur %d",
2681 if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2682 netmap_ring_reinit(kring);
2683 } else if (kring->nm_sync(kring, sync_flags | NAF_FORCE_RECLAIM) == 0) {
2684 nm_sync_finalize(kring);
2686 if (netmap_verbose & NM_VERB_TXSYNC)
2687 D("post txsync ring %d cur %d hwcur %d",
2691 if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2692 netmap_ring_reinit(kring);
2694 if (nm_may_forward_up(kring)) {
2695 /* transparent forwarding, see netmap_poll() */
2696 netmap_grab_packets(kring, &q, netmap_fwd);
2698 if (kring->nm_sync(kring, sync_flags | NAF_FORCE_READ) == 0) {
2699 nm_sync_finalize(kring);
2701 ring_timestamp_set(ring);
2707 netmap_send_up(na->ifp, &q);
2714 return netmap_ioctl_legacy(priv, cmd, data, td);
2723 nmreq_size_by_type(uint16_t nr_reqtype)
2725 switch (nr_reqtype) {
2726 case NETMAP_REQ_REGISTER:
2727 return sizeof(struct nmreq_register);
2728 case NETMAP_REQ_PORT_INFO_GET:
2729 return sizeof(struct nmreq_port_info_get);
2730 case NETMAP_REQ_VALE_ATTACH:
2731 return sizeof(struct nmreq_vale_attach);
2732 case NETMAP_REQ_VALE_DETACH:
2733 return sizeof(struct nmreq_vale_detach);
2734 case NETMAP_REQ_VALE_LIST:
2735 return sizeof(struct nmreq_vale_list);
2736 case NETMAP_REQ_PORT_HDR_SET:
2737 case NETMAP_REQ_PORT_HDR_GET:
2738 return sizeof(struct nmreq_port_hdr);
2739 case NETMAP_REQ_VALE_NEWIF:
2740 return sizeof(struct nmreq_vale_newif);
2741 case NETMAP_REQ_VALE_DELIF:
2743 case NETMAP_REQ_VALE_POLLING_ENABLE:
2744 case NETMAP_REQ_VALE_POLLING_DISABLE:
2745 return sizeof(struct nmreq_vale_polling);
2746 case NETMAP_REQ_POOLS_INFO_GET:
2747 return sizeof(struct nmreq_pools_info);
2753 nmreq_opt_size_by_type(uint16_t nro_reqtype)
2755 size_t rv = sizeof(struct nmreq_option);
2756 #ifdef NETMAP_REQ_OPT_DEBUG
2757 if (nro_reqtype & NETMAP_REQ_OPT_DEBUG)
2758 return (nro_reqtype & ~NETMAP_REQ_OPT_DEBUG);
2759 #endif /* NETMAP_REQ_OPT_DEBUG */
2760 switch (nro_reqtype) {
2762 case NETMAP_REQ_OPT_EXTMEM:
2763 rv = sizeof(struct nmreq_opt_extmem);
2765 #endif /* WITH_EXTMEM */
2767 /* subtract the common header */
2768 return rv - sizeof(struct nmreq_option);
2772 nmreq_copyin(struct nmreq_header *hdr, int nr_body_is_user)
2774 size_t rqsz, optsz, bufsz;
2776 char *ker = NULL, *p;
2777 struct nmreq_option **next, *src;
2778 struct nmreq_option buf;
2781 if (hdr->nr_reserved)
2784 if (!nr_body_is_user)
2787 hdr->nr_reserved = nr_body_is_user;
2789 /* compute the total size of the buffer */
2790 rqsz = nmreq_size_by_type(hdr->nr_reqtype);
2791 if (rqsz > NETMAP_REQ_MAXSIZE) {
2795 if ((rqsz && hdr->nr_body == (uintptr_t)NULL) ||
2796 (!rqsz && hdr->nr_body != (uintptr_t)NULL)) {
2797 /* Request body expected, but not found; or
2798 * request body found but unexpected. */
2803 bufsz = 2 * sizeof(void *) + rqsz;
2805 for (src = (struct nmreq_option *)(uintptr_t)hdr->nr_options; src;
2806 src = (struct nmreq_option *)(uintptr_t)buf.nro_next)
2808 error = copyin(src, &buf, sizeof(*src));
2811 optsz += sizeof(*src);
2812 optsz += nmreq_opt_size_by_type(buf.nro_reqtype);
2813 if (rqsz + optsz > NETMAP_REQ_MAXSIZE) {
2817 bufsz += optsz + sizeof(void *);
2820 ker = nm_os_malloc(bufsz);
2827 /* make a copy of the user pointers */
2828 ptrs = (uint64_t*)p;
2829 *ptrs++ = hdr->nr_body;
2830 *ptrs++ = hdr->nr_options;
2834 error = copyin((void *)(uintptr_t)hdr->nr_body, p, rqsz);
2837 /* overwrite the user pointer with the in-kernel one */
2838 hdr->nr_body = (uintptr_t)p;
2841 /* copy the options */
2842 next = (struct nmreq_option **)&hdr->nr_options;
2845 struct nmreq_option *opt;
2847 /* copy the option header */
2848 ptrs = (uint64_t *)p;
2849 opt = (struct nmreq_option *)(ptrs + 1);
2850 error = copyin(src, opt, sizeof(*src));
2853 /* make a copy of the user next pointer */
2854 *ptrs = opt->nro_next;
2855 /* overwrite the user pointer with the in-kernel one */
2858 /* initialize the option as not supported.
2859 * Recognized options will update this field.
2861 opt->nro_status = EOPNOTSUPP;
2863 p = (char *)(opt + 1);
2865 /* copy the option body */
2866 optsz = nmreq_opt_size_by_type(opt->nro_reqtype);
2868 /* the option body follows the option header */
2869 error = copyin(src + 1, p, optsz);
2875 /* move to next option */
2876 next = (struct nmreq_option **)&opt->nro_next;
2882 ptrs = (uint64_t *)ker;
2883 hdr->nr_body = *ptrs++;
2884 hdr->nr_options = *ptrs++;
2885 hdr->nr_reserved = 0;
2892 nmreq_copyout(struct nmreq_header *hdr, int rerror)
2894 struct nmreq_option *src, *dst;
2895 void *ker = (void *)(uintptr_t)hdr->nr_body, *bufstart;
2900 if (!hdr->nr_reserved)
2903 /* restore the user pointers in the header */
2904 ptrs = (uint64_t *)ker - 2;
2906 hdr->nr_body = *ptrs++;
2907 src = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
2908 hdr->nr_options = *ptrs;
2912 bodysz = nmreq_size_by_type(hdr->nr_reqtype);
2913 error = copyout(ker, (void *)(uintptr_t)hdr->nr_body, bodysz);
2920 /* copy the options */
2921 dst = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
2926 /* restore the user pointer */
2927 next = src->nro_next;
2928 ptrs = (uint64_t *)src - 1;
2929 src->nro_next = *ptrs;
2931 /* always copy the option header */
2932 error = copyout(src, dst, sizeof(*src));
2938 /* copy the option body only if there was no error */
2939 if (!rerror && !src->nro_status) {
2940 optsz = nmreq_opt_size_by_type(src->nro_reqtype);
2942 error = copyout(src + 1, dst + 1, optsz);
2949 src = (struct nmreq_option *)(uintptr_t)next;
2950 dst = (struct nmreq_option *)(uintptr_t)*ptrs;
2955 hdr->nr_reserved = 0;
2956 nm_os_free(bufstart);
2960 struct nmreq_option *
2961 nmreq_findoption(struct nmreq_option *opt, uint16_t reqtype)
2963 for ( ; opt; opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
2964 if (opt->nro_reqtype == reqtype)
2970 nmreq_checkduplicate(struct nmreq_option *opt) {
2971 uint16_t type = opt->nro_reqtype;
2974 while ((opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)opt->nro_next,
2977 opt->nro_status = EINVAL;
2979 return (dup ? EINVAL : 0);
2983 nmreq_checkoptions(struct nmreq_header *hdr)
2985 struct nmreq_option *opt;
2986 /* return error if there is still any option
2987 * marked as not supported
2990 for (opt = (struct nmreq_option *)(uintptr_t)hdr->nr_options; opt;
2991 opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
2992 if (opt->nro_status == EOPNOTSUPP)
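/*
 * Example (userspace, illustrative; requires a kernel built with
 * WITH_EXTMEM): chain an option to a request header before issuing
 * NIOCCTRL, then check nro_status on return:
 *
 *	struct nmreq_opt_extmem e;
 *
 *	bzero(&e, sizeof(e));
 *	e.nro_opt.nro_reqtype = NETMAP_REQ_OPT_EXTMEM;
 *	e.nro_usrptr = (uintptr_t)my_region;	// hypothetical mapping
 *	e.nro_opt.nro_next = hdr.nr_options;
 *	hdr.nr_options = (uintptr_t)&e.nro_opt;
 *	ioctl(fd, NIOCCTRL, &hdr);
 *	if (e.nro_opt.nro_status != 0)
 *		... the option was rejected or not recognized ...
 */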
2999 * select(2) and poll(2) handlers for the "netmap" device.
3001 * Can be called for one or more queues.
3002 * Return the event mask corresponding to ready events.
3003 * If there are no ready events, do a selrecord on either individual
3004 * selinfo or on the global one.
3005 * Device-dependent parts (locking and sync of tx/rx rings)
3006 * are done through callbacks.
3008 * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
3009 * The first one is remapped to pwait as selrecord() uses the name as a
3010 * hidden argument.
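/*
 * Example (userspace, illustrative; first_rx/last_rx are the ring
 * indexes the descriptor was bound to): wait for RX traffic, then
 * drain the ready rings:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLIN)) {
 *		for (i = first_rx; i < last_rx; i++) {
 *			struct netmap_ring *ring = NETMAP_RXRING(nifp, i);
 *
 *			while (!nm_ring_empty(ring)) {
 *				... consume ring->slot[ring->cur] ...
 *				ring->head = ring->cur =
 *					nm_ring_next(ring, ring->cur);
 *			}
 *		}
 *	}
 */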
3013 netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
3015 struct netmap_adapter *na;
3016 struct netmap_kring *kring;
3017 struct netmap_ring *ring;
3018 u_int i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
3019 #define want_tx want[NR_TX]
3020 #define want_rx want[NR_RX]
3021 struct mbq q; /* packets from RX hw queues to host stack */
3024 * In order to avoid nested locks, we need to "double check"
3025 * txsync and rxsync if we decide to do a selrecord().
3026 * retry_tx (and retry_rx, later) prevent looping forever.
3028 int retry_tx = 1, retry_rx = 1;
3030 /* Transparent mode: send_down is 1 if we have found some
3031 * packets to forward (host RX ring --> NIC) during the rx
3032 * scan and we have not sent them down to the NIC yet.
3033 * Transparent mode requires binding all rings to a single
3034 * file descriptor.
3037 int sync_flags = priv->np_sync_flags;
3041 if (priv->np_nifp == NULL) {
3042 D("No if registered");
3045 mb(); /* make sure following reads are not from cache */
3049 if (!nm_netmap_on(na))
3052 if (netmap_verbose & 0x8000)
3053 D("device %s events 0x%x", na->name, events);
3054 want_tx = events & (POLLOUT | POLLWRNORM);
3055 want_rx = events & (POLLIN | POLLRDNORM);
3058 * check_all_{tx|rx} are set if the card has more than one queue AND
3059 * the file descriptor is bound to all of them. If so, we sleep on
3060 * the "global" selinfo, otherwise we sleep on individual selinfo
3061 * (FreeBSD only allows two selinfo's per file descriptor).
3062 * The interrupt routine in the driver wakes one or the other
3063 * (or both) depending on which clients are active.
3065 * rxsync() is only called if we run out of buffers on a POLLIN.
3066 * txsync() is called if we run out of buffers on POLLOUT, or
3067 * there are pending packets to send. The latter can be disabled
3068 * passing NR_NO_TX_POLL at registration time.
3070 check_all_tx = nm_si_user(priv, NR_TX);
3071 check_all_rx = nm_si_user(priv, NR_RX);
3075 * We start with a lock free round which is cheap if we have
3076 * slots available. If this fails, then lock and call the sync
3077 * routines. We can't do this on Linux, as the contract says
3078 * that we must call nm_os_selrecord() unconditionally.
3081 enum txrx t = NR_TX;
3082 for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) {
3083 kring = NMR(na, t)[i];
3084 /* XXX compare ring->cur and kring->tail */
3085 if (!nm_ring_empty(kring->ring)) {
3087 want[t] = 0; /* also breaks the loop */
3092 enum txrx t = NR_RX;
3093 want_rx = 0; /* look for a reason to run the handlers */
3094 for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3095 kring = NMR(na, t)[i];
3096 if (kring->ring->cur == kring->ring->tail /* try fetch new buffers */
3097 || kring->rhead != kring->ring->head /* release buffers */) {
3102 revents |= events & (POLLIN | POLLRDNORM); /* we have data */
3107 /* The selrecord must be unconditional on linux. */
3108 nm_os_selrecord(sr, check_all_tx ?
3109 &na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]]->si);
3110 nm_os_selrecord(sr, check_all_rx ?
3111 &na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]]->si);
3115 * If we want to push packets out (priv->np_txpoll) or
3116 * want_tx is still set, we must issue txsync calls
3117 * (on all rings, to prevent the tx rings from stalling).
3118 * Fortunately, normal tx mode has np_txpoll set.
3120 if (priv->np_txpoll || want_tx) {
3122 * The first round checks if anyone is ready, if not
3123 * do a selrecord and another round to handle races.
3124 * want_tx goes to 0 if any space is found, and is
3125 * used to skip rings with no pending transmissions.
3128 for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
3131 kring = na->tx_rings[i];
3135 * Don't try to txsync this TX ring if we already found some
3136 * space in some of the TX rings (want_tx == 0) and there are no
3137 * TX slots in this ring that need to be flushed to the NIC
3140 if (!send_down && !want_tx && ring->head == kring->nr_hwcur)
3143 if (nm_kr_tryget(kring, 1, &revents))
3146 if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3147 netmap_ring_reinit(kring);
3150 if (kring->nm_sync(kring, sync_flags))
3153 nm_sync_finalize(kring);
3157 * If we found new slots, notify potential
3158 * listeners on the same ring.
3159 * Since we just did a txsync, look at the copies
3160 * of cur,tail in the kring.
3162 found = kring->rcur != kring->rtail;
3164 if (found) { /* notify other listeners */
3168 kring->nm_notify(kring, 0);
3172 /* if there were any packets to forward, we must have handled them by now */
3174 if (want_tx && retry_tx && sr) {
3176 nm_os_selrecord(sr, check_all_tx ?
3177 &na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]]->si);
3185 * If want_rx is still set, scan the receive rings.
3186 * Do it on all rings because otherwise we starve.
3189 /* two rounds here for race avoidance */
3191 for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
3194 kring = na->rx_rings[i];
3197 if (unlikely(nm_kr_tryget(kring, 1, &revents)))
3200 if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3201 netmap_ring_reinit(kring);
3204 /* now we can use kring->rcur, rtail */
3207 * transparent mode support: collect packets from
3208 * hw rxring(s) that have been released by the user
3210 if (nm_may_forward_up(kring)) {
3211 netmap_grab_packets(kring, &q, netmap_fwd);
3214 /* Clear the NR_FORWARD flag anyway, it may be set by
3215 * the nm_sync() below only for the host RX ring (see
3216 * netmap_rxsync_from_host()). */
3217 kring->nr_kflags &= ~NR_FORWARD;
3218 if (kring->nm_sync(kring, sync_flags))
3221 nm_sync_finalize(kring);
3222 send_down |= (kring->nr_kflags & NR_FORWARD);
3223 ring_timestamp_set(ring);
3224 found = kring->rcur != kring->rtail;
3230 kring->nm_notify(kring, 0);
3236 if (retry_rx && sr) {
3237 nm_os_selrecord(sr, check_all_rx ?
3238 &na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]]->si);
3241 if (send_down || retry_rx) {
3244 goto flush_tx; /* and retry_rx */
3251 * Transparent mode: released bufs (i.e. between kring->nr_hwcur and
3252 * ring->head) marked with NS_FORWARD on hw rx rings are passed up
3253 * to the host stack.
3257 netmap_send_up(na->ifp, &q);
3266 nma_intr_enable(struct netmap_adapter *na, int onoff)
3268 bool changed = false;
3273 for (i = 0; i < nma_get_nrings(na, t); i++) {
3274 struct netmap_kring *kring = NMR(na, t)[i];
3275 int on = !(kring->nr_kflags & NKR_NOINTR);
3277 if (!!onoff != !!on) {
3281 kring->nr_kflags &= ~NKR_NOINTR;
3283 kring->nr_kflags |= NKR_NOINTR;
3289 return 0; /* nothing to do */
3293 D("Cannot %s interrupts for %s", onoff ? "enable" : "disable",
3298 na->nm_intr(na, onoff);
3304 /*-------------------- driver support routines -------------------*/
3306 /* default notify callback */
3308 netmap_notify(struct netmap_kring *kring, int flags)
3310 struct netmap_adapter *na = kring->notify_na;
3311 enum txrx t = kring->tx;
3313 nm_os_selwakeup(&kring->si);
3314 /* optimization: avoid a wake up on the global
3315 * queue if nobody has registered for more
3318 if (na->si_users[t] > 0)
3319 nm_os_selwakeup(&na->si[t]);
3321 return NM_IRQ_COMPLETED;
3324 /* called by all routines that create netmap_adapters.
3325 * provide some defaults and get a reference to the
3326 * memory allocator.
3329 netmap_attach_common(struct netmap_adapter *na)
3331 if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
3332 D("%s: invalid rings tx %d rx %d",
3333 na->name, na->num_tx_rings, na->num_rx_rings);
3337 if (!na->rx_buf_maxsize) {
3338 /* Set a conservative default (larger is safer). */
3339 na->rx_buf_maxsize = PAGE_SIZE;
3343 if (na->na_flags & NAF_HOST_RINGS && na->ifp) {
3344 na->if_input = na->ifp->if_input; /* for netmap_send_up */
3346 na->pdev = na; /* make sure netmap_mem_map() is called */
3347 #endif /* __FreeBSD__ */
3348 if (na->na_flags & NAF_HOST_RINGS) {
3349 if (na->num_host_rx_rings == 0)
3350 na->num_host_rx_rings = 1;
3351 if (na->num_host_tx_rings == 0)
3352 na->num_host_tx_rings = 1;
3354 if (na->nm_krings_create == NULL) {
3355 /* we assume that we have been called by a driver,
3356 * since other port types all provide their own
3357 * nm_krings_create methods.
3359 na->nm_krings_create = netmap_hw_krings_create;
3360 na->nm_krings_delete = netmap_hw_krings_delete;
3362 if (na->nm_notify == NULL)
3363 na->nm_notify = netmap_notify;
3366 if (na->nm_mem == NULL) {
3367 /* use the global allocator */
3368 na->nm_mem = netmap_mem_get(&nm_mem);
3371 if (na->nm_bdg_attach == NULL)
3372 /* no special nm_bdg_attach callback. On VALE
3373 * attach, we need to interpose a bwrap
3375 na->nm_bdg_attach = netmap_default_bdg_attach;
3381 /* Wrapper for the register callback provided by netmap-enabled
3382 * hardware drivers.
3383 * nm_iszombie(na) means that the driver module has been
3384 * unloaded, so we cannot call into it.
3385 * nm_os_ifnet_lock() must guarantee mutual exclusion with
3389 netmap_hw_reg(struct netmap_adapter *na, int onoff)
3391 struct netmap_hw_adapter *hwna =
3392 (struct netmap_hw_adapter*)na;
3397 if (nm_iszombie(na)) {
3400 } else if (na != NULL) {
3401 na->na_flags &= ~NAF_NETMAP_ON;
3406 error = hwna->nm_hw_register(na, onoff);
3409 nm_os_ifnet_unlock();
3415 netmap_hw_dtor(struct netmap_adapter *na)
3417 if (na->ifp == NULL)
3420 NM_DETACH_NA(na->ifp);
3425 * Allocate a netmap_adapter object, and initialize it from the
3426 * 'arg' passed by the driver on attach.
3427 * We allocate a block of memory of 'size' bytes, which has room
3428 * for struct netmap_adapter plus additional room private to
3429 * the caller.
3430 * Return 0 on success, ENOMEM otherwise.
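/*
 * Example (typical driver attach sketch, illustrative; "sc" and the
 * foo_* names are hypothetical):
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = sc->ifp;
 *	na.num_tx_desc = sc->num_tx_desc;
 *	na.num_rx_desc = sc->num_rx_desc;
 *	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	na.nm_register = foo_netmap_reg;
 *	netmap_attach(&na);
 */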
3433 netmap_attach_ext(struct netmap_adapter *arg, size_t size, int override_reg)
3435 struct netmap_hw_adapter *hwna = NULL;
3436 struct ifnet *ifp = NULL;
3438 if (size < sizeof(struct netmap_hw_adapter)) {
3439 D("Invalid netmap adapter size %d", (int)size);
3443 if (arg == NULL || arg->ifp == NULL)
3447 if (NM_NA_CLASH(ifp)) {
3448 /* If NA(ifp) is not null but there is no valid netmap
3449 * adapter it means that someone else is using the same
3450 * pointer (e.g. ax25_ptr on linux). This happens for
3451 * instance when PF_RING is also in use. */
3452 D("Error: netmap adapter hook is busy");
3456 hwna = nm_os_malloc(size);
3460 hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
3461 strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
3463 hwna->nm_hw_register = hwna->up.nm_register;
3464 hwna->up.nm_register = netmap_hw_reg;
3466 if (netmap_attach_common(&hwna->up)) {
3470 netmap_adapter_get(&hwna->up);
3472 NM_ATTACH_NA(ifp, &hwna->up);
3474 nm_os_onattach(ifp);
3476 if (arg->nm_dtor == NULL) {
3477 hwna->up.nm_dtor = netmap_hw_dtor;
3480 if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
3481 hwna->up.num_tx_rings, hwna->up.num_tx_desc,
3482 hwna->up.num_rx_rings, hwna->up.num_rx_desc);
3486 D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
3487 return (hwna ? EINVAL : ENOMEM);
3492 netmap_attach(struct netmap_adapter *arg)
3494 return netmap_attach_ext(arg, sizeof(struct netmap_hw_adapter),
3495 1 /* override nm_reg */);
3500 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
3506 refcount_acquire(&na->na_refcount);
3510 /* returns 1 iff the netmap_adapter is destroyed */
3512 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
3517 if (!refcount_release(&na->na_refcount))
3523 if (na->tx_rings) { /* XXX should not happen */
3524 D("freeing leftover tx_rings");
3525 na->nm_krings_delete(na);
3527 netmap_pipe_dealloc(na);
3529 netmap_mem_put(na->nm_mem);
3530 bzero(na, sizeof(*na));
3536 /* nm_krings_create callback for all hardware native adapters */
3538 netmap_hw_krings_create(struct netmap_adapter *na)
3540 int ret = netmap_krings_create(na, 0);
3542 /* initialize the mbq for the sw rx ring */
3543 u_int lim = netmap_real_rings(na, NR_RX), i;
3544 for (i = na->num_rx_rings; i < lim; i++) {
3545 mbq_safe_init(&NMR(na, NR_RX)[i]->rx_queue);
3547 ND("initialized sw rx queue %d", na->num_rx_rings);
3555 * Called on module unload by the netmap-enabled drivers
3558 netmap_detach(struct ifnet *ifp)
3560 struct netmap_adapter *na = NA(ifp);
3566 netmap_set_all_rings(na, NM_KR_LOCKED);
3568 * if the netmap adapter is not native, somebody
3569 * changed it, so we cannot release it here.
3570 * The NAF_ZOMBIE flag will notify the new owner that
3571 * the driver is gone.
3573 if (!(na->na_flags & NAF_NATIVE) || !netmap_adapter_put(na)) {
3574 na->na_flags |= NAF_ZOMBIE;
3576 /* give active users a chance to notice that NAF_ZOMBIE has been
3577 * turned on, so that they can stop and return an error to userspace.
3578 * Note that this becomes a NOP if there are no active users and,
3579 * therefore, the put() above has deleted the na, since now NA(ifp) is
3580 * NULL.
3582 netmap_enable_all_rings(ifp);
3588 * Intercept packets from the network stack and pass them
3589 * to netmap as incoming packets on the 'software' ring.
3591 * We only store packets in a bounded mbq and then copy them
3592 * in the relevant rxsync routine.
3594 * We rely on the OS to make sure that the ifp and na do not go
3595 * away (typically the caller checks for IFF_DRV_RUNNING or the like).
3596 * In nm_register() or whenever there is a reinitialization,
3597 * we make sure to make the mode change visible here.
3600 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
3602 struct netmap_adapter *na = NA(ifp);
3603 struct netmap_kring *kring, *tx_kring;
3604 u_int len = MBUF_LEN(m);
3605 u_int error = ENOBUFS;
3612 if (i >= na->num_host_rx_rings) {
3613 i = i % na->num_host_rx_rings;
3615 kring = NMR(na, NR_RX)[nma_get_nrings(na, NR_RX) + i];
3617 // XXX [Linux] we do not need this lock
3618 // if we follow the down/configure/up protocol -gl
3619 // mtx_lock(&na->core_lock);
3621 if (!nm_netmap_on(na)) {
3622 D("%s not in netmap mode anymore", na->name);
3628 if (txr >= na->num_tx_rings) {
3629 txr %= na->num_tx_rings;
3631 tx_kring = NMR(na, NR_TX)[txr];
3633 if (tx_kring->nr_mode == NKR_NETMAP_OFF) {
3634 return MBUF_TRANSMIT(na, ifp, m);
3637 q = &kring->rx_queue;
3639 // XXX reconsider long packets if we handle fragments
3640 if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
3641 D("%s from_host, drop packet size %d > %d", na->name,
3642 len, NETMAP_BUF_SIZE(na));
3646 if (!netmap_generic_hwcsum) {
3647 if (nm_os_mbuf_has_csum_offld(m)) {
3648 RD(1, "%s drop mbuf that needs checksum offload", na->name);
3653 if (nm_os_mbuf_has_seg_offld(m)) {
3654 RD(1, "%s drop mbuf that needs generic segmentation offload", na->name);
3658 /* protect against netmap_rxsync_from_host(), netmap_sw_to_nic()
3659 * and maybe other instances of netmap_transmit (the latter
3660 * not possible on Linux).
3661 * We enqueue the mbuf only if we are sure there is going to be
3662 * enough room in the host RX ring, otherwise we drop it.
3666 busy = kring->nr_hwtail - kring->nr_hwcur;
3668 busy += kring->nkr_num_slots;
3669 if (busy + mbq_len(q) >= kring->nkr_num_slots - 1) {
3670 RD(2, "%s full hwcur %d hwtail %d qlen %d", na->name,
3671 kring->nr_hwcur, kring->nr_hwtail, mbq_len(q));
3674 ND(2, "%s %d bufs in queue", na->name, mbq_len(q));
3675 /* notify outside the lock */
3684 /* unconditionally wake up listeners */
3685 kring->nm_notify(kring, 0);
3686 /* this is normally netmap_notify(), but for nics
3687 * connected to a bridge it is netmap_bwrap_intr_notify(),
3688 * that possibly forwards the frames through the switch
3696 * netmap_reset() is called by the driver routines when reinitializing
3697 * a ring. The driver is in charge of locking to protect the kring.
3698 * If native netmap mode is not set just return NULL.
3699 * If native netmap mode is set, in particular, we have to set nr_mode to
3700 * NKR_NETMAP_ON.
3702 struct netmap_slot *
3703 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
3706 struct netmap_kring *kring;
3709 if (!nm_native_on(na)) {
3710 ND("interface not in native netmap mode");
3711 return NULL; /* nothing to reinitialize */
3714 /* XXX note- in the new scheme, we are not guaranteed to be
3715 * under lock (e.g. when called on a device reset).
3716 * In this case, we should set a flag and not trust the
3717 * values too much. In practice: TODO
3718 * - set a RESET flag somewhere in the kring
3719 * - do the processing in a conservative way
3720 * - let the *sync() fixup at the end.
3723 if (n >= na->num_tx_rings)
3726 kring = na->tx_rings[n];
3728 if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3729 kring->nr_mode = NKR_NETMAP_OFF;
3733 // XXX check whether we should use hwcur or rcur
3734 new_hwofs = kring->nr_hwcur - new_cur;
3736 if (n >= na->num_rx_rings)
3738 kring = na->rx_rings[n];
3740 if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3741 kring->nr_mode = NKR_NETMAP_OFF;
3745 new_hwofs = kring->nr_hwtail - new_cur;
3747 lim = kring->nkr_num_slots - 1;
3748 if (new_hwofs > lim)
3749 new_hwofs -= lim + 1;
3751 /* Always set the new offset value and realign the ring. */
3753 D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
3754 na->name,
3755 tx == NR_TX ? "TX" : "RX", n,
3756 kring->nkr_hwofs, new_hwofs,
3757 kring->nr_hwtail,
3758 tx == NR_TX ? lim : kring->nr_hwtail);
3759 kring->nkr_hwofs = new_hwofs;
3761 kring->nr_hwtail = kring->nr_hwcur + lim;
3762 if (kring->nr_hwtail > lim)
3763 kring->nr_hwtail -= lim + 1;
3767 * Wakeup on the individual and global selwait.
3768 * We do the wakeup here, but the ring is not yet reconfigured.
3769 * However, we are under lock so there are no races.
3771 kring->nr_mode = NKR_NETMAP_ON;
3772 kring->nm_notify(kring, 0);
3773 return kring->ring->slot;
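/*
 * Example (typical driver use on ring reinit, illustrative; "txr"
 * is a hypothetical per-ring driver structure):
 *
 *	slot = netmap_reset(na, NR_TX, txr->me, 0);
 *	if (slot != NULL) {
 *		// the ring is in netmap mode: program the NIC
 *		// descriptors with the netmap buffers, e.g.
 *		// addr = PNMB(na, slot + nm_i, &paddr);
 *	}
 */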
3778 * Dispatch rx/tx interrupts to the netmap rings.
3780 * "work_done" is non-null on the RX path, NULL for the TX path.
3781 * We rely on the OS to make sure that there is only one active
3782 * instance per queue, and that there is appropriate locking.
3784 * The 'notify' routine depends on what the ring is attached to.
3785 * - for a netmap file descriptor, do a selwakeup on the individual
3786 * waitqueue, plus one on the global one if needed
3787 * (see netmap_notify)
3788 * - for a nic connected to a switch, call the proper forwarding routine
3789 * (see netmap_bwrap_intr_notify)
3792 netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
3794 struct netmap_kring *kring;
3795 enum txrx t = (work_done ? NR_RX : NR_TX);
3797 q &= NETMAP_RING_MASK;
3799 if (netmap_verbose) {
3800 RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
3803 if (q >= nma_get_nrings(na, t))
3804 return NM_IRQ_PASS; // not a physical queue
3806 kring = NMR(na, t)[q];
3808 if (kring->nr_mode == NKR_NETMAP_OFF) {
3813 kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ?
3814 *work_done = 1; /* do not fire napi again */
3817 return kring->nm_notify(kring, 0);
3822 * Default functions to handle rx/tx interrupts from a physical device.
3823 * "work_done" is non-null on the RX path, NULL for the TX path.
3825 * If the card is not in netmap mode, simply return NM_IRQ_PASS,
3826 * so that the caller proceeds with regular processing.
3827 * Otherwise call netmap_common_irq().
3829 * If the card is connected to a netmap file descriptor,
3830 * do a selwakeup on the individual queue, plus one on the global one
3831 * if needed (multiqueue card _and_ there are multiqueue listeners),
3832 * and return NM_IRQ_COMPLETED.
3834 * Finally, if called on rx from an interface connected to a switch,
3835 * calls the proper forwarding routine.
3838 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3840 struct netmap_adapter *na = NA(ifp);
3843 * XXX emulated netmap mode sets NAF_SKIP_INTR so
3844 * we still use the regular driver even though the previous
3845 * check fails. It is unclear whether we should use
3846 * nm_native_on() here.
3848 if (!nm_netmap_on(na))
3851 if (na->na_flags & NAF_SKIP_INTR) {
3852 ND("use regular interrupt");
3856 return netmap_common_irq(na, q, work_done);
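/*
 * Example (typical driver RX interrupt path, illustrative; "rxr" is
 * a hypothetical per-ring driver structure):
 *
 *	if (netmap_rx_irq(ifp, rxr->me, &work_done) != NM_IRQ_PASS)
 *		return;		// handled by netmap, skip regular path
 */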
3859 /* set/clear native flags and if_transmit/netdev_ops */
3861 nm_set_native_flags(struct netmap_adapter *na)
3863 struct ifnet *ifp = na->ifp;
3865 /* We do the setup for intercepting packets only if we are the
3866 * first user of this adapter. */
3867 if (na->active_fds > 0) {
3871 na->na_flags |= NAF_NETMAP_ON;
3873 nm_update_hostrings_mode(na);
3877 nm_clear_native_flags(struct netmap_adapter *na)
3879 struct ifnet *ifp = na->ifp;
3881 /* We undo the setup for intercepting packets only if we are the
3882 * last user of this adapter. */
3883 if (na->active_fds > 0) {
3887 nm_update_hostrings_mode(na);
3890 na->na_flags &= ~NAF_NETMAP_ON;
3895 * Module loader and unloader
3897 * netmap_init() creates the /dev/netmap device and initializes
3898 * all global variables. Returns 0 on success, errno on failure
3899 * (but in practice it cannot fail).
3901 * netmap_fini() destroys everything.
3904 static struct cdev *netmap_dev; /* /dev/netmap character device. */
3905 extern struct cdevsw netmap_cdevsw;
3912 destroy_dev(netmap_dev);
3913 /* we assume that there are no netmap users left */
3915 netmap_uninit_bridges();
3918 nm_prinf("netmap: unloaded module.\n");
3929 error = netmap_mem_init();
3933 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
3934 * when the module is compiled in.
3935 * XXX could use make_dev_credv() to get error number
3937 netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
3938 &netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
3943 error = netmap_init_bridges();
3948 nm_os_vi_init_index();
3951 error = nm_os_ifnet_init();
3955 nm_prinf("netmap: loaded module\n");
3959 return (EINVAL); /* may be incorrect */