From 046b863e6b3832b6ce026fbc387baad3ca4bed14 Mon Sep 17 00:00:00 2001 From: luigi Date: Tue, 18 Feb 2014 05:58:36 +0000 Subject: [PATCH] MFH: sync the netmap code with the one in HEAD (enhanced VALE switch, netmap pipes, emulated netmap mode). See details in the log for svn 261909. git-svn-id: svn://svn.freebsd.org/base/stable/9@262153 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f --- share/man/man4/netmap.4 | 1144 ++++++-- sys/conf/files | 6 + sys/dev/e1000/if_em.c | 2 +- sys/dev/e1000/if_igb.c | 4 +- sys/dev/e1000/if_lem.c | 2 +- sys/dev/ixgbe/ixgbe.c | 2 +- sys/dev/netmap/if_em_netmap.h | 250 +- sys/dev/netmap/if_igb_netmap.h | 289 +- sys/dev/netmap/if_lem_netmap.h | 284 +- sys/dev/netmap/if_re_netmap.h | 339 +-- sys/dev/netmap/ixgbe_netmap.h | 469 ++- sys/dev/netmap/netmap.c | 4166 +++++++++------------------ sys/dev/netmap/netmap_freebsd.c | 655 +++++ sys/dev/netmap/netmap_generic.c | 806 ++++++ sys/dev/netmap/netmap_kern.h | 1066 +++++-- sys/dev/netmap/netmap_mbq.c | 163 ++ sys/dev/netmap/netmap_mbq.h | 78 + sys/dev/netmap/netmap_mem2.c | 639 ++-- sys/dev/netmap/netmap_mem2.h | 29 +- sys/dev/netmap/netmap_offloadings.c | 401 +++ sys/dev/netmap/netmap_pipe.c | 711 +++++ sys/dev/netmap/netmap_vale.c | 2103 ++++++++++++++ sys/modules/netmap/Makefile | 10 +- sys/net/netmap.h | 576 ++-- sys/net/netmap_user.h | 658 ++++- tools/tools/netmap/Makefile | 28 +- tools/tools/netmap/README | 17 +- tools/tools/netmap/bridge.c | 203 +- tools/tools/netmap/click-test.cfg | 19 - tools/tools/netmap/nm_util.c | 244 -- tools/tools/netmap/nm_util.h | 183 -- tools/tools/netmap/pcap.c | 654 ----- tools/tools/netmap/pkt-gen.c | 798 ++--- tools/tools/netmap/vale-ctl.c | 43 +- 34 files changed, 10890 insertions(+), 6151 deletions(-) create mode 100644 sys/dev/netmap/netmap_freebsd.c create mode 100644 sys/dev/netmap/netmap_generic.c create mode 100644 sys/dev/netmap/netmap_mbq.c create mode 100644 sys/dev/netmap/netmap_mbq.h create mode 100644 sys/dev/netmap/netmap_offloadings.c create mode 100644 sys/dev/netmap/netmap_pipe.c create mode 100644 sys/dev/netmap/netmap_vale.c delete mode 100644 tools/tools/netmap/click-test.cfg delete mode 100644 tools/tools/netmap/nm_util.c delete mode 100644 tools/tools/netmap/nm_util.h delete mode 100644 tools/tools/netmap/pcap.c diff --git a/share/man/man4/netmap.4 b/share/man/man4/netmap.4 index ca82b59c5..1b2dc7a23 100644 --- a/share/man/man4/netmap.4 +++ b/share/man/man4/netmap.4 @@ -1,4 +1,4 @@ -.\" Copyright (c) 2011 Matteo Landi, Luigi Rizzo, Universita` di Pisa +.\" Copyright (c) 2011-2014 Matteo Landi, Luigi Rizzo, Universita` di Pisa .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without @@ -26,219 +26,631 @@ .\" distributed with 4.3BSD Unix. .\" .\" $FreeBSD$ -.\" $Id: netmap.4 9662 2011-11-16 13:18:06Z luigi $: stable/8/share/man/man4/bpf.4 181694 2008-08-13 17:45:06Z ed $ .\" -.Dd November 16, 2011 +.Dd February 13, 2014 .Dt NETMAP 4 .Os .Sh NAME .Nm netmap .Nd a framework for fast packet I/O +.br +.Nm VALE +.Nd a fast VirtuAl Local Ethernet using the netmap API +.br +.Nm netmap pipes +.Nd a shared memory packet transport channel .Sh SYNOPSIS .Cd device netmap .Sh DESCRIPTION .Nm -is a framework for fast and safe access to network devices -(reaching 14.88 Mpps at less than 1 GHz). +is a framework for extremely fast and efficient packet I/O +for both userspace and kernel clients. 
+It runs on FreeBSD and Linux, +and includes +.Nm VALE , +a very fast and modular in-kernel software switch/dataplane, +and +.Nm netmap pipes , +a shared memory packet transport channel. +All these are accessed interchangeably with the same API. +.Pp +.Nm , VALE +and +.Nm netmap pipes +are at least one order of magnitude faster than +standard OS mechanisms +(sockets, bpf, tun/tap interfaces, native switches, pipes), +reaching 14.88 million packets per second (Mpps) +with much less than one core on a 10 Gbit NIC, +about 20 Mpps per core for VALE ports, +and over 100 Mpps for netmap pipes. +.Pp +Userspace clients can dynamically switch NICs into +.Nm +mode and send and receive raw packets through +memory mapped buffers. +Similarly, +.Nm VALE +switch instances and ports, and +.Nm netmap pipes +can be created dynamically, +providing high speed packet I/O between processes, +virtual machines, NICs and the host stack. +.Pp .Nm -uses memory mapped buffers and metadata -(buffer indexes and lengths) to communicate with the kernel, -which is in charge of validating information through -.Pa ioctl() +suports both non-blocking I/O through +.Xr ioctls() , +synchronization and blocking I/O through a file descriptor +and standard OS mechanisms such as +.Xr select 2 , +.Xr poll 2 , +.Xr epoll 2 , +.Xr kqueue 2 . +.Nm VALE and -.Pa select()/poll() . +.Nm netmap pipes +are implemented by a single kernel module, which also emulates the +.Nm +API over standard drivers for devices without native +.Nm +support. +For best performance, .Nm -can exploit the parallelism in multiqueue devices and -multicore systems. +requires explicit support in device drivers. .Pp +In the rest of this (long) manual page we document +various aspects of the +.Nm +and +.Nm VALE +architecture, features and usage. .Pp +.Sh ARCHITECTURE .Nm -requires explicit support in device drivers. -For a list of supported devices, see the end of this manual page. -.Sh OPERATION +supports raw packet I/O through a +.Em port , +which can be connected to a physical interface +.Em ( NIC ) , +to the host stack, +or to a +.Nm VALE +switch). +Ports use preallocated circular queues of buffers +.Em ( rings ) +residing in an mmapped region. +There is one ring for each transmit/receive queue of a +NIC or virtual port. +An additional ring pair connects to the host stack. +.Pp +After binding a file descriptor to a port, a +.Nm +client can send or receive packets in batches through +the rings, and possibly implement zero-copy forwarding +between ports. +.Pp +All NICs operating in +.Nm +mode use the same memory region, +accessible to all processes who own +.Nm /dev/netmap +file descriptors bound to NICs. +Independent +.Nm VALE +and +.Nm netmap pipe +ports +by default use separate memory regions, +but can be independently configured to share memory. +.Pp +.Sh ENTERING AND EXITING NETMAP MODE +The following section describes the system calls to create +and control +.Nm netmap +ports (including +.Nm VALE +and +.Nm netmap pipe +ports). +Simpler, higher level functions are described in section +.Xr LIBRARIES . +.Pp +Ports and rings are created and controlled through a file descriptor, +created by opening a special device +.Dl fd = open("/dev/netmap"); +and then bound to a specific port with an +.Dl ioctl(fd, NIOCREGIF, (struct nmreq *)arg); +.Pp .Nm -clients must first open the -.Pa open("/dev/netmap") , -and then issue an -.Pa ioctl(...,NIOCREGIF,...) -to bind the file descriptor to a network device. 
+has multiple modes of operation controlled by the +.Vt struct nmreq +argument. +.Va arg.nr_name +specifies the port name, as follows: +.Bl -tag -width XXXX +.It Dv OS network interface name (e.g. 'em0', 'eth1', ... ) +the data path of the NIC is disconnected from the host stack, +and the file descriptor is bound to the NIC (one or all queues), +or to the host stack; +.It Dv valeXXX:YYY (arbitrary XXX and YYY) +the file descriptor is bound to port YYY of a VALE switch called XXX, +both dynamically created if necessary. +The string cannot exceed IFNAMSIZ characters, and YYY cannot +be the name of any existing OS network interface. +.El .Pp -When a device is put in +On return, +.Va arg +indicates the size of the shared memory region, +and the number, size and location of all the .Nm -mode, its data path is disconnected from the host stack. -The processes owning the file descriptor -can exchange packets with the device, or with the host stack, -through an mmapped memory region that contains pre-allocated -buffers and metadata. +data structures, which can be accessed by mmapping the memory +.Dl char *mem = mmap(0, arg.nr_memsize, fd); .Pp Non blocking I/O is done with special -.Pa ioctl()'s , -whereas the file descriptor can be passed to -.Pa select()/poll() -to be notified about incoming packet or available transmit buffers. -.Ss Data structures -All data structures for all devices in -.Nm -mode are in a memory -region shared by the kernel and all processes -who open -.Pa /dev/netmap -(NOTE: visibility may be restricted in future implementations). -All references between the shared data structure -are relative (offsets or indexes). Some macros help converting -them into actual pointers. +.Xr ioctl 2 +.Xr select 2 +and +.Xr poll 2 +on the file descriptor permit blocking I/O. +.Xr epoll 2 +and +.Xr kqueue 2 +are not supported on +.Nm +file descriptors. .Pp -The data structures in shared memory are the following: +While a NIC is in +.Nm +mode, the OS will still believe the interface is up and running. +OS-generated packets for that NIC end up into a +.Nm +ring, and another ring is used to send packets into the OS network stack. +A +.Xr close 2 +on the file descriptor removes the binding, +and returns the NIC to normal mode (reconnecting the data path +to the host stack), or destroys the virtual port. .Pp +.Sh DATA STRUCTURES +The data structures in the mmapped memory region are detailed in +.Xr sys/net/netmap.h , +which is the ultimate reference for the +.Nm +API. The main structures and fields are indicated below: .Bl -tag -width XXX .It Dv struct netmap_if (one per interface) -indicates the number of rings supported by an interface, their -sizes, and the offsets of the -.Pa netmap_rings -associated to the interface. -The offset of a -.Pa struct netmap_if -in the shared memory region is indicated by the -.Pa nr_offset -field in the structure returned by the -.Pa NIOCREGIF -(see below). .Bd -literal struct netmap_if { - char ni_name[IFNAMSIZ]; /* name of the interface. */ - const u_int ni_num_queues; /* number of hw ring pairs */ - const ssize_t ring_ofs[]; /* offset of tx and rx rings */ + ... + const uint32_t ni_flags; /* properties */ + ... + const uint32_t ni_tx_rings; /* NIC tx rings */ + const uint32_t ni_rx_rings; /* NIC rx rings */ + uint32_t ni_bufs_head; /* head of extra bufs list */ + ... }; .Ed +.Pp +Indicates the number of available rings +.Pa ( struct netmap_rings ) +and their position in the mmapped region. 
+The number of tx and rx rings
+.Pa ( ni_tx_rings , ni_rx_rings )
+normally depends on the hardware.
+NICs also have an extra tx/rx ring pair connected to the host stack.
+.Em NIOCREGIF
+can also request additional unbound buffers in the same memory space,
+to be used as temporary storage for packets.
+.Pa ni_bufs_head
+contains the index of the first of these free buffers,
+which are connected in a list (the first uint32_t of each
+buffer being the index of the next buffer in the list).
+A 0 indicates the end of the list.
+.Pp
 .It Dv struct netmap_ring (one per ring)
-contains the index of the current read or write slot (cur),
-the number of slots available for reception or transmission (avail),
-and an array of
-.Pa slots
-describing the buffers.
-There is one ring pair for each of the N hardware ring pairs
-supported by the card (numbered 0..N-1), plus
-one ring pair (numbered N) for packets from/to the host stack.
 .Bd -literal
 struct netmap_ring {
- const ssize_t buf_ofs;
- const uint32_t num_slots; /* number of slots in the ring. */
- uint32_t avail; /* number of usable slots */
- uint32_t cur; /* 'current' index for the user side */
-
- const uint16_t nr_buf_size;
- uint16_t flags;
- struct netmap_slot slot[0]; /* array of slots. */
+ ...
+ const uint32_t num_slots; /* slots in each ring */
+ const uint32_t nr_buf_size; /* size of each buffer */
+ ...
+ uint32_t head; /* (u) first buf owned by user */
+ uint32_t cur; /* (u) wakeup position */
+ const uint32_t tail; /* (k) first buf owned by kernel */
+ ...
+ uint32_t flags;
+ struct timeval ts; /* (k) time of last rxsync() */
+ ...
+ struct netmap_slot slot[0]; /* array of slots */
 }
 .Ed
-.It Dv struct netmap_slot (one per packet)
-contains the metadata for a packet: a buffer index (buf_idx),
-a buffer length (len), and some flags.
+.Pp
+Implements transmit and receive rings, with read/write
+pointers, metadata and an array of
+.Pa slots
+describing the buffers.
+.Pp
+.It Dv struct netmap_slot (one per buffer)
 .Bd -literal
 struct netmap_slot {
- uint32_t buf_idx; /* buffer index */
- uint16_t len; /* packet length */
- uint16_t flags; /* buf changed, etc. */
-#define NS_BUF_CHANGED 0x0001 /* must resync, buffer changed */
-#define NS_REPORT 0x0002 /* tell hw to report results
- * e.g. by generating an interrupt
- */
+ uint32_t buf_idx; /* buffer index */
+ uint16_t len; /* packet length */
+ uint16_t flags; /* buf changed, etc. */
+ uint64_t ptr; /* address for indirect buffers */
 };
 .Ed
+.Pp
+Describes a packet buffer, which is normally identified by
+an index and resides in the mmapped region.
 .It Dv packet buffers
-are fixed size (approximately 2k) buffers allocated by the kernel
-that contain packet data. Buffers addresses are computed through
-macros.
+Fixed size (normally 2 KB) packet buffers allocated by the kernel.
 .El
 .Pp
-Some macros support the access to objects in the shared memory
-region. In particular:
+The offset of the
+.Pa struct netmap_if
+in the mmapped region is indicated by the
+.Pa nr_offset
+field in the structure returned by
+.Pa NIOCREGIF .
+From there, all other objects are reachable through
+relative references (offsets or indexes).
+Macros and functions in +help converting them into actual pointers: +.Pp +.Dl struct netmap_if *nifp = NETMAP_IF(mem, arg.nr_offset); +.Dl struct netmap_ring *txr = NETMAP_TXRING(nifp, ring_index); +.Dl struct netmap_ring *rxr = NETMAP_RXRING(nifp, ring_index); +.Pp +.Dl char *buf = NETMAP_BUF(ring, buffer_index); +.Sh RINGS, BUFFERS AND DATA I/O +.Va Rings +are circular queues of packets with three indexes/pointers +.Va ( head , cur , tail ) ; +one slot is always kept empty. +The ring size +.Va ( num_slots ) +should not be assumed to be a power of two. +.br +(NOTE: older versions of netmap used head/count format to indicate +the content of a ring). +.Pp +.Va head +is the first slot available to userspace; +.br +.Va cur +is the wakeup point: +select/poll will unblock when +.Va tail +passes +.Va cur ; +.br +.Va tail +is the first slot reserved to the kernel. +.Pp +Slot indexes MUST only move forward; +for convenience, the function +.Dl nm_ring_next(ring, index) +returns the next index modulo the ring size. +.Pp +.Va head +and +.Va cur +are only modified by the user program; +.Va tail +is only modified by the kernel. +The kernel only reads/writes the +.Vt struct netmap_ring +slots and buffers +during the execution of a netmap-related system call. +The only exception are slots (and buffers) in the range +.Va tail\ . . . head-1 , +that are explicitly assigned to the kernel. +.Pp +.Ss TRANSMIT RINGS +On transmit rings, after a +.Nm +system call, slots in the range +.Va head\ . . . tail-1 +are available for transmission. +User code should fill the slots sequentially +and advance +.Va head +and +.Va cur +past slots ready to transmit. +.Va cur +may be moved further ahead if the user code needs +more slots before further transmissions (see +.Sx SCATTER GATHER I/O ) . +.Pp +At the next NIOCTXSYNC/select()/poll(), +slots up to +.Va head-1 +are pushed to the port, and +.Va tail +may advance if further slots have become available. +Below is an example of the evolution of a TX ring: +.Pp +.Bd -literal + after the syscall, slots between cur and tail are (a)vailable + head=cur tail + | | + v v + TX [.....aaaaaaaaaaa.............] + + user creates new packets to (T)ransmit + head=cur tail + | | + v v + TX [.....TTTTTaaaaaa.............] + + NIOCTXSYNC/poll()/select() sends packets and reports new slots + head=cur tail + | | + v v + TX [..........aaaaaaaaaaa........] +.Ed +.Pp +select() and poll() wlll block if there is no space in the ring, i.e. +.Dl ring->cur == ring->tail +and return when new slots have become available. +.Pp +High speed applications may want to amortize the cost of system calls +by preparing as many packets as possible before issuing them. +.Pp +A transmit ring with pending transmissions has +.Dl ring->head != ring->tail + 1 (modulo the ring size). +The function +.Va int nm_tx_pending(ring) +implements this test. +.Pp +.Ss RECEIVE RINGS +On receive rings, after a +.Nm +system call, the slots in the range +.Va head\& . . . tail-1 +contain received packets. +User code should process them and advance +.Va head +and +.Va cur +past slots it wants to return to the kernel. +.Va cur +may be moved further ahead if the user code wants to +wait for more packets +without returning all the previous slots to the kernel. +.Pp +At the next NIOCRXSYNC/select()/poll(), +slots up to +.Va head-1 +are returned to the kernel for further receives, and +.Va tail +may advance to report new incoming packets. 
+.br
+Below is an example of the evolution of an RX ring:
 .Bd -literal
-struct netmap_if *nifp;
-struct netmap_ring *txring = NETMAP_TXRING(nifp, i);
-struct netmap_ring *rxring = NETMAP_RXRING(nifp, i);
-int i = txring->slot[txring->cur].buf_idx;
-char *buf = NETMAP_BUF(txring, i);
+ after the syscall, there are some (h)eld and some (R)eceived slots
+      head   cur     tail
+        |     |       |
+        v     v       v
+ RX  [..hhhhhhRRRRRRRR..........]
+
+ user advances head and cur, releasing some slots and holding others
+           head cur  tail
+             |  |     |
+             v  v     v
+ RX  [..*****hhhRRRRRR...........]
+
+ NIOCRXSYNC/poll()/select() recovers slots and reports new packets
+           head cur        tail
+             |  |           |
+             v  v           v
+ RX  [.......hhhRRRRRRRRRRRR....]
 .Ed
-.Ss IOCTLS
 .Pp
+.Sh SLOTS AND PACKET BUFFERS
+Normally, packets should be stored in the netmap-allocated buffers
+assigned to slots when ports are bound to a file descriptor.
+One packet is fully contained in a single buffer.
+.Pp
+The following flags affect slot and buffer processing:
+.Bl -tag -width XXX
+.It NS_BUF_CHANGED
+it MUST be used when the buf_idx in the slot is changed.
+This can be used to implement
+zero-copy forwarding, see
+.Sx ZERO-COPY FORWARDING .
+.Pp
+.It NS_REPORT
+reports when this buffer has been transmitted.
+Normally,
+.Nm
+notifies transmit completions in batches, hence signals
+can be delayed indefinitely. This flag helps detect
+when packets have been sent and a file descriptor can be closed.
+.It NS_FORWARD
+When a ring is in 'transparent' mode (see
+.Sx TRANSPARENT MODE ) ,
+packets marked with this flag are forwarded to the other endpoint
+at the next system call, thus restoring (in a selective way)
+the connection between a NIC and the host stack.
+.It NS_NO_LEARN
+tells the forwarding code that the SRC MAC address for this
+packet must not be used in the learning bridge code.
+.It NS_INDIRECT
+indicates that the packet's payload is in a user-supplied buffer,
+whose user virtual address is in the 'ptr' field of the slot.
+The size can reach 65535 bytes.
+.br
+This is only supported on the transmit ring of
+.Nm VALE
+ports, and it helps reduce data copies in the interconnection
+of virtual machines.
+.It NS_MOREFRAG
+indicates that the packet continues with subsequent buffers;
+the last buffer in a packet must have the flag clear.
+.El
+.Sh SCATTER GATHER I/O
+Packets can span multiple slots if the
+.Va NS_MOREFRAG
+flag is set in all but the last slot.
+The maximum length of a chain is 64 buffers.
+This is normally used with
+.Nm VALE
+ports when connecting virtual machines, as they generate large
+TSO segments that are not split unless they reach a physical device.
+.Pp
+NOTE: The length field always refers to the individual
+fragment; no field carries the total length of a packet.
+.Pp
+On receive rings the macro
+.Va NS_RFRAGS(slot)
+indicates the remaining number of slots for this packet,
+including the current one.
+Slots with a value greater than 1 also have NS_MOREFRAG set.
+.Sh IOCTLS
 .Nm
-supports some ioctl() to synchronize the state of the rings
-between the kernel and the user processes, plus some
-to query and configure the interface.
-The former do not require any argument, whereas the latter
-use a
-.Pa struct netmap_req
-defined as follows:
+uses two ioctls (NIOCTXSYNC, NIOCRXSYNC)
+for non-blocking I/O. They take no argument.
+Two more ioctls (NIOCGINFO, NIOCREGIF) are used +to query and configure ports, with the following argument: .Bd -literal struct nmreq { - char nr_name[IFNAMSIZ]; - uint32_t nr_offset; /* nifp offset in the shared region */ - uint32_t nr_memsize; /* size of the shared region */ - uint32_t nr_numdescs; /* descriptors per queue */ - uint16_t nr_numqueues; - uint16_t nr_ringid; /* ring(s) we care about */ -#define NETMAP_HW_RING 0x4000 /* low bits indicate one hw ring */ -#define NETMAP_SW_RING 0x2000 /* we process the sw ring */ -#define NETMAP_NO_TX_POLL 0x1000 /* no gratuitous txsync on poll */ -#define NETMAP_RING_MASK 0xfff /* the actual ring number */ + char nr_name[IFNAMSIZ]; /* (i) port name */ + uint32_t nr_version; /* (i) API version */ + uint32_t nr_offset; /* (o) nifp offset in mmap region */ + uint32_t nr_memsize; /* (o) size of the mmap region */ + uint32_t nr_tx_slots; /* (i/o) slots in tx rings */ + uint32_t nr_rx_slots; /* (i/o) slots in rx rings */ + uint16_t nr_tx_rings; /* (i/o) number of tx rings */ + uint16_t nr_rx_rings; /* (i/o) number of tx rings */ + uint16_t nr_ringid; /* (i/o) ring(s) we care about */ + uint16_t nr_cmd; /* (i) special command */ + uint16_t nr_arg1; /* (i/o) extra arguments */ + uint16_t nr_arg2; /* (i/o) extra arguments */ + uint32_t nr_arg3; /* (i/o) extra arguments */ + uint32_t nr_flags /* (i/o) open mode */ + ... }; - .Ed -A device descriptor obtained through +.Pp +A file descriptor obtained through .Pa /dev/netmap -also supports the ioctl supported by network devices. +also supports the ioctl supported by network devices, see +.Xr netintro 4 . .Pp -The netmap-specific -.Xr ioctl 2 -command codes below are defined in -.In net/netmap.h -and are: .Bl -tag -width XXXX .It Dv NIOCGINFO -returns information about the interface named in nr_name. -On return, nr_memsize indicates the size of the shared netmap -memory region (this is device-independent), -nr_numslots indicates how many buffers are in a ring, -nr_numrings indicates the number of rings supported by the hardware. +returns EINVAL if the named port does not support netmap. +Otherwise, it returns 0 and (advisory) information +about the port. +Note that all the information below can change before the +interface is actually put in netmap mode. .Pp -If the device does not support netmap, the ioctl returns EINVAL. +.Bl -tag -width XX +.It Pa nr_memsize +indicates the size of the +.Nm +memory region. NICs in +.Nm +mode all share the same memory region, +whereas +.Nm VALE +ports have independent regions for each port. +.It Pa nr_tx_slots , nr_rx_slots +indicate the size of transmit and receive rings. +.It Pa nr_tx_rings , nr_rx_rings +indicate the number of transmit +and receive rings. +Both ring number and sizes may be configured at runtime +using interface-specific functions (e.g. +.Xr ethtool +). +.El .It Dv NIOCREGIF -puts the interface named in nr_name into netmap mode, disconnecting -it from the host stack, and/or defines which rings are controlled -through this file descriptor. -On return, it gives the same info as NIOCGINFO, and nr_ringid -indicates the identity of the rings controlled through the file +binds the port named in +.Va nr_name +to the file descriptor. For a physical device this also switches it into +.Nm +mode, disconnecting +it from the host stack. +Multiple file descriptors can be bound to the same port, +with proper synchronization left to the user. 
+.Pp
+.Dv NIOCREGIF can also bind a file descriptor to one endpoint of a
+.Em netmap pipe ,
+consisting of two netmap ports with a crossover connection.
+A netmap pipe shares the same memory space as the parent port,
+and is meant to enable configurations where a master process acts
+as a dispatcher towards slave processes.
+.Pp
+To enable this function, the
+.Pa nr_arg1
+field of the structure can be used as a hint to the kernel to
+indicate how many pipes we expect to use, and reserve extra space
+in the memory region.
+.Pp
+On return, it gives the same info as NIOCGINFO,
+with
+.Pa nr_ringid
+and
+.Pa nr_flags
+indicating the identity of the rings controlled through the file
 descriptor.
 .Pp
-Possible values for nr_ringid are
+.Va nr_flags
+and nr_ringid
+select which rings are controlled through this file descriptor.
+Possible values of
+.Pa nr_flags
+are indicated below, together with the naming schemes
+that application libraries (such as the
+.Nm nm_open
+indicated below) can use to indicate the specific set of rings.
+In the example below, "netmap:foo" is any valid netmap port name.
+.Pp
 .Bl -tag -width XXXXX
-.It 0
-default, all hardware rings
-.It NETMAP_SW_RING
-the ``host rings'' connecting to the host stack
-.It NETMAP_HW_RING + i
-the i-th hardware ring
+.It NR_REG_ALL_NIC "netmap:foo"
+(default) all hardware ring pairs
+.It NR_REG_SW "netmap:foo^"
+the ``host rings'', connecting to the host stack.
+.It NR_REG_NIC_SW "netmap:foo+"
+all hardware rings and the host rings
+.It NR_REG_ONE_NIC "netmap:foo-i"
+only the i-th hardware ring pair, where the number is in
+.Pa nr_ringid ;
+.It NR_REG_PIPE_MASTER "netmap:foo{i"
+the master side of the netmap pipe whose identifier (i) is in
+.Pa nr_ringid ;
+.It NR_REG_PIPE_SLAVE "netmap:foo}i"
+the slave side of the netmap pipe whose identifier (i) is in
+.Pa nr_ringid .
+.Pp
+The identifier of a pipe must be thought of as part of the pipe name,
+and does not need to be sequential. On return the pipe
+will only have a single ring pair with index 0,
+irrespective of the value of i.
 .El
+.Pp
 By default, a
-.Nm poll
+.Xr poll 2
 or
-.Nm select
+.Xr select 2
 call pushes out any pending packets on the transmit ring, even if
 no write events are specified.
 The feature can be disabled by or-ing
-.Nm NETMAP_NO_TX_SYNC
-to nr_ringid.
-But normally you should keep this feature unless you are using
-separate file descriptors for the send and receive rings, because
-otherwise packets are pushed out only if NETMAP_TXSYNC is called,
-or the send queue is full.
-.Pp
-.Pa NIOCREGIF
-can be used multiple times to change the association of a
-file descriptor to a ring pair, always within the same device.
-.It Dv NIOCUNREGIF
-brings an interface back to normal mode.
+.Va NETMAP_NO_TX_POLL
+to the value written to
+.Va nr_ringid.
+When this feature is used,
+packets are transmitted only when
+.Va ioctl(NIOCTXSYNC)
+or select()/poll() are called with a write event (POLLOUT/wfdset), or the ring is full.
+.Pp
+When registering a virtual interface that is dynamically created on a
+.Xr vale 4
+switch, we can specify the desired number of rings (1 by default,
+and currently up to 16) on it using the nr_tx_rings and nr_rx_rings fields.
 .It Dv NIOCTXSYNC
 tells the hardware of new packets to transmit, and updates the
 number of slots available for transmission.
@@ -246,54 +658,418 @@ number of slots available for transmission.
 tells the hardware of consumed packets, and asks for newly available
 packets.
 .El
-.Ss SYSTEM CALLS
+.Sh SELECT, POLL, EPOLL, KQUEUE.
+.Xr select 2
+and
+.Xr poll 2
+on a
+.Nm
+file descriptor process rings as indicated in
+.Sx TRANSMIT RINGS
+and
+.Sx RECEIVE RINGS ,
+respectively when write (POLLOUT) and read (POLLIN) events are requested.
+Both block if no slots are available in the ring
+.Va ( ring->cur == ring->tail ) .
+Depending on the platform,
+.Xr epoll 2
+and
+.Xr kqueue 2
+are supported too.
+.Pp
+Packets in transmit rings are normally pushed out
+(and buffers reclaimed) even without
+requesting write events. Passing the NETMAP_NO_TX_POLL flag to
+.Em NIOCREGIF
+disables this feature.
+By default, receive rings are processed only if read
+events are requested. Passing the NETMAP_DO_RX_POLL flag to
+.Em NIOCREGIF updates receive rings even without read events.
+Note that on epoll and kqueue, NETMAP_NO_TX_POLL and NETMAP_DO_RX_POLL
+only have an effect when some event is posted for the file descriptor.
+.Sh LIBRARIES
+The
+.Nm
+API is meant to be used directly, both because of its simplicity and
+for efficient integration with applications.
+.Pp
+For convenience, the
+.Va <net/netmap_user.h>
+header provides a few macros and functions to ease creating
+a file descriptor and doing I/O with a
+.Nm
+port. These are loosely modeled after the
+.Xr pcap 3
+API, to ease porting of libpcap-based applications to
+.Nm .
+To use these extra functions, programs should
+.Dl #define NETMAP_WITH_LIBS
+before
+.Dl #include <net/netmap_user.h>
+.Pp
+The following functions are available:
+.Bl -tag -width XXXXX
+.It Va struct nm_desc * nm_open(const char *ifname, const struct nmreq *req, uint64_t flags, const struct nm_desc *arg)
+similar to
+.Xr pcap_open ,
+binds a file descriptor to a port.
+.Bl -tag -width XX
+.It Va ifname
+is a port name, in the form "netmap:XXX" for a NIC and "valeXXX:YYY" for a
+.Nm VALE
+port.
+.It Va req
+provides the initial values for the argument to the NIOCREGIF ioctl.
+The nr_flags and nr_ringid values are overwritten by parsing
+ifname and flags, and other fields can be overridden through
+the other two arguments.
+.It Va arg
+points to a struct nm_desc containing arguments (e.g. from a previously
+open file descriptor) that should override the defaults.
+The fields are used as described below.
+.It Va flags
+can be set to a combination of the following flags:
+.Va NETMAP_NO_TX_POLL ,
+.Va NETMAP_DO_RX_POLL
+(copied into nr_ringid);
+.Va NM_OPEN_NO_MMAP (if arg points to the same memory region,
+avoids the mmap and uses the values from it);
+.Va NM_OPEN_IFNAME (ignores ifname and uses the values in arg);
+.Va NM_OPEN_ARG1 ,
+.Va NM_OPEN_ARG2 ,
+.Va NM_OPEN_ARG3 (uses the fields from arg);
+.Va NM_OPEN_RING_CFG (uses the ring number and sizes from arg).
+.El
+.It Va int nm_close(struct nm_desc *d)
+closes the file descriptor, unmaps memory, frees resources.
+.It Va int nm_inject(struct nm_desc *d, const void *buf, size_t size)
+similar to pcap_inject(), pushes a packet to a ring, returns the size
+of the packet if successful, or 0 on error;
+.It Va int nm_dispatch(struct nm_desc *d, int cnt, nm_cb_t cb, u_char *arg)
+similar to pcap_dispatch(), applies a callback to incoming packets
+.It Va u_char * nm_nextpkt(struct nm_desc *d, struct nm_pkthdr *hdr)
+similar to pcap_next(), fetches the next packet
+.Pp
+.El
+.Sh SUPPORTED DEVICES
+.Nm
+natively supports the following devices:
+.Pp
+On FreeBSD:
+.Xr em 4 ,
+.Xr igb 4 ,
+.Xr ixgbe 4 ,
+.Xr lem 4 ,
+.Xr re 4 .
+.Pp
+On Linux:
+.Xr e1000 4 ,
+.Xr e1000e 4 ,
+.Xr igb 4 ,
+.Xr ixgbe 4 ,
+.Xr mlx4 4 ,
+.Xr forcedeth 4 ,
+.Xr r8169 4 .
+.Pp +NICs without native support can still be used in +.Nm +mode through emulation. Performance is inferior to native netmap +mode but still significantly higher than sockets, and approaching +that of in-kernel solutions such as Linux's +.Xr pktgen . +.Pp +Emulation is also available for devices with native netmap support, +which can be used for testing or performance comparison. +The sysctl variable +.Va dev.netmap.admode +globally controls how netmap mode is implemented. +.Sh SYSCTL VARIABLES AND MODULE PARAMETERS +Some aspect of the operation of +.Nm +are controlled through sysctl variables on FreeBSD +.Em ( dev.netmap.* ) +and module parameters on Linux +.Em ( /sys/module/netmap_lin/parameters/* ) : +.Pp +.Bl -tag -width indent +.It Va dev.netmap.admode: 0 +Controls the use of native or emulated adapter mode. +0 uses the best available option, 1 forces native and +fails if not available, 2 forces emulated hence never fails. +.It Va dev.netmap.generic_ringsize: 1024 +Ring size used for emulated netmap mode +.It Va dev.netmap.generic_mit: 100000 +Controls interrupt moderation for emulated mode +.It Va dev.netmap.mmap_unreg: 0 +.It Va dev.netmap.fwd: 0 +Forces NS_FORWARD mode +.It Va dev.netmap.flags: 0 +.It Va dev.netmap.txsync_retry: 2 +.It Va dev.netmap.no_pendintr: 1 +Forces recovery of transmit buffers on system calls +.It Va dev.netmap.mitigate: 1 +Propagates interrupt mitigation to user processes +.It Va dev.netmap.no_timestamp: 0 +Disables the update of the timestamp in the netmap ring +.It Va dev.netmap.verbose: 0 +Verbose kernel messages +.It Va dev.netmap.buf_num: 163840 +.It Va dev.netmap.buf_size: 2048 +.It Va dev.netmap.ring_num: 200 +.It Va dev.netmap.ring_size: 36864 +.It Va dev.netmap.if_num: 100 +.It Va dev.netmap.if_size: 1024 +Sizes and number of objects (netmap_if, netmap_ring, buffers) +for the global memory region. The only parameter worth modifying is +.Va dev.netmap.buf_num +as it impacts the total amount of memory used by netmap. +.It Va dev.netmap.buf_curr_num: 0 +.It Va dev.netmap.buf_curr_size: 0 +.It Va dev.netmap.ring_curr_num: 0 +.It Va dev.netmap.ring_curr_size: 0 +.It Va dev.netmap.if_curr_num: 0 +.It Va dev.netmap.if_curr_size: 0 +Actual values in use. +.It Va dev.netmap.bridge_batch: 1024 +Batch size used when moving packets across a +.Nm VALE +switch. Values above 64 generally guarantee good +performance. +.El +.Sh SYSTEM CALLS .Nm uses -.Nm select +.Xr select 2 , +.Xr poll 2 , +.Xr epoll and -.Nm poll -to wake up processes when significant events occur. +.Xr kqueue +to wake up processes when significant events occur, and +.Xr mmap 2 +to map memory. +.Xr ioctl 2 +is used to configure ports and +.Nm VALE switches . +.Pp +Applications may need to create threads and bind them to +specific cores to improve performance, using standard +OS primitives, see +.Xr pthread 3 . +In particular, +.Xr pthread_setaffinity_np 3 +may be of use. +.Sh CAVEATS +No matter how fast the CPU and OS are, +achieving line rate on 10G and faster interfaces +requires hardware with sufficient performance. +Several NICs are unable to sustain line rate with +small packet sizes. Insufficient PCIe or memory bandwidth +can also cause reduced performance. +.Pp +Another frequent reason for low performance is the use +of flow control on the link: a slow receiver can limit +the transmit speed. +Be sure to disable flow control when running high +speed experiments. +.Pp +.Ss SPECIAL NIC FEATURES +.Nm +is orthogonal to some NIC features such as +multiqueue, schedulers, packet filters. 
+.Pp
+Multiple transmit and receive rings are supported natively
+and can be configured with ordinary OS tools,
+such as
+.Xr ethtool
+or
+device-specific sysctl variables.
+The same goes for Receive Packet Steering (RPS)
+and filtering of incoming traffic.
+.Pp
+.Nm
+.Em does not use
+features such as
+.Em checksum offloading , TCP segmentation offloading ,
+.Em encryption , VLAN encapsulation/decapsulation ,
+etc.
+When using netmap to exchange packets with the host stack,
+make sure to disable these features.
 .Sh EXAMPLES
+.Ss TEST PROGRAMS
+.Nm
+comes with a few programs that can be used for testing or
+simple applications.
+See the
+.Va examples/
+directory in
+.Nm
+distributions, or
+.Va tools/tools/netmap/
+directory in FreeBSD distributions.
+.Pp
+.Xr pkt-gen
+is a general purpose traffic source/sink.
+.Pp
+As an example,
+.Dl pkt-gen -i ix0 -f tx -l 60
+can generate an infinite stream of minimum size packets, and
+.Dl pkt-gen -i ix0 -f rx
+is a traffic sink.
+Both print traffic statistics, to help monitor
+how the system performs.
+.Pp
+.Xr pkt-gen
+has many options that can be used to set packet sizes, addresses,
+rates, and to use multiple send/receive threads and cores.
+.Pp
+.Xr bridge
+is another test program which interconnects two
+.Nm
+ports. It can be used for transparent forwarding between
+interfaces, as in
+.Dl bridge -i ix0 -i ix1
+or even to connect the NIC to the host stack using netmap
+.Dl bridge -i ix0 -i ix0
+.Ss USING THE NATIVE API
 The following code implements a traffic generator
 .Pp
 .Bd -literal -compact
-#include <net/netmap.h>
 #include <net/netmap_user.h>
-struct netmap_if *nifp;
-struct netmap_ring *ring;
-struct netmap_request nmr;
+...
+void sender(void)
+{
+    struct netmap_if *nifp;
+    struct netmap_ring *ring;
+    struct nmreq nmr;
+    struct pollfd fds;
 
-fd = open("/dev/netmap", O_RDWR);
-bzero(&nmr, sizeof(nmr));
-strcpy(nmr.nm_name, "ix0");
-ioctl(fd, NIOCREG, &nmr);
-p = mmap(0, nmr.memsize, fd);
-nifp = NETMAP_IF(p, nmr.offset);
-ring = NETMAP_TXRING(nifp, 0);
-fds.fd = fd;
-fds.events = POLLOUT;
-for (;;) {
-    poll(list, 1, -1);
-    while (ring->avail-- > 0) {
-        i = ring->cur;
-        buf = NETMAP_BUF(ring, ring->slot[i].buf_index);
-        ... prepare packet in buf ...
-        ring->slot[i].len = ... packet length ...
-        ring->cur = NETMAP_RING_NEXT(ring, i);
+    fd = open("/dev/netmap", O_RDWR);
+    bzero(&nmr, sizeof(nmr));
+    strcpy(nmr.nr_name, "ix0");
+    nmr.nr_version = NETMAP_API;
+    ioctl(fd, NIOCREGIF, &nmr);
+    p = mmap(0, nmr.nr_memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    nifp = NETMAP_IF(p, nmr.nr_offset);
+    ring = NETMAP_TXRING(nifp, 0);
+    fds.fd = fd;
+    fds.events = POLLOUT;
+    for (;;) {
+        poll(&fds, 1, -1);
+        while (!nm_ring_empty(ring)) {
+            i = ring->cur;
+            buf = NETMAP_BUF(ring, ring->slot[i].buf_idx);
+            ... prepare packet in buf ...
+            ring->slot[i].len = ... packet length ...
+            ring->head = ring->cur = nm_ring_next(ring, i);
+        }
     }
 }
 .Ed
-.Sh SUPPORTED INTERFACES
+.Ss HELPER FUNCTIONS
+A simple receiver can be implemented using the helper functions:
+.Bd -literal -compact
+#define NETMAP_WITH_LIBS
+#include <net/netmap_user.h>
+...
+void receiver(void)
+{
+    struct nm_desc *d;
+    struct pollfd fds;
+    u_char *buf;
+    struct nm_pkthdr h;
+    ...
+    d = nm_open("netmap:ix0", NULL, 0, 0);
+    fds.fd = NETMAP_FD(d);
+    fds.events = POLLIN;
+    for (;;) {
+        poll(&fds, 1, -1);
+        while ( (buf = nm_nextpkt(d, &h)) )
+            consume_pkt(buf, h.len);
+    }
+    nm_close(d);
+}
+.Ed
+.Ss ZERO-COPY FORWARDING
+Since physical interfaces share the same memory region,
+it is possible to do packet forwarding between ports
+by swapping buffers.
The buffer from the transmit ring is used +to replenish the receive ring: +.Bd -literal -compact + uint32_t tmp; + struct netmap_slot *src, *dst; + ... + src = &src_ring->slot[rxr->cur]; + dst = &dst_ring->slot[txr->cur]; + tmp = dst->buf_idx; + dst->buf_idx = src->buf_idx; + dst->len = src->len; + dst->flags = NS_BUF_CHANGED; + src->buf_idx = tmp; + src->flags = NS_BUF_CHANGED; + rxr->head = rxr->cur = nm_ring_next(rxr, rxr->cur); + txr->head = txr->cur = nm_ring_next(txr, txr->cur); + ... +.Ed +.Ss ACCESSING THE HOST STACK +The host stack is for all practical purposes just a regular ring pair, +which you can access with the netmap API (e.g. with +.Dl nm_open("netmap:eth0^", ... ) ; +All packets that the host would send to an interface in .Nm -supports the following interfaces: -.Xr em 4 , -.Xr ixgbe 4 , -.Xr re 4 , +mode end up into the RX ring, whereas all packets queued to the +TX ring are send up to the host stack. +.Ss VALE SWITCH +A simple way to test the performance of a +.Nm VALE +switch is to attach a sender and a receiver to it, +e.g. running the following in two different terminals: +.Dl pkt-gen -i vale1:a -f rx # receiver +.Dl pkt-gen -i vale1:b -f tx # sender +The same example can be used to test netmap pipes, by simply +changing port names, e.g. +.Dl pkt-gen -i vale:x{3 -f rx # receiver on the master side +.Dl pkt-gen -i vale:x}3 -f tx # sender on the slave side +.Pp +The following command attaches an interface and the host stack +to a switch: +.Dl vale-ctl -h vale2:em0 +Other +.Nm +clients attached to the same switch can now communicate +with the network card or the host. +.Pp +.Sh SEE ALSO +.Pp +http://info.iet.unipi.it/~luigi/netmap/ +.Pp +Luigi Rizzo, Revisiting network I/O APIs: the netmap framework, +Communications of the ACM, 55 (3), pp.45-51, March 2012 +.Pp +Luigi Rizzo, netmap: a novel framework for fast packet I/O, +Usenix ATC'12, June 2012, Boston +.Pp +Luigi Rizzo, Giuseppe Lettieri, +VALE, a switched ethernet for virtual machines, +ACM CoNEXT'12, December 2012, Nice +.Pp +Luigi Rizzo, Giuseppe Lettieri, Vincenzo Maffione, +Speeding up packet I/O in virtual machines, +ACM/IEEE ANCS'13, October 2013, San Jose .Sh AUTHORS +.An -nosplit The .Nm -framework has been designed and implemented by -.An Luigi Rizzo +framework has been originally designed and implemented at the +Universita` di Pisa in 2011 by +.An Luigi Rizzo , +and further extended with help from +.An Matteo Landi , +.An Gaetano Catalli , +.An Giuseppe Lettieri , +.An Vincenzo Maffione . +.Pp +.Nm and -.An Matteo Landi -in 2011 at the Universita` di Pisa. +.Nm VALE +have been funded by the European Commission within FP7 Projects +CHANGE (257422) and OPENLAB (287581). 
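The SCATTER GATHER I/O section of netmap.4 above describes NS_MOREFRAG, but none of the manual page examples exercise it. The sketch below is an editorial illustration only (it is not part of the patch): it shows one way a userspace sender could queue a packet spanning several TX slots on a VALE port, assuming a port already opened with nm_open() as in the HELPER FUNCTIONS example and bound to ring 0. It uses only macros documented above (NETMAP_TXRING, NETMAP_BUF, nm_ring_next, NETMAP_FD); send_multislot() is an illustrative name, not part of the netmap API.

/*
 * Illustrative sketch (not part of this patch): queue one packet that
 * spans several TX slots using NS_MOREFRAG, as described in the
 * SCATTER GATHER I/O section above.  Applies to VALE port TX rings,
 * with chains of at most 64 buffers.  send_multislot() is a made-up name.
 */
#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <string.h>

static int
send_multislot(struct nm_desc *d, const char *payload, size_t totlen)
{
    struct netmap_ring *ring = NETMAP_TXRING(d->nifp, 0); /* assumes ring 0 */
    u_int cur = ring->cur;
    /* free TX slots between cur and tail (head..tail-1 are writable) */
    u_int space = (ring->tail >= cur) ?
        ring->tail - cur : ring->tail + ring->num_slots - cur;
    u_int nfrags = (totlen + ring->nr_buf_size - 1) / ring->nr_buf_size;
    size_t off = 0;

    if (nfrags == 0 || nfrags > space)
        return -1;    /* caller should poll(POLLOUT) and retry */

    while (off < totlen) {
        struct netmap_slot *slot = &ring->slot[cur];
        size_t chunk = totlen - off;

        if (chunk > ring->nr_buf_size)
            chunk = ring->nr_buf_size;
        memcpy(NETMAP_BUF(ring, slot->buf_idx), payload + off, chunk);
        slot->len = (uint16_t)chunk;
        /* every fragment except the last carries NS_MOREFRAG */
        slot->flags = (off + chunk < totlen) ? NS_MOREFRAG : 0;
        off += chunk;
        cur = nm_ring_next(ring, cur);
    }
    ring->head = ring->cur = cur;    /* hand the whole chain to the kernel */
    return ioctl(NETMAP_FD(d), NIOCTXSYNC, NULL);
}

As the manual notes, each slot's len field covers only its own fragment; a receiver walks the chain until it finds a slot without NS_MOREFRAG (or uses NS_RFRAGS on receive rings) to recover the full packet.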
diff --git a/sys/conf/files b/sys/conf/files index 87dc097f6..889e38d84 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1599,7 +1599,13 @@ dev/my/if_my.c optional my dev/ncv/ncr53c500.c optional ncv dev/ncv/ncr53c500_pccard.c optional ncv pccard dev/netmap/netmap.c optional netmap +dev/netmap/netmap_freebsd.c optional netmap +dev/netmap/netmap_generic.c optional netmap +dev/netmap/netmap_mbq.c optional netmap dev/netmap/netmap_mem2.c optional netmap +dev/netmap/netmap_offloadings.c optional netmap +dev/netmap/netmap_pipe.c optional netmap +dev/netmap/netmap_vale.c optional netmap dev/nge/if_nge.c optional nge dev/nxge/if_nxge.c optional nxge \ compile-with "${NORMAL_C} ${NO_WSELF_ASSIGN}" diff --git a/sys/dev/e1000/if_em.c b/sys/dev/e1000/if_em.c index a3b3f9aeb..683385ee6 100644 --- a/sys/dev/e1000/if_em.c +++ b/sys/dev/e1000/if_em.c @@ -4352,7 +4352,7 @@ em_initialize_receive_unit(struct adapter *adapter) * preserve the rx buffers passed to userspace. */ if (ifp->if_capenable & IFCAP_NETMAP) - rdt -= NA(adapter->ifp)->rx_rings[i].nr_hwavail; + rdt -= nm_kr_rxspace(&NA(adapter->ifp)->rx_rings[i]); #endif /* DEV_NETMAP */ E1000_WRITE_REG(hw, E1000_RDT(i), rdt); } diff --git a/sys/dev/e1000/if_igb.c b/sys/dev/e1000/if_igb.c index 3ccc8d6d0..196323d71 100644 --- a/sys/dev/e1000/if_igb.c +++ b/sys/dev/e1000/if_igb.c @@ -4571,13 +4571,13 @@ igb_initialize_receive_units(struct adapter *adapter) * an init() while a netmap client is active must * preserve the rx buffers passed to userspace. * In this driver it means we adjust RDT to - * somthing different from next_to_refresh + * something different from next_to_refresh * (which is not used in netmap mode). */ if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(adapter->ifp); struct netmap_kring *kring = &na->rx_rings[i]; - int t = rxr->next_to_refresh - kring->nr_hwavail; + int t = rxr->next_to_refresh - nm_kr_rxspace(kring); if (t >= adapter->num_rx_desc) t -= adapter->num_rx_desc; diff --git a/sys/dev/e1000/if_lem.c b/sys/dev/e1000/if_lem.c index dde6d3d25..40561353e 100644 --- a/sys/dev/e1000/if_lem.c +++ b/sys/dev/e1000/if_lem.c @@ -3368,7 +3368,7 @@ lem_initialize_receive_unit(struct adapter *adapter) #ifdef DEV_NETMAP /* preserve buffers already made available to clients */ if (ifp->if_capenable & IFCAP_NETMAP) - rctl -= NA(adapter->ifp)->rx_rings[0].nr_hwavail; + rctl -= nm_kr_rxspace(&NA(adapter->ifp)->rx_rings[0]); #endif /* DEV_NETMAP */ E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), rctl); diff --git a/sys/dev/ixgbe/ixgbe.c b/sys/dev/ixgbe/ixgbe.c index dca60b5c3..388428710 100644 --- a/sys/dev/ixgbe/ixgbe.c +++ b/sys/dev/ixgbe/ixgbe.c @@ -1245,7 +1245,7 @@ ixgbe_init_locked(struct adapter *adapter) if (ifp->if_capenable & IFCAP_NETMAP) { struct netmap_adapter *na = NA(adapter->ifp); struct netmap_kring *kring = &na->rx_rings[i]; - int t = na->num_rx_desc - 1 - kring->nr_hwavail; + int t = na->num_rx_desc - 1 - nm_kr_rxspace(kring); IXGBE_WRITE_REG(hw, IXGBE_RDT(i), t); } else diff --git a/sys/dev/netmap/if_em_netmap.h b/sys/dev/netmap/if_em_netmap.h index 1ea11238a..17b4c4fd2 100644 --- a/sys/dev/netmap/if_em_netmap.h +++ b/sys/dev/netmap/if_em_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,7 +26,7 @@ /* * $FreeBSD$ * - * netmap support for em. + * netmap support for: em. * * For more details on netmap support please see ixgbe_netmap.h */ @@ -39,10 +39,6 @@ #include -static void em_netmap_block_tasks(struct adapter *); -static void em_netmap_unblock_tasks(struct adapter *); - - // XXX do we need to block/unblock the tasks ? static void em_netmap_block_tasks(struct adapter *adapter) @@ -85,45 +81,31 @@ em_netmap_unblock_tasks(struct adapter *adapter) /* - * Register/unregister routine + * Register/unregister. We are already under netmap lock. */ static int -em_netmap_reg(struct ifnet *ifp, int onoff) +em_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - - if (na == NULL) - return EINVAL; /* no netmap support here */ + EM_CORE_LOCK(adapter); em_disable_intr(adapter); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); em_netmap_block_tasks(adapter); - + /* enable or disable flags and callbacks in na and ifp */ if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_transmit; - - em_init_locked(adapter); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } + nm_set_native_flags(na); } else { -fail: - /* return to non-netmap mode */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - em_init_locked(adapter); /* also enable intr */ + nm_clear_native_flags(na); } + em_init_locked(adapter); /* also enable intr */ em_netmap_unblock_tasks(adapter); - return (error); + EM_CORE_UNLOCK(adapter); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } @@ -131,93 +113,92 @@ fail: * Reconcile kernel and user view of the transmit ring. */ static int -em_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) +em_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct tx_ring *txr = &adapter->tx_rings[ring_nr]; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, k, l, n = 0, lim = kring->nkr_num_slots - 1; - + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; /* generate an interrupt approximately every half ring */ u_int report_frequency = kring->nkr_num_slots >> 1; - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); /* - * Process new packets to send. j is the current index in the - * netmap ring, l is the corresponding index in the NIC ring. + * First part: process new packets to send. 
*/ - j = kring->nr_hwcur; - if (j != k) { /* we have new packets to send */ - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - /* slot is the current slot in the netmap ring */ - struct netmap_slot *slot = &ring->slot[j]; - /* curr is the current slot in the nic ring */ - struct e1000_tx_desc *curr = &txr->tx_base[l]; - struct em_buffer *txbuf = &txr->tx_buffers[l]; - int flags = ((slot->flags & NS_REPORT) || - j == 0 || j == report_frequency) ? - E1000_TXD_CMD_RS : 0; + + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; uint64_t paddr; void *addr = PNMB(slot, &paddr); - u_int len = slot->len; - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - return netmap_ring_reinit(kring); - } + /* device-specific */ + struct e1000_tx_desc *curr = &txr->tx_base[nic_i]; + struct em_buffer *txbuf = &txr->tx_buffers[nic_i]; + int flags = (slot->flags & NS_REPORT || + nic_i == 0 || nic_i == report_frequency) ? + E1000_TXD_CMD_RS : 0; + + NM_CHECK_ADDR_LEN(addr, len); - slot->flags &= ~NS_REPORT; if (slot->flags & NS_BUF_CHANGED) { curr->buffer_addr = htole64(paddr); /* buffer has changed, reload map */ netmap_reload_map(txr->txtag, txbuf->map, addr); - slot->flags &= ~NS_BUF_CHANGED; } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ curr->upper.data = 0; curr->lower.data = htole32(adapter->txd_cmd | len | (E1000_TXD_CMD_EOP | flags) ); bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_PREWRITE); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = k; /* the saved ring->cur */ - kring->nr_hwavail -= n; + kring->nr_hwcur = head; + /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), l); + /* (re)start the tx unit up to slot nic_i (excluded) */ + E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), nic_i); } - if (n == 0 || kring->nr_hwavail < 1) { - int delta; - + /* + * Second part: reclaim buffers for completed transmissions. + */ + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { /* record completed transmissions using TDH */ - l = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); - if (l >= kring->nkr_num_slots) { /* XXX can it happen ? */ - D("TDH wrap %d", l); - l -= kring->nkr_num_slots; + nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); + if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ + D("TDH wrap %d", nic_i); + nic_i -= kring->nkr_num_slots; } - delta = l - txr->next_to_clean; - if (delta) { - /* some completed, increment hwavail. */ - if (delta < 0) - delta += kring->nkr_num_slots; - txr->next_to_clean = l; - kring->nr_hwavail += delta; + if (nic_i != txr->next_to_clean) { + txr->next_to_clean = nic_i; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } } - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; + + nm_txsync_finalize(kring); return 0; } @@ -227,19 +208,23 @@ em_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) * Reconcile kernel and user view of the receive ring. 
*/ static int -em_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) +em_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; - k = ring->cur; - if (k > lim) + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + + if (head > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ @@ -247,84 +232,84 @@ em_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * Import newly received packets into the netmap ring. - * j is an index in the netmap ring, l in the NIC ring. + * First part: import newly received packets. */ - l = rxr->next_to_check; - j = netmap_idx_n2k(kring, l); if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; - for (n = 0; ; n++) { - struct e1000_rx_desc *curr = &rxr->rx_base[l]; + nic_i = rxr->next_to_check; + nm_i = netmap_idx_n2k(kring, nic_i); + + for (n = 0; ; n++) { // XXX no need to count + struct e1000_rx_desc *curr = &rxr->rx_base[nic_i]; uint32_t staterr = le32toh(curr->status); if ((staterr & E1000_RXD_STAT_DD) == 0) break; - ring->slot[j].len = le16toh(curr->length); - ring->slot[j].flags = slot_flags; - bus_dmamap_sync(rxr->rxtag, rxr->rx_buffers[l].map, + ring->slot[nm_i].len = le16toh(curr->length); + ring->slot[nm_i].flags = slot_flags; + bus_dmamap_sync(rxr->rxtag, rxr->rx_buffers[nic_i].map, BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; + nm_i = nm_next(nm_i, lim); /* make sure next_to_refresh follows next_to_check */ - rxr->next_to_refresh = l; // XXX - l = (l == lim) ? 0 : l + 1; + rxr->next_to_refresh = nic_i; // XXX + nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ - rxr->next_to_check = l; - kring->nr_hwavail += n; + rxr->next_to_check = nic_i; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); /* NIC ring index */ - for (n = 0; j != k; n++) { - struct netmap_slot *slot = &ring->slot[j]; - struct e1000_rx_desc *curr = &rxr->rx_base[l]; - struct em_buffer *rxbuf = &rxr->rx_buffers[l]; + /* + * Second part: skip past packets that userspace has released. 
+ */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); - if (addr == netmap_buffer_base) { /* bad buf */ - return netmap_ring_reinit(kring); - } + struct e1000_rx_desc *curr = &rxr->rx_base[nic_i]; + struct em_buffer *rxbuf = &rxr->rx_buffers[nic_i]; + + if (addr == netmap_buffer_base) /* bad buf */ + goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { - curr->buffer_addr = htole64(paddr); /* buffer has changed, reload map */ + curr->buffer_addr = htole64(paddr); netmap_reload_map(rxr->rxtag, rxbuf->map, addr); slot->flags &= ~NS_BUF_CHANGED; } curr->status = 0; bus_dmamap_sync(rxr->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = head; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, - * so move l back by one unit + * so move nic_i back by one unit */ - l = (l == 0) ? lim : l - 1; - E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), l); + nic_i = nm_prev(nic_i, lim); + E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i); } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; + + /* tell userspace that there might be new packets */ + nm_rxsync_finalize(kring); + return 0; + +ring_reset: + return netmap_ring_reinit(kring); } @@ -342,7 +327,8 @@ em_netmap_attach(struct adapter *adapter) na.nm_txsync = em_netmap_txsync; na.nm_rxsync = em_netmap_rxsync; na.nm_register = em_netmap_reg; - netmap_attach(&na, adapter->num_queues); + na.num_tx_rings = na.num_rx_rings = adapter->num_queues; + netmap_attach(&na); } /* end of file */ diff --git a/sys/dev/netmap/if_igb_netmap.h b/sys/dev/netmap/if_igb_netmap.h index 10d94b5fa..e1929f091 100644 --- a/sys/dev/netmap/if_igb_netmap.h +++ b/sys/dev/netmap/if_igb_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Universita` di Pisa. All rights reserved. + * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -37,44 +37,43 @@ #include /* vtophys ? */ #include +/* + * Adaptation to different versions of the driver. + */ + +#ifndef IGB_MEDIA_RESET +/* at the same time as IGB_MEDIA_RESET was defined, the + * tx buffer descriptor was renamed, so use this to revert + * back to the old name. + */ +#define igb_tx_buf igb_tx_buffer +#endif + /* - * register-unregister routine + * Register/unregister. We are already under netmap lock. 
*/ static int -igb_netmap_reg(struct ifnet *ifp, int onoff) +igb_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - - if (na == NULL) - return EINVAL; /* no netmap support here */ + IGB_CORE_LOCK(adapter); igb_disable_intr(adapter); /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + /* enable or disable flags and callbacks in na and ifp */ if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_transmit; - - igb_init_locked(adapter); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } + nm_set_native_flags(na); } else { -fail: - /* restore if_transmit */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - igb_init_locked(adapter); /* also enable intr */ + nm_clear_native_flags(na); } - return (error); + igb_init_locked(adapter); /* also enable intr */ + IGB_CORE_UNLOCK(adapter); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } @@ -82,68 +81,59 @@ fail: * Reconcile kernel and user view of the transmit ring. */ static int -igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) +igb_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct tx_ring *txr = &adapter->tx_rings[ring_nr]; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, k, l, n = 0, lim = kring->nkr_num_slots - 1; - + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; /* generate an interrupt approximately every half ring */ u_int report_frequency = kring->nkr_num_slots >> 1; - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + /* 82575 needs the queue index added */ + u32 olinfo_status = + (adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, - BUS_DMASYNC_POSTREAD); + BUS_DMASYNC_POSTREAD); - /* check for new packets to send. - * j indexes the netmap ring, l indexes the nic ring, and - * j = kring->nr_hwcur, l = E1000_TDT (not tracked), - * j == (l + kring->nkr_hwofs) % ring_size + /* + * First part: process new packets to send. */ - j = kring->nr_hwcur; - if (j != k) { /* we have new packets to send */ - /* 82575 needs the queue index added */ - u32 olinfo_status = - (adapter->hw.mac.type == e1000_82575) ? (txr->me << 4) : 0; - - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - /* slot is the current slot in the netmap ring */ - struct netmap_slot *slot = &ring->slot[j]; - /* curr is the current slot in the nic ring */ - union e1000_adv_tx_desc *curr = - (union e1000_adv_tx_desc *)&txr->tx_base[l]; -#ifndef IGB_MEDIA_RESET -/* at the same time as IGB_MEDIA_RESET was defined, the - * tx buffer descriptor was renamed, so use this to revert - * back to the old name. - */ -#define igb_tx_buf igb_tx_buffer -#endif - struct igb_tx_buf *txbuf = &txr->tx_buffers[l]; - int flags = ((slot->flags & NS_REPORT) || - j == 0 || j == report_frequency) ? 
- E1000_ADVTXD_DCMD_RS : 0; + + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; uint64_t paddr; void *addr = PNMB(slot, &paddr); - u_int len = slot->len; - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - return netmap_ring_reinit(kring); - } + /* device-specific */ + union e1000_adv_tx_desc *curr = + (union e1000_adv_tx_desc *)&txr->tx_base[nic_i]; + struct igb_tx_buf *txbuf = &txr->tx_buffers[nic_i]; + int flags = (slot->flags & NS_REPORT || + nic_i == 0 || nic_i == report_frequency) ? + E1000_ADVTXD_DCMD_RS : 0; + + NM_CHECK_ADDR_LEN(addr, len); - slot->flags &= ~NS_REPORT; if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ netmap_reload_map(txr->txtag, txbuf->map, addr); - slot->flags &= ~NS_BUF_CHANGED; } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ curr->read.buffer_addr = htole64(paddr); // XXX check olinfo and cmd_type_len curr->read.olinfo_status = @@ -151,48 +141,46 @@ igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) (len<< E1000_ADVTXD_PAYLEN_SHIFT)); curr->read.cmd_type_len = htole32(len | E1000_ADVTXD_DTYP_DATA | - E1000_ADVTXD_DCMD_IFCS | - E1000_ADVTXD_DCMD_DEXT | - E1000_ADVTXD_DCMD_EOP | flags); + E1000_ADVTXD_DCMD_IFCS | + E1000_ADVTXD_DCMD_DEXT | + E1000_ADVTXD_DCMD_EOP | flags); + /* make sure changes to the buffer are synced */ bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_PREWRITE); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = k; /* the saved ring->cur */ - kring->nr_hwavail -= n; + kring->nr_hwcur = head; /* Set the watchdog XXX ? */ txr->queue_status = IGB_QUEUE_WORKING; txr->watchdog_time = ticks; + /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), l); + /* (re)start the tx unit up to slot nic_i (excluded) */ + E1000_WRITE_REG(&adapter->hw, E1000_TDT(txr->me), nic_i); } - if (n == 0 || kring->nr_hwavail < 1) { - int delta; - + /* + * Second part: reclaim buffers for completed transmissions. + */ + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { /* record completed transmissions using TDH */ - l = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); - if (l >= kring->nkr_num_slots) { /* XXX can it happen ? */ - D("TDH wrap %d", l); - l -= kring->nkr_num_slots; - } - delta = l - txr->next_to_clean; - if (delta) { - /* some completed, increment hwavail. */ - if (delta < 0) - delta += kring->nkr_num_slots; - txr->next_to_clean = l; - kring->nr_hwavail += delta; + nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(ring_nr)); + if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ + D("TDH wrap %d", nic_i); + nic_i -= kring->nkr_num_slots; } + txr->next_to_clean = nic_i; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; + + nm_txsync_finalize(kring); return 0; } @@ -202,101 +190,106 @@ igb_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) * Reconcile kernel and user view of the receive ring. 
*/ static int -igb_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) +igb_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; - k = ring->cur; - if (k > lim) + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + + if (head > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * import newly received packets into the netmap ring. - * j is an index in the netmap ring, l in the NIC ring. + * First part: import newly received packets. */ - l = rxr->next_to_check; - j = netmap_idx_n2k(kring, l); if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; + nic_i = rxr->next_to_check; + nm_i = netmap_idx_n2k(kring, nic_i); + for (n = 0; ; n++) { - union e1000_adv_rx_desc *curr = &rxr->rx_base[l]; + union e1000_adv_rx_desc *curr = &rxr->rx_base[nic_i]; uint32_t staterr = le32toh(curr->wb.upper.status_error); if ((staterr & E1000_RXD_STAT_DD) == 0) break; - ring->slot[j].len = le16toh(curr->wb.upper.length); - ring->slot[j].flags = slot_flags; + ring->slot[nm_i].len = le16toh(curr->wb.upper.length); + ring->slot[nm_i].flags = slot_flags; bus_dmamap_sync(rxr->ptag, - rxr->rx_buffers[l].pmap, BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + rxr->rx_buffers[nic_i].pmap, BUS_DMASYNC_POSTREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ - rxr->next_to_check = l; - kring->nr_hwavail += n; + rxr->next_to_check = nic_i; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - struct netmap_slot *slot = ring->slot + j; - union e1000_adv_rx_desc *curr = &rxr->rx_base[l]; - struct igb_rx_buf *rxbuf = rxr->rx_buffers + l; + /* + * Second part: skip past packets that userspace has released. 
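+	 * (that is, slots from kring->nr_hwcur up to but not including
+	 * head have been released by userspace; they are refilled and
+	 * handed back to the NIC)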
+ */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); - if (addr == netmap_buffer_base) { /* bad buf */ - return netmap_ring_reinit(kring); - } + union e1000_adv_rx_desc *curr = &rxr->rx_base[nic_i]; + struct igb_rx_buf *rxbuf = &rxr->rx_buffers[nic_i]; + + if (addr == netmap_buffer_base) /* bad buf */ + goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, reload map */ netmap_reload_map(rxr->ptag, rxbuf->pmap, addr); slot->flags &= ~NS_BUF_CHANGED; } - curr->read.pkt_addr = htole64(paddr); curr->wb.upper.status_error = 0; + curr->read.pkt_addr = htole64(paddr); bus_dmamap_sync(rxr->ptag, rxbuf->pmap, - BUS_DMASYNC_PREREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + BUS_DMASYNC_PREREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = head; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, - * so move l back by one unit + * so move nic_i back by one unit */ - l = (l == 0) ? lim : l - 1; - E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), l); + nic_i = nm_prev(nic_i, lim); + E1000_WRITE_REG(&adapter->hw, E1000_RDT(rxr->me), nic_i); } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; + + /* tell userspace that there might be new packets */ + nm_rxsync_finalize(kring); + return 0; + +ring_reset: + return netmap_ring_reinit(kring); } @@ -314,6 +307,8 @@ igb_netmap_attach(struct adapter *adapter) na.nm_txsync = igb_netmap_txsync; na.nm_rxsync = igb_netmap_rxsync; na.nm_register = igb_netmap_reg; - netmap_attach(&na, adapter->num_queues); -} + na.num_tx_rings = na.num_rx_rings = adapter->num_queues; + netmap_attach(&na); +} + /* end of file */ diff --git a/sys/dev/netmap/if_lem_netmap.h b/sys/dev/netmap/if_lem_netmap.h index 25e5c7c27..4fce5c988 100644 --- a/sys/dev/netmap/if_lem_netmap.h +++ b/sys/dev/netmap/if_lem_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -27,11 +27,12 @@ /* * $FreeBSD$ * - * netmap support for "lem" + * netmap support for: lem * * For details on netmap support please see ixgbe_netmap.h */ + #include #include #include @@ -40,17 +41,13 @@ /* - * Register/unregister + * Register/unregister. We are already under netmap lock. 
*/ static int -lem_netmap_reg(struct ifnet *ifp, int onoff) +lem_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - - if (na == NULL) - return EINVAL; EM_CORE_LOCK(adapter); @@ -64,24 +61,14 @@ lem_netmap_reg(struct ifnet *ifp, int onoff) taskqueue_drain(adapter->tq, &adapter->rxtx_task); taskqueue_drain(adapter->tq, &adapter->link_task); #endif /* !EM_LEGCY_IRQ */ - if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_transmit; - - lem_init_locked(adapter); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } + /* enable or disable flags and callbacks in na and ifp */ + if (onoff) { + nm_set_native_flags(na); } else { -fail: - /* return to non-netmap mode */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - lem_init_locked(adapter); /* also enable intr */ + nm_clear_native_flags(na); } + lem_init_locked(adapter); /* also enable intr */ #ifndef EM_LEGACY_IRQ taskqueue_unblock(adapter->tq); // XXX do we need this ? @@ -89,7 +76,7 @@ fail: EM_CORE_UNLOCK(adapter); - return (error); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } @@ -97,108 +84,89 @@ fail: * Reconcile kernel and user view of the transmit ring. */ static int -lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) +lem_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, k, l, n = 0, lim = kring->nkr_num_slots - 1; - + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; /* generate an interrupt approximately every half ring */ - int report_frequency = kring->nkr_num_slots >> 1; - - ND("%s: hwofs %d, hwcur %d hwavail %d lease %d cur %d avail %d", - ifp->if_xname, - kring->nkr_hwofs, kring->nr_hwcur, kring->nr_hwavail, - kring->nkr_hwlease, - ring->cur, ring->avail); - /* take a copy of ring->cur now, and never read it again */ - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); + u_int report_frequency = kring->nkr_num_slots >> 1; + + /* device-specific */ + struct adapter *adapter = ifp->if_softc; bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, BUS_DMASYNC_POSTREAD); + /* - * Process new packets to send. j is the current index in the - * netmap ring, l is the corresponding index in the NIC ring. + * First part: process new packets to send. */ - j = kring->nr_hwcur; - if (netmap_verbose > 255) - RD(5, "device %s send %d->%d", ifp->if_xname, j, k); - if (j != k) { /* we have new packets to send */ - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - /* slot is the current slot in the netmap ring */ - struct netmap_slot *slot = &ring->slot[j]; - /* curr is the current slot in the nic ring */ - struct e1000_tx_desc *curr = &adapter->tx_desc_base[l]; - struct em_buffer *txbuf = &adapter->tx_buffer_area[l]; - int flags = ((slot->flags & NS_REPORT) || - j == 0 || j == report_frequency) ? 
- E1000_TXD_CMD_RS : 0; + + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + nic_i = netmap_idx_k2n(kring, nm_i); + while (nm_i != head) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; uint64_t paddr; void *addr = PNMB(slot, &paddr); - u_int len = slot->len; - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - return netmap_ring_reinit(kring); - } - ND("slot %d NIC %d %s", j, l, nm_dump_buf(addr, len, 128, NULL)); + /* device-specific */ + struct e1000_tx_desc *curr = &adapter->tx_desc_base[nic_i]; + struct em_buffer *txbuf = &adapter->tx_buffer_area[nic_i]; + int flags = (slot->flags & NS_REPORT || + nic_i == 0 || nic_i == report_frequency) ? + E1000_TXD_CMD_RS : 0; + + NM_CHECK_ADDR_LEN(addr, len); - slot->flags &= ~NS_REPORT; - if (1 || slot->flags & NS_BUF_CHANGED) { + if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ - netmap_reload_map(adapter->txtag, txbuf->map, addr); curr->buffer_addr = htole64(paddr); - slot->flags &= ~NS_BUF_CHANGED; + netmap_reload_map(adapter->txtag, txbuf->map, addr); } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ curr->upper.data = 0; - curr->lower.data = - htole32( adapter->txd_cmd | len | + curr->lower.data = htole32(adapter->txd_cmd | len | (E1000_TXD_CMD_EOP | flags) ); - - ND("len %d kring %d nic %d", len, j, l); bus_dmamap_sync(adapter->txtag, txbuf->map, - BUS_DMASYNC_PREWRITE); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + BUS_DMASYNC_PREWRITE); + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - ND("sent %d packets from %d, TDT now %d", n, kring->nr_hwcur, l); - kring->nr_hwcur = k; /* the saved ring->cur */ - kring->nr_hwavail -= n; + kring->nr_hwcur = head; + /* synchronize the NIC ring */ bus_dmamap_sync(adapter->txdma.dma_tag, adapter->txdma.dma_map, - BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), l); + /* (re)start the tx unit up to slot nic_i (excluded) */ + E1000_WRITE_REG(&adapter->hw, E1000_TDT(0), nic_i); } - if (n == 0 || kring->nr_hwavail < 1) { - int delta; - + /* + * Second part: reclaim buffers for completed transmissions. + */ + if (ticks != kring->last_reclaim || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { + kring->last_reclaim = ticks; /* record completed transmissions using TDH */ - l = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); - ND("tdh is now %d", l); - if (l >= kring->nkr_num_slots) { /* XXX can it happen ? */ - D("bad TDH %d", l); - l -= kring->nkr_num_slots; - } - delta = l - adapter->next_tx_to_clean; - if (delta) { - /* some tx completed, increment hwavail. */ - if (delta < 0) - delta += kring->nkr_num_slots; - if (netmap_verbose > 255) - RD(5, "%s tx recover %d bufs", - ifp->if_xname, delta); - adapter->next_tx_to_clean = l; - kring->nr_hwavail += delta; + nic_i = E1000_READ_REG(&adapter->hw, E1000_TDH(0)); + if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ + D("TDH wrap %d", nic_i); + nic_i -= kring->nkr_num_slots; } + adapter->next_tx_to_clean = nic_i; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; + + nm_txsync_finalize(kring); return 0; } @@ -208,39 +176,39 @@ lem_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) * Reconcile kernel and user view of the receive ring. 
*/ static int -lem_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) +lem_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - int j, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; - if (k > lim) - return netmap_ring_reinit(kring); + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + if (head > lim) + return netmap_ring_reinit(kring); /* XXX check sync modes */ bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * Import newly received packets into the netmap ring. - * j is an index in the netmap ring, l in the NIC ring. + * First part: import newly received packets. */ - l = adapter->next_rx_desc_to_check; - j = netmap_idx_n2k(kring, l); - ND("%s: next NIC %d kring %d (ofs %d), hwcur %d hwavail %d cur %d avail %d", - ifp->if_xname, - l, j, kring->nkr_hwofs, kring->nr_hwcur, kring->nr_hwavail, - ring->cur, ring->avail); if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; + nic_i = adapter->next_rx_desc_to_check; + nm_i = netmap_idx_n2k(kring, nic_i); + for (n = 0; ; n++) { - struct e1000_rx_desc *curr = &adapter->rx_desc_base[l]; + struct e1000_rx_desc *curr = &adapter->rx_desc_base[nic_i]; uint32_t staterr = le32toh(curr->status); int len; @@ -248,76 +216,77 @@ lem_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) break; len = le16toh(curr->length) - 4; // CRC if (len < 0) { - D("bogus pkt size at %d", j); + D("bogus pkt size %d nic idx %d", len, nic_i); len = 0; } - ND("\n%s", nm_dump_buf(NMB(&ring->slot[j]), - len, 128, NULL)); - ring->slot[j].len = len; - ring->slot[j].flags = slot_flags; + ring->slot[nm_i].len = len; + ring->slot[nm_i].flags = slot_flags; bus_dmamap_sync(adapter->rxtag, - adapter->rx_buffer_area[l].map, - BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + adapter->rx_buffer_area[nic_i].map, + BUS_DMASYNC_POSTREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ - adapter->next_rx_desc_to_check = l; - kring->nr_hwavail += n; + ND("%d new packets at nic %d nm %d tail %d", + n, + adapter->next_rx_desc_to_check, + netmap_idx_n2k(kring, adapter->next_rx_desc_to_check), + kring->nr_hwtail); + adapter->next_rx_desc_to_check = nic_i; + // ifp->if_ipackets += n; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. 
*/ - l = netmap_idx_k2n(kring, j); /* NIC ring index */ - for (n = 0; j != k; n++) { - struct netmap_slot *slot = &ring->slot[j]; - struct e1000_rx_desc *curr = &adapter->rx_desc_base[l]; - struct em_buffer *rxbuf = &adapter->rx_buffer_area[l]; + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); - if (addr == netmap_buffer_base) { /* bad buf */ - return netmap_ring_reinit(kring); - } + struct e1000_rx_desc *curr = &adapter->rx_desc_base[nic_i]; + struct em_buffer *rxbuf = &adapter->rx_buffer_area[nic_i]; + + if (addr == netmap_buffer_base) /* bad buf */ + goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { /* buffer has changed, reload map */ - netmap_reload_map(adapter->rxtag, rxbuf->map, addr); curr->buffer_addr = htole64(paddr); + netmap_reload_map(adapter->rxtag, rxbuf->map, addr); slot->flags &= ~NS_BUF_CHANGED; } curr->status = 0; - bus_dmamap_sync(adapter->rxtag, rxbuf->map, BUS_DMASYNC_PREREAD); - - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = head; bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); /* * IMPORTANT: we must leave one free slot in the ring, - * so move l back by one unit + * so move nic_i back by one unit */ - l = (l == 0) ? lim : l - 1; - E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), l); + nic_i = nm_prev(nic_i, lim); + E1000_WRITE_REG(&adapter->hw, E1000_RDT(0), nic_i); } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; + + /* tell userspace that there might be new packets */ + nm_rxsync_finalize(kring); + return 0; + +ring_reset: + return netmap_ring_reinit(kring); } @@ -335,7 +304,8 @@ lem_netmap_attach(struct adapter *adapter) na.nm_txsync = lem_netmap_txsync; na.nm_rxsync = lem_netmap_rxsync; na.nm_register = lem_netmap_reg; - netmap_attach(&na, 1); + na.num_tx_rings = na.num_rx_rings = 1; + netmap_attach(&na); } /* end of file */ diff --git a/sys/dev/netmap/if_re_netmap.h b/sys/dev/netmap/if_re_netmap.h index ac781ccb5..10abe4f49 100644 --- a/sys/dev/netmap/if_re_netmap.h +++ b/sys/dev/netmap/if_re_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,8 +26,9 @@ /* * $FreeBSD$ * - * netmap support for "re" - * For details on netmap support please see ixgbe_netmap.h + * netmap support for: re + * + * For more details on netmap support please see ixgbe_netmap.h */ @@ -39,44 +40,24 @@ /* - * support for netmap register/unregisted. We are already under core lock. - * only called on the first register or the last unregister. + * Register/unregister. We are already under netmap lock. 
*/ static int -re_netmap_reg(struct ifnet *ifp, int onoff) +re_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct rl_softc *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - - if (na == NULL) - return EINVAL; - /* Tell the stack that the interface is no longer active */ - ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); - - re_stop(adapter); + RL_LOCK(adapter); + re_stop(adapter); /* also clears IFF_DRV_RUNNING */ if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - - /* save if_transmit to restore it later */ - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_transmit; - - re_init_locked(adapter); - - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } + nm_set_native_flags(na); } else { -fail: - /* restore if_transmit */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - re_init_locked(adapter); /* also enables intr */ + nm_clear_native_flags(na); } - return (error); + re_init_locked(adapter); /* also enables intr */ + RL_UNLOCK(adapter); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } @@ -84,90 +65,102 @@ fail: * Reconcile kernel and user view of the transmit ring. */ static int -re_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) +re_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct rl_softc *sc = ifp->if_softc; - struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc; - struct netmap_adapter *na = NA(sc->rl_ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - int j, k, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); + /* device-specific */ + struct rl_softc *sc = ifp->if_softc; + struct rl_txdesc *txd = sc->rl_ldata.rl_tx_desc; - /* Sync the TX descriptor list */ bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, - sc->rl_ldata.rl_tx_list_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); - - /* XXX move after the transmissions */ - /* record completed transmissions */ - for (n = 0, l = sc->rl_ldata.rl_tx_considx; - l != sc->rl_ldata.rl_tx_prodidx; - n++, l = RL_TX_DESC_NXT(sc, l)) { - uint32_t cmdstat = - le32toh(sc->rl_ldata.rl_tx_list[l].rl_cmdstat); - if (cmdstat & RL_TDESC_STAT_OWN) - break; - } - if (n > 0) { - sc->rl_ldata.rl_tx_considx = l; - sc->rl_ldata.rl_tx_free += n; - kring->nr_hwavail += n; - } + sc->rl_ldata.rl_tx_list_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); // XXX extra postwrite ? - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; + /* + * First part: process new packets to send. 
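+	 * (slots from kring->nr_hwcur up to but not including kring->rhead,
+	 * filled by userspace since the last sync)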
+ */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + nic_i = sc->rl_ldata.rl_tx_prodidx; + // XXX or netmap_idx_k2n(kring, nm_i); + + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; + uint64_t paddr; + void *addr = PNMB(slot, &paddr); - j = kring->nr_hwcur; - if (j != k) { /* we have new packets to send */ - l = sc->rl_ldata.rl_tx_prodidx; - for (n = 0; j != k; n++) { - struct netmap_slot *slot = &ring->slot[j]; - struct rl_desc *desc = &sc->rl_ldata.rl_tx_list[l]; + /* device-specific */ + struct rl_desc *desc = &sc->rl_ldata.rl_tx_list[nic_i]; int cmd = slot->len | RL_TDESC_CMD_EOF | RL_TDESC_CMD_OWN | RL_TDESC_CMD_SOF ; - uint64_t paddr; - void *addr = PNMB(slot, &paddr); - int len = slot->len; - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { - // XXX what about prodidx ? - return netmap_ring_reinit(kring); - } + NM_CHECK_ADDR_LEN(addr, len); - if (l == lim) /* mark end of ring */ + if (nic_i == lim) /* mark end of ring */ cmd |= RL_TDESC_CMD_EOR; if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, reload map */ desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); - /* buffer has changed, unload and reload map */ netmap_reload_map(sc->rl_ldata.rl_tx_mtag, - txd[l].tx_dmamap, addr); - slot->flags &= ~NS_BUF_CHANGED; + txd[nic_i].tx_dmamap, addr); } - slot->flags &= ~NS_REPORT; + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ desc->rl_cmdstat = htole32(cmd); + + /* make sure changes to the buffer are synced */ bus_dmamap_sync(sc->rl_ldata.rl_tx_mtag, - txd[l].tx_dmamap, BUS_DMASYNC_PREWRITE); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + txd[nic_i].tx_dmamap, + BUS_DMASYNC_PREWRITE); + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - sc->rl_ldata.rl_tx_prodidx = l; - kring->nr_hwcur = k; /* the saved ring->cur */ - ring->avail -= n; // XXX see others - kring->nr_hwavail = ring->avail; + sc->rl_ldata.rl_tx_prodidx = nic_i; + kring->nr_hwcur = head; + /* synchronize the NIC ring */ bus_dmamap_sync(sc->rl_ldata.rl_tx_list_tag, - sc->rl_ldata.rl_tx_list_map, - BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); + sc->rl_ldata.rl_tx_list_map, + BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE); /* start ? */ CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); } + + /* + * Second part: reclaim buffers for completed transmissions. + */ + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { + nic_i = sc->rl_ldata.rl_tx_considx; + for (n = 0; nic_i != sc->rl_ldata.rl_tx_prodidx; + n++, nic_i = RL_TX_DESC_NXT(sc, nic_i)) { + uint32_t cmdstat = + le32toh(sc->rl_ldata.rl_tx_list[nic_i].rl_cmdstat); + if (cmdstat & RL_TDESC_STAT_OWN) + break; + } + if (n > 0) { + sc->rl_ldata.rl_tx_considx = nic_i; + sc->rl_ldata.rl_tx_free += n; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); + } + } + + nm_txsync_finalize(kring); + return 0; } @@ -176,42 +169,46 @@ re_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) * Reconcile kernel and user view of the receive ring. 
*/ static int -re_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) +re_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct rl_softc *sc = ifp->if_softc; - struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc; - struct netmap_adapter *na = NA(sc->rl_ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - int j, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; - k = ring->cur; - if (k > lim) + /* device-specific */ + struct rl_softc *sc = ifp->if_softc; + struct rl_rxdesc *rxd = sc->rl_ldata.rl_rx_desc; + + if (head > lim) return netmap_ring_reinit(kring); - /* XXX check sync modes */ bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, - sc->rl_ldata.rl_rx_list_map, - BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + sc->rl_ldata.rl_rx_list_map, + BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * Import newly received packets into the netmap ring. - * j is an index in the netmap ring, l in the NIC ring. + * First part: import newly received packets. * - * The device uses all the buffers in the ring, so we need + * This device uses all the buffers in the ring, so we need * another termination condition in addition to RL_RDESC_STAT_OWN - * cleared (all buffers could have it cleared. The easiest one - * is to limit the amount of data reported up to 'lim' + * cleared (all buffers could have it cleared). The easiest one + * is to stop right before nm_hwcur. */ - l = sc->rl_ldata.rl_rx_prodidx; /* next pkt to check */ - j = netmap_idx_n2k(kring, l); /* the kring index */ if (netmap_no_pendintr || force_update) { uint16_t slot_flags = kring->nkr_slot_flags; + uint32_t stop_i = nm_prev(kring->nr_hwcur, lim); - for (n = kring->nr_hwavail; n < lim ; n++) { - struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[l]; + nic_i = sc->rl_ldata.rl_rx_prodidx; /* next pkt to check */ + nm_i = netmap_idx_n2k(kring, nic_i); + + while (nm_i != stop_i) { + struct rl_desc *cur_rx = &sc->rl_ldata.rl_rx_list[nic_i]; uint32_t rxstat = le32toh(cur_rx->rl_cmdstat); uint32_t total_len; @@ -220,74 +217,72 @@ re_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) total_len = rxstat & sc->rl_rxlenmask; /* XXX subtract crc */ total_len = (total_len < 4) ? 0 : total_len - 4; - kring->ring->slot[j].len = total_len; - kring->ring->slot[j].flags = slot_flags; + ring->slot[nm_i].len = total_len; + ring->slot[nm_i].flags = slot_flags; /* sync was in re_newbuf() */ bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, - rxd[l].rx_dmamap, BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 
0 : l + 1; - } - if (n != kring->nr_hwavail) { - sc->rl_ldata.rl_rx_prodidx = l; - sc->rl_ifp->if_ipackets += n - kring->nr_hwavail; - kring->nr_hwavail = n; + rxd[nic_i].rx_dmamap, BUS_DMASYNC_POSTREAD); + // sc->rl_ifp->if_ipackets++; + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } + sc->rl_ldata.rl_rx_prodidx = nic_i; + kring->nr_hwtail = nm_i; kring->nr_kflags &= ~NKR_PENDINTR; } - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); /* the NIC index */ - for (n = 0; j != k; n++) { - struct netmap_slot *slot = ring->slot + j; - struct rl_desc *desc = &sc->rl_ldata.rl_rx_list[l]; - int cmd = NETMAP_BUF_SIZE | RL_RDESC_CMD_OWN; + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); - if (addr == netmap_buffer_base) { /* bad buf */ - return netmap_ring_reinit(kring); - } + struct rl_desc *desc = &sc->rl_ldata.rl_rx_list[nic_i]; + int cmd = NETMAP_BUF_SIZE | RL_RDESC_CMD_OWN; + + if (addr == netmap_buffer_base) /* bad buf */ + goto ring_reset; - if (l == lim) /* mark end of ring */ + if (nic_i == lim) /* mark end of ring */ cmd |= RL_RDESC_CMD_EOR; - slot->flags &= ~NS_REPORT; if (slot->flags & NS_BUF_CHANGED) { - netmap_reload_map(sc->rl_ldata.rl_rx_mtag, - rxd[l].rx_dmamap, addr); + /* buffer has changed, reload map */ desc->rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); desc->rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + netmap_reload_map(sc->rl_ldata.rl_rx_mtag, + rxd[nic_i].rx_dmamap, addr); slot->flags &= ~NS_BUF_CHANGED; } desc->rl_cmdstat = htole32(cmd); bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, - rxd[l].rx_dmamap, BUS_DMASYNC_PREREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + rxd[nic_i].rx_dmamap, + BUS_DMASYNC_PREREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; - /* Flush the RX DMA ring */ + kring->nr_hwcur = head; bus_dmamap_sync(sc->rl_ldata.rl_rx_list_tag, sc->rl_ldata.rl_rx_list_map, - BUS_DMASYNC_PREWRITE|BUS_DMASYNC_PREREAD); + BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; + + /* tell userspace that there might be new packets */ + nm_rxsync_finalize(kring); + return 0; + +ring_reset: + return netmap_ring_reinit(kring); } + /* * Additional routines to init the tx and rx rings. * In other drivers we do that inline in the main code. 
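The converted drivers above and below all rely on the same small ring helpers (nm_next, nm_prev, netmap_idx_k2n) whose definitions are not part of this patch. The sketch below gives plausible minimal equivalents, inferred from the open-coded expressions they replace in the old driver code; the sketch_*() names mark them as illustrative approximations, not the actual netmap definitions (u_int is the usual <sys/types.h> typedef used by these drivers).

/* advance a circular index; lim == num_slots - 1 */
static inline u_int
sketch_nm_next(u_int i, u_int lim)
{
	return (i == lim) ? 0 : i + 1;	/* replaces: j = (j == lim) ? 0 : j + 1 */
}

/* step a circular index back; used e.g. to keep one free slot before writing RDT */
static inline u_int
sketch_nm_prev(u_int i, u_int lim)
{
	return (i == 0) ? lim : i - 1;	/* replaces: l = (l == 0) ? lim : l - 1 */
}

/*
 * Map a netmap ring index to the NIC ring index.  The drivers keep the
 * invariant nm_i == (nic_i + nkr_hwofs) % num_slots, so the inverse
 * mapping subtracts the offset and wraps; nkr_hwofs may be negative
 * in some drivers, hence both bounds are checked.
 */
static inline u_int
sketch_idx_k2n(u_int nm_i, int hwofs, u_int num_slots)
{
	int nic_i = (int)nm_i - hwofs;

	if (nic_i < 0)
		nic_i += num_slots;
	else if (nic_i >= (int)num_slots)
		nic_i -= num_slots;
	return (u_int)nic_i;
}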
@@ -299,11 +294,16 @@ re_netmap_tx_init(struct rl_softc *sc) struct rl_desc *desc; int i, n; struct netmap_adapter *na = NA(sc->rl_ifp); - struct netmap_slot *slot = netmap_reset(na, NR_TX, 0, 0); + struct netmap_slot *slot; + if (!na || !(na->na_flags & NAF_NATIVE_ON)) { + return; + } + + slot = netmap_reset(na, NR_TX, 0, 0); /* slot is NULL if we are not in netmap mode */ if (!slot) - return; + return; // XXX cannot happen /* in netmap mode, overwrite addresses and maps */ txd = sc->rl_ldata.rl_tx_desc; desc = sc->rl_ldata.rl_tx_list; @@ -329,36 +329,35 @@ re_netmap_rx_init(struct rl_softc *sc) struct netmap_slot *slot = netmap_reset(na, NR_RX, 0, 0); struct rl_desc *desc = sc->rl_ldata.rl_rx_list; uint32_t cmdstat; - int i, n, max_avail; + uint32_t nic_i, max_avail; + uint32_t const n = sc->rl_ldata.rl_rx_desc_cnt; if (!slot) return; - n = sc->rl_ldata.rl_rx_desc_cnt; /* - * Userspace owned hwavail packets before the reset, - * so the NIC that last hwavail descriptors of the ring - * are still owned by the driver (and keep one empty). + * Do not release the slots owned by userspace, + * and also keep one empty. */ - max_avail = n - 1 - na->rx_rings[0].nr_hwavail; - for (i = 0; i < n; i++) { + max_avail = n - 1 - nm_kr_rxspace(&na->rx_rings[0]); + for (nic_i = 0; nic_i < n; nic_i++) { void *addr; uint64_t paddr; - int l = netmap_idx_n2k(&na->rx_rings[0], i); + uint32_t nm_i = netmap_idx_n2k(&na->rx_rings[0], nic_i); - addr = PNMB(slot + l, &paddr); + addr = PNMB(slot + nm_i, &paddr); netmap_reload_map(sc->rl_ldata.rl_rx_mtag, - sc->rl_ldata.rl_rx_desc[i].rx_dmamap, addr); + sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, addr); bus_dmamap_sync(sc->rl_ldata.rl_rx_mtag, - sc->rl_ldata.rl_rx_desc[i].rx_dmamap, BUS_DMASYNC_PREREAD); - desc[i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); - desc[i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); + sc->rl_ldata.rl_rx_desc[nic_i].rx_dmamap, BUS_DMASYNC_PREREAD); + desc[nic_i].rl_bufaddr_lo = htole32(RL_ADDR_LO(paddr)); + desc[nic_i].rl_bufaddr_hi = htole32(RL_ADDR_HI(paddr)); cmdstat = NETMAP_BUF_SIZE; - if (i == n - 1) /* mark the end of ring */ + if (nic_i == n - 1) /* mark the end of ring */ cmdstat |= RL_RDESC_CMD_EOR; - if (i < max_avail) + if (nic_i < max_avail) cmdstat |= RL_RDESC_CMD_OWN; - desc[i].rl_cmdstat = htole32(cmdstat); + desc[nic_i].rl_cmdstat = htole32(cmdstat); } } @@ -377,6 +376,8 @@ re_netmap_attach(struct rl_softc *sc) na.nm_txsync = re_netmap_txsync; na.nm_rxsync = re_netmap_rxsync; na.nm_register = re_netmap_reg; - netmap_attach(&na, 1); + na.num_tx_rings = na.num_rx_rings = 1; + netmap_attach(&na); } + /* end of file */ diff --git a/sys/dev/netmap/ixgbe_netmap.h b/sys/dev/netmap/ixgbe_netmap.h index fca1cf1e0..a617cc4c2 100644 --- a/sys/dev/netmap/ixgbe_netmap.h +++ b/sys/dev/netmap/ixgbe_netmap.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -26,16 +26,16 @@ /* * $FreeBSD$ * - * netmap modifications for ixgbe + * netmap support for: ixgbe * * This file is meant to be a reference on how to implement * netmap support for a network driver. - * This file contains code but only static or inline functions - * that are used by a single driver. To avoid replication of - * code we just #include it near the beginning of the - * standard driver. 
+ * This file contains code but only static or inline functions used + * by a single driver. To avoid replication of code we just #include + * it near the beginning of the standard driver. */ + #include #include /* @@ -48,7 +48,10 @@ */ #include + /* + * device-specific sysctl variables: + * * ix_crcstrip: 0: keep CRC in rx frames (default), 1: strip it. * During regular operations the CRC is stripped, but on some * hardware reception of frames not multiple of 64 is slower, @@ -56,17 +59,11 @@ * * ix_rx_miss, ix_rx_miss_bufs: * count packets that might be missed due to lost interrupts. - * - * ix_use_dd - * use the dd bit for completed tx transmissions. - * This is tricky, much better to use TDH for now. */ SYSCTL_DECL(_dev_netmap); -static int ix_rx_miss, ix_rx_miss_bufs, ix_use_dd, ix_crcstrip; +static int ix_rx_miss, ix_rx_miss_bufs, ix_crcstrip; SYSCTL_INT(_dev_netmap, OID_AUTO, ix_crcstrip, CTLFLAG_RW, &ix_crcstrip, 0, "strip CRC on rx frames"); -SYSCTL_INT(_dev_netmap, OID_AUTO, ix_use_dd, - CTLFLAG_RW, &ix_use_dd, 0, "use dd instead of tdh to detect tx frames"); SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss, CTLFLAG_RW, &ix_rx_miss, 0, "potentially missed rx intr"); SYSCTL_INT(_dev_netmap, OID_AUTO, ix_rx_miss_bufs, @@ -110,283 +107,221 @@ set_crcstrip(struct ixgbe_hw *hw, int onoff) IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rxc); } + /* - * Register/unregister. We are already under core lock. + * Register/unregister. We are already under netmap lock. * Only called on the first register or the last unregister. */ static int -ixgbe_netmap_reg(struct ifnet *ifp, int onoff) +ixgbe_netmap_reg(struct netmap_adapter *na, int onoff) { + struct ifnet *ifp = na->ifp; struct adapter *adapter = ifp->if_softc; - struct netmap_adapter *na = NA(ifp); - int error = 0; - - if (na == NULL) - return EINVAL; /* no netmap support here */ IXGBE_CORE_LOCK(adapter); - ixgbe_disable_intr(adapter); + ixgbe_disable_intr(adapter); // XXX maybe ixgbe_stop ? /* Tell the stack that the interface is no longer active */ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); set_crcstrip(&adapter->hw, onoff); - if (onoff) { /* enable netmap mode */ - ifp->if_capenable |= IFCAP_NETMAP; - - /* save if_transmit and replace with our routine */ - na->if_transmit = ifp->if_transmit; - ifp->if_transmit = netmap_transmit; - - /* - * reinitialize the adapter, now with netmap flag set, - * so the rings will be set accordingly. - */ - ixgbe_init_locked(adapter); - if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) == 0) { - error = ENOMEM; - goto fail; - } - } else { /* reset normal mode (explicit request or netmap failed) */ -fail: - /* restore if_transmit */ - ifp->if_transmit = na->if_transmit; - ifp->if_capenable &= ~IFCAP_NETMAP; - /* initialize the card, this time in standard mode */ - ixgbe_init_locked(adapter); /* also enables intr */ + /* enable or disable flags and callbacks in na and ifp */ + if (onoff) { + nm_set_native_flags(na); + } else { + nm_clear_native_flags(na); } - set_crcstrip(&adapter->hw, onoff); + ixgbe_init_locked(adapter); /* also enables intr */ + set_crcstrip(&adapter->hw, onoff); // XXX why twice ? IXGBE_CORE_UNLOCK(adapter); - return (error); + return (ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1); } /* * Reconcile kernel and user view of the transmit ring. - * This routine might be called frequently so it must be efficient. - * - * ring->cur holds the userspace view of the current ring index. 
Userspace - * has filled the tx slots from the previous call's ring->cur up to but not - * including ring->cur for this call. In this function the kernel updates - * kring->nr_hwcur to ring->cur, thus slots [kring->nr_hwcur, ring->cur) are - * now ready to transmit. At the last interrupt kring->nr_hwavail slots were - * available. - * - * This function runs under lock (acquired from the caller or internally). - * It must first update ring->avail to what the kernel knows, - * subtract the newly used slots (ring->cur - kring->nr_hwcur) - * from both avail and nr_hwavail, and set ring->nr_hwcur = ring->cur - * issuing a dmamap_sync on all slots. * - * Since ring comes from userspace, its content must be read only once, - * and validated before being used to update the kernel's structures. - * (this is also true for every use of ring in the kernel). + * All information is in the kring. + * Userspace wants to send packets up to the one before kring->rhead, + * kernel knows kring->nr_hwcur is the first unsent packet. * - * ring->avail is never used, only checked for bogus values. + * Here we push packets out (as many as possible), and possibly + * reclaim buffers from previously completed transmission. * - * I flags & FORCE_RECLAIM, reclaim transmitted - * buffers irrespective of interrupt mitigation. + * The caller (netmap) guarantees that there is only one instance + * running at any time. Any interference with other driver + * methods should be handled by the individual drivers. */ static int -ixgbe_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) +ixgbe_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct tx_ring *txr = &adapter->tx_rings[ring_nr]; - struct netmap_adapter *na = NA(adapter->ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->tx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, l, n = 0; - u_int const k = ring->cur, lim = kring->nkr_num_slots - 1; - + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; /* - * ixgbe can generate an interrupt on every tx packet, but it - * seems very expensive, so we interrupt once every half ring, - * or when requested with NS_REPORT + * interrupts on every tx packet are expensive so request + * them every half ring, or where NS_REPORT is set */ u_int report_frequency = kring->nkr_num_slots >> 1; - if (k > lim) - return netmap_ring_reinit(kring); + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct tx_ring *txr = &adapter->tx_rings[ring_nr]; + int reclaim_tx; bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_POSTREAD); /* - * Process new packets to send. j is the current index in the - * netmap ring, l is the corresponding index in the NIC ring. + * First part: process new packets to send. + * nm_i is the current index in the netmap ring, + * nic_i is the corresponding index in the NIC ring. * The two numbers differ because upon a *_init() we reset * the NIC ring but leave the netmap ring unchanged. 
* For the transmit ring, we have * - * j = kring->nr_hwcur - * l = IXGBE_TDT (not tracked in the driver) + * nm_i = kring->nr_hwcur + * nic_i = IXGBE_TDT (not tracked in the driver) * and - * j == (l + kring->nkr_hwofs) % ring_size + * nm_i == (nic_i + kring->nkr_hwofs) % ring_size * * In this driver kring->nkr_hwofs >= 0, but for other * drivers it might be negative as well. */ - j = kring->nr_hwcur; - if (j != k) { /* we have new packets to send */ - prefetch(&ring->slot[j]); - l = netmap_idx_k2n(kring, j); /* NIC index */ - prefetch(&txr->tx_buffers[l]); - for (n = 0; j != k; n++) { - /* - * Collect per-slot info. - * Note that txbuf and curr are indexed by l. - * - * In this driver we collect the buffer address - * (using the PNMB() macro) because we always - * need to rewrite it into the NIC ring. - * Many other drivers preserve the address, so - * we only need to access it if NS_BUF_CHANGED - * is set. - * XXX note, on this device the dmamap* calls are - * not necessary because tag is 0, however just accessing - * the per-packet tag kills 1Mpps at 900 MHz. - */ - struct netmap_slot *slot = &ring->slot[j]; - union ixgbe_adv_tx_desc *curr = &txr->tx_base[l]; - struct ixgbe_tx_buf *txbuf = &txr->tx_buffers[l]; - uint64_t paddr; - // XXX type for flags and len ? - int flags = ((slot->flags & NS_REPORT) || - j == 0 || j == report_frequency) ? - IXGBE_TXD_CMD_RS : 0; + + /* + * If we have packets to send (kring->nr_hwcur != kring->rhead) + * iterate over the netmap ring, fetch length and update + * the corresponding slot in the NIC ring. Some drivers also + * need to update the buffer's physical address in the NIC slot + * even NS_BUF_CHANGED is not set (PNMB computes the addresses). + * + * The netmap_reload_map() calls is especially expensive, + * even when (as in this case) the tag is 0, so do only + * when the buffer has actually changed. + * + * If possible do not set the report/intr bit on all slots, + * but only a few times per ring or when NS_REPORT is set. + * + * Finally, on 10G and faster drivers, it might be useful + * to prefetch the next slot and txr entry. + */ + + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + nic_i = netmap_idx_k2n(kring, nm_i); + + __builtin_prefetch(&ring->slot[nm_i]); + __builtin_prefetch(&txr->tx_buffers[nic_i]); + + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; u_int len = slot->len; + uint64_t paddr; void *addr = PNMB(slot, &paddr); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; - prefetch(&ring->slot[j]); - prefetch(&txr->tx_buffers[l]); - - /* - * Quick check for valid addr and len. - * NMB() returns netmap_buffer_base for invalid - * buffer indexes (but the address is still a - * valid one to be used in a ring). slot->len is - * unsigned so no need to check for negative values. - */ - if (addr == netmap_buffer_base || len > NETMAP_BUF_SIZE) { -ring_reset: - return netmap_ring_reinit(kring); - } + /* device-specific */ + union ixgbe_adv_tx_desc *curr = &txr->tx_base[nic_i]; + struct ixgbe_tx_buf *txbuf = &txr->tx_buffers[nic_i]; + int flags = (slot->flags & NS_REPORT || + nic_i == 0 || nic_i == report_frequency) ? 
+ IXGBE_TXD_CMD_RS : 0; + + /* prefetch for next round */ + __builtin_prefetch(&ring->slot[nm_i + 1]); + __builtin_prefetch(&txr->tx_buffers[nic_i + 1]); + + NM_CHECK_ADDR_LEN(addr, len); if (slot->flags & NS_BUF_CHANGED) { - /* buffer has changed, unload and reload map */ + /* buffer has changed, reload map */ netmap_reload_map(txr->txtag, txbuf->map, addr); - slot->flags &= ~NS_BUF_CHANGED; } - slot->flags &= ~NS_REPORT; - /* - * Fill the slot in the NIC ring. - * In this driver we need to rewrite the buffer - * address in the NIC ring. Other drivers do not - * need this. - * Use legacy descriptor, it is faster. - */ + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + + /* Fill the slot in the NIC ring. */ + /* Use legacy descriptor, they are faster? */ curr->read.buffer_addr = htole64(paddr); curr->read.olinfo_status = 0; curr->read.cmd_type_len = htole32(len | flags | IXGBE_ADVTXD_DCMD_IFCS | IXGBE_TXD_CMD_EOP); /* make sure changes to the buffer are synced */ - bus_dmamap_sync(txr->txtag, txbuf->map, BUS_DMASYNC_PREWRITE); + bus_dmamap_sync(txr->txtag, txbuf->map, + BUS_DMASYNC_PREWRITE); + + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwcur = k; /* the saved ring->cur */ - /* decrease avail by number of packets sent */ - kring->nr_hwavail -= n; + kring->nr_hwcur = head; /* synchronize the NIC ring */ bus_dmamap_sync(txr->txdma.dma_tag, txr->txdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - /* (re)start the transmitter up to slot l (excluded) */ - IXGBE_WRITE_REG(&adapter->hw, IXGBE_TDT(txr->me), l); + + /* (re)start the tx unit up to slot nic_i (excluded) */ + IXGBE_WRITE_REG(&adapter->hw, IXGBE_TDT(txr->me), nic_i); } /* - * Reclaim buffers for completed transmissions. + * Second part: reclaim buffers for completed transmissions. * Because this is expensive (we read a NIC register etc.) * we only do it in specific cases (see below). - * In all cases kring->nr_kflags indicates which slot will be - * checked upon a tx interrupt (nkr_num_slots means none). */ if (flags & NAF_FORCE_RECLAIM) { - j = 1; /* forced reclaim, ignore interrupts */ - kring->nr_kflags = kring->nkr_num_slots; - } else if (kring->nr_hwavail > 0) { - j = 0; /* buffers still available: no reclaim, ignore intr. */ - kring->nr_kflags = kring->nkr_num_slots; + reclaim_tx = 1; /* forced reclaim */ + } else if (!nm_kr_txempty(kring)) { + reclaim_tx = 0; /* have buffers, no reclaim */ } else { /* - * no buffers available, locate a slot for which we request - * ReportStatus (approximately half ring after next_to_clean) - * and record it in kring->nr_kflags. - * If the slot has DD set, do the reclaim looking at TDH, - * otherwise we go to sleep (in netmap_poll()) and will be - * woken up when slot nr_kflags will be ready. + * No buffers available. Locate previous slot with + * REPORT_STATUS set. + * If the slot has DD set, we can reclaim space, + * otherwise wait for the next interrupt. + * This enables interrupt moderation on the tx + * side though it might reduce throughput. */ struct ixgbe_legacy_tx_desc *txd = (struct ixgbe_legacy_tx_desc *)txr->tx_base; - j = txr->next_to_clean + kring->nkr_num_slots/2; - if (j >= kring->nkr_num_slots) - j -= kring->nkr_num_slots; + nic_i = txr->next_to_clean + report_frequency; + if (nic_i > lim) + nic_i -= lim + 1; // round to the closest with dd set - j= (j < kring->nkr_num_slots / 4 || j >= kring->nkr_num_slots*3/4) ? + nic_i = (nic_i < kring->nkr_num_slots / 4 || + nic_i >= kring->nkr_num_slots*3/4) ? 
0 : report_frequency; - kring->nr_kflags = j; /* the slot to check */ - j = txd[j].upper.fields.status & IXGBE_TXD_STAT_DD; // XXX cpu_to_le32 ? + reclaim_tx = txd[nic_i].upper.fields.status & IXGBE_TXD_STAT_DD; // XXX cpu_to_le32 ? } - if (j) { - int delta; - + if (reclaim_tx) { /* * Record completed transmissions. * We (re)use the driver's txr->next_to_clean to keep * track of the most recently completed transmission. * - * The datasheet discourages the use of TDH to find out the - * number of sent packets. We should rather check the DD - * status bit in a packet descriptor. However, we only set - * the "report status" bit for some descriptors (a kind of - * interrupt mitigation), so we can only check on those. - * For the time being we use TDH, as we do it infrequently - * enough not to pose performance problems. + * The datasheet discourages the use of TDH to find + * out the number of sent packets, but we only set + * REPORT_STATUS in a few slots so TDH is the only + * good way. */ - if (ix_use_dd) { - struct ixgbe_legacy_tx_desc *txd = - (struct ixgbe_legacy_tx_desc *)txr->tx_base; - u_int k1 = netmap_idx_k2n(kring, kring->nr_hwcur); - l = txr->next_to_clean; - delta = 0; - while (l != k1 && - txd[l].upper.fields.status & IXGBE_TXD_STAT_DD) { - delta++; - l = (l == lim) ? 0 : l + 1; - } - } else { - l = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(ring_nr)); - if (l >= kring->nkr_num_slots) { /* XXX can happen */ - D("TDH wrap %d", l); - l -= kring->nkr_num_slots; + nic_i = IXGBE_READ_REG(&adapter->hw, IXGBE_TDH(ring_nr)); + if (nic_i >= kring->nkr_num_slots) { /* XXX can it happen ? */ + D("TDH wrap %d", nic_i); + nic_i -= kring->nkr_num_slots; } - delta = l - txr->next_to_clean; - } - if (delta) { + if (nic_i != txr->next_to_clean) { /* some tx completed, increment avail */ - if (delta < 0) - delta += kring->nkr_num_slots; - txr->next_to_clean = l; - kring->nr_hwavail += delta; - if (kring->nr_hwavail > lim) - goto ring_reset; + txr->next_to_clean = nic_i; + kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim); } } - /* update avail to what the kernel knows */ - ring->avail = kring->nr_hwavail; + + nm_txsync_finalize(kring); return 0; } @@ -394,34 +329,35 @@ ring_reset: /* * Reconcile kernel and user view of the receive ring. - * Same as for the txsync, this routine must be efficient and - * avoid races in accessing the shared regions. - * - * When called, userspace has read data from slots kring->nr_hwcur - * up to ring->cur (excluded). + * Same as for the txsync, this routine must be efficient. + * The caller guarantees a single invocations, but races against + * the rest of the driver should be handled here. * - * The last interrupt reported kring->nr_hwavail slots available - * after kring->nr_hwcur. - * We must subtract the newly consumed slots (cur - nr_hwcur) - * from nr_hwavail, make the descriptors available for the next reads, - * and set kring->nr_hwcur = ring->cur and ring->avail = kring->nr_hwavail. + * On call, kring->rhead is the first packet that userspace wants + * to keep, and kring->rcur is the wakeup point. + * The kernel has previously reported packets up to kring->rtail. * * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective * of whether or not we received an interrupt. 
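+ *
+ * As a concrete example: with 512 slots, nr_hwcur == 10 and
+ * rhead == 14, slots 10..13 have already been consumed by userspace;
+ * the "second part" below refills them, hands them back to the NIC
+ * and advances nr_hwcur to 14, while the "first part" may have
+ * extended nr_hwtail with newly completed receptions.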
*/ static int -ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) +ixgbe_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) { - struct adapter *adapter = ifp->if_softc; - struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; - struct netmap_adapter *na = NA(adapter->ifp); + struct ifnet *ifp = na->ifp; struct netmap_kring *kring = &na->rx_rings[ring_nr]; struct netmap_ring *ring = kring->ring; - u_int j, l, n, lim = kring->nkr_num_slots - 1; + u_int nm_i; /* index into the netmap ring */ + u_int nic_i; /* index into the NIC ring */ + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; - u_int k = ring->cur, resvd = ring->reserved; - if (k > lim) + /* device-specific */ + struct adapter *adapter = ifp->if_softc; + struct rx_ring *rxr = &adapter->rx_rings[ring_nr]; + + if (head > lim) return netmap_ring_reinit(kring); /* XXX check sync modes */ @@ -429,17 +365,17 @@ ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); /* - * First part, import newly received packets into the netmap ring. + * First part: import newly received packets. * - * j is the index of the next free slot in the netmap ring, - * and l is the index of the next received packet in the NIC ring, + * nm_i is the index of the next free slot in the netmap ring, + * nic_i is the index of the next received packet in the NIC ring, * and they may differ in case if_init() has been called while * in netmap mode. For the receive ring we have * - * j = (kring->nr_hwcur + kring->nr_hwavail) % ring_size - * l = rxr->next_to_check; + * nic_i = rxr->next_to_check; + * nm_i = kring->nr_hwtail (previous) * and - * j == (l + kring->nkr_hwofs) % ring_size + * nm_i == (nic_i + kring->nkr_hwofs) % ring_size * * rxr->next_to_check is set to 0 on a ring reinit */ @@ -447,21 +383,21 @@ ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) int crclen = ix_crcstrip ? 0 : 4; uint16_t slot_flags = kring->nkr_slot_flags; - l = rxr->next_to_check; - j = netmap_idx_n2k(kring, l); + nic_i = rxr->next_to_check; // or also k2n(kring->nr_hwtail) + nm_i = netmap_idx_n2k(kring, nic_i); for (n = 0; ; n++) { - union ixgbe_adv_rx_desc *curr = &rxr->rx_base[l]; + union ixgbe_adv_rx_desc *curr = &rxr->rx_base[nic_i]; uint32_t staterr = le32toh(curr->wb.upper.status_error); if ((staterr & IXGBE_RXD_STAT_DD) == 0) break; - ring->slot[j].len = le16toh(curr->wb.upper.length) - crclen; - ring->slot[j].flags = slot_flags; + ring->slot[nm_i].len = le16toh(curr->wb.upper.length) - crclen; + ring->slot[nm_i].flags = slot_flags; bus_dmamap_sync(rxr->ptag, - rxr->rx_buffers[l].pmap, BUS_DMASYNC_POSTREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + rxr->rx_buffers[nic_i].pmap, BUS_DMASYNC_POSTREAD); + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } if (n) { /* update the state variables */ if (netmap_no_pendintr && !force_update) { @@ -469,48 +405,36 @@ ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) ix_rx_miss ++; ix_rx_miss_bufs += n; } - rxr->next_to_check = l; - kring->nr_hwavail += n; + rxr->next_to_check = nic_i; + kring->nr_hwtail = nm_i; } kring->nr_kflags &= ~NKR_PENDINTR; } /* - * Skip past packets that userspace has released - * (from kring->nr_hwcur to ring->cur - ring->reserved excluded), + * Second part: skip past packets that userspace has released. 
+ * (kring->nr_hwcur to kring->rhead excluded), * and make the buffers available for reception. - * As usual j is the index in the netmap ring, l is the index - * in the NIC ring, and j == (l + kring->nkr_hwofs) % ring_size + * As usual nm_i is the index in the netmap ring, + * nic_i is the index in the NIC ring, and + * nm_i == (nic_i + kring->nkr_hwofs) % ring_size */ - j = kring->nr_hwcur; - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - if (j != k) { /* userspace has released some packets. */ - l = netmap_idx_k2n(kring, j); - for (n = 0; j != k; n++) { - /* collect per-slot info, with similar validations - * and flag handling as in the txsync code. - * - * NOTE curr and rxbuf are indexed by l. - * Also, this driver needs to update the physical - * address in the NIC ring, but other drivers - * may not have this requirement. - */ - struct netmap_slot *slot = &ring->slot[j]; - union ixgbe_adv_rx_desc *curr = &rxr->rx_base[l]; - struct ixgbe_rx_buf *rxbuf = &rxr->rx_buffers[l]; + nm_i = kring->nr_hwcur; + if (nm_i != head) { + nic_i = netmap_idx_k2n(kring, nm_i); + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; uint64_t paddr; void *addr = PNMB(slot, &paddr); + union ixgbe_adv_rx_desc *curr = &rxr->rx_base[nic_i]; + struct ixgbe_rx_buf *rxbuf = &rxr->rx_buffers[nic_i]; + if (addr == netmap_buffer_base) /* bad buf */ goto ring_reset; if (slot->flags & NS_BUF_CHANGED) { + /* buffer has changed, reload map */ netmap_reload_map(rxr->ptag, rxbuf->pmap, addr); slot->flags &= ~NS_BUF_CHANGED; } @@ -518,21 +442,23 @@ ixgbe_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) curr->read.pkt_addr = htole64(paddr); bus_dmamap_sync(rxr->ptag, rxbuf->pmap, BUS_DMASYNC_PREREAD); - j = (j == lim) ? 0 : j + 1; - l = (l == lim) ? 0 : l + 1; + nm_i = nm_next(nm_i, lim); + nic_i = nm_next(nic_i, lim); } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; + kring->nr_hwcur = head; + bus_dmamap_sync(rxr->rxdma.dma_tag, rxr->rxdma.dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - /* IMPORTANT: we must leave one free slot in the ring, - * so move l back by one unit + /* + * IMPORTANT: we must leave one free slot in the ring, + * so move nic_i back by one unit */ - l = (l == 0) ? lim : l - 1; - IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), l); + nic_i = nm_prev(nic_i, lim); + IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->me), nic_i); } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; + + /* tell userspace that there might be new packets */ + nm_rxsync_finalize(kring); return 0; @@ -562,7 +488,8 @@ ixgbe_netmap_attach(struct adapter *adapter) na.nm_txsync = ixgbe_netmap_txsync; na.nm_rxsync = ixgbe_netmap_rxsync; na.nm_register = ixgbe_netmap_reg; - netmap_attach(&na, adapter->num_queues); -} + na.num_tx_rings = na.num_rx_rings = adapter->num_queues; + netmap_attach(&na); +} /* end of file */ diff --git a/sys/dev/netmap/netmap.c b/sys/dev/netmap/netmap.c index 19be406f6..6fd8028db 100644 --- a/sys/dev/netmap/netmap.c +++ b/sys/dev/netmap/netmap.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -8,7 +8,7 @@ * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. + * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -25,6 +25,8 @@ /* + * $FreeBSD$ + * * This module supports memory mapped access to network devices, * see netmap(4). * @@ -130,131 +132,41 @@ ports attached to the switch) #if defined(__FreeBSD__) #include /* prerequisite */ -__FBSDID("$FreeBSD$"); - #include -#include #include #include /* defines used in kernel.h */ -#include #include /* types used in module initialization */ -#include /* cdevsw struct */ -#include /* uio struct */ +#include /* cdevsw struct, UID, GID */ +#include /* FIONBIO */ #include #include /* struct socket */ #include -#include /* PROT_EXEC */ #include -#include #include -#include /* vtophys */ -#include /* vtophys */ -#include -#include -#include -#include -#include #include /* sockaddrs */ #include #include +#include +#include #include #include #include /* BIOCIMMEDIATE */ -#include #include /* bus_dmamap_* */ #include #include -#define prefetch(x) __builtin_prefetch(x) - -#define BDG_RWLOCK_T struct rwlock // struct rwlock - -#define BDG_RWINIT(b) \ - rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS) -#define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock) -#define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock) -#define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock) -#define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock) -#define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock) -#define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock) +/* reduce conditional code */ +// linux API, use for the knlist in FreeBSD +#define init_waitqueue_head(x) knlist_init_mtx(&(x)->si_note, NULL) -/* netmap global lock. 
- * normally called within the user thread (upon a system call) - * or when a file descriptor or process is terminated - * (last close or last munmap) - */ - -#define NMG_LOCK_T struct mtx -#define NMG_LOCK_INIT() mtx_init(&netmap_global_lock, "netmap global lock", NULL, MTX_DEF) -#define NMG_LOCK_DESTROY() mtx_destroy(&netmap_global_lock) -#define NMG_LOCK() mtx_lock(&netmap_global_lock) -#define NMG_UNLOCK() mtx_unlock(&netmap_global_lock) -#define NMG_LOCK_ASSERT() mtx_assert(&netmap_global_lock, MA_OWNED) - - -/* atomic operations */ -#include -#define NM_ATOMIC_TEST_AND_SET(p) (!atomic_cmpset_acq_int((p), 0, 1)) -#define NM_ATOMIC_CLEAR(p) atomic_store_rel_int((p), 0) - +void freebsd_selwakeup(struct selinfo *si, int pri); +#define OS_selwakeup(a, b) freebsd_selwakeup(a, b) #elif defined(linux) #include "bsd_glue.h" -static netdev_tx_t linux_netmap_start_xmit(struct sk_buff *, struct net_device *); - -static struct device_driver* -linux_netmap_find_driver(struct device *dev) -{ - struct device_driver *dd; - - while ( (dd = dev->driver) == NULL ) { - if ( (dev = dev->parent) == NULL ) - return NULL; - } - return dd; -} - -static struct net_device* -ifunit_ref(const char *name) -{ - struct net_device *ifp = dev_get_by_name(&init_net, name); - struct device_driver *dd; - - if (ifp == NULL) - return NULL; - - if ( (dd = linux_netmap_find_driver(&ifp->dev)) == NULL ) - goto error; - - if (!try_module_get(dd->owner)) - goto error; - - return ifp; -error: - dev_put(ifp); - return NULL; -} - -static void -if_rele(struct net_device *ifp) -{ - struct device_driver *dd; - dd = linux_netmap_find_driver(&ifp->dev); - dev_put(ifp); - if (dd) - module_put(dd->owner); -} - -// XXX a mtx would suffice here too 20130404 gl -#define NMG_LOCK_T struct semaphore -#define NMG_LOCK_INIT() sema_init(&netmap_global_lock, 1) -#define NMG_LOCK_DESTROY() -#define NMG_LOCK() down(&netmap_global_lock) -#define NMG_UNLOCK() up(&netmap_global_lock) -#define NMG_LOCK_ASSERT() // XXX to be completed #elif defined(__APPLE__) @@ -306,57 +218,49 @@ int netmap_txsync_retry = 2; SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW, &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush."); -int netmap_drop = 0; /* debugging */ int netmap_flags = 0; /* debug flags */ int netmap_fwd = 0; /* force transparent mode */ int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */ -SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0 , ""); +/* + * netmap_admode selects the netmap mode to use. + * Invalid values are reset to NETMAP_ADMODE_BEST + */ +enum { NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */ + NETMAP_ADMODE_NATIVE, /* either native or none */ + NETMAP_ADMODE_GENERIC, /* force generic */ + NETMAP_ADMODE_LAST }; +static int netmap_admode = NETMAP_ADMODE_BEST; + +int netmap_generic_mit = 100*1000; /* Generic mitigation interval in nanoseconds. */ +int netmap_generic_ringsize = 1024; /* Generic ringsize. */ +int netmap_generic_rings = 1; /* number of queues in generic. 
*/ + SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , ""); SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , ""); +SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , ""); NMG_LOCK_T netmap_global_lock; -/* - * protect against multiple threads using the same ring. - * also check that the ring has not been stopped. - */ -#define NM_KR_BUSY 1 -#define NM_KR_STOPPED 2 -static void nm_kr_put(struct netmap_kring *kr); -static __inline int nm_kr_tryget(struct netmap_kring *kr) -{ - /* check a first time without taking the lock - * to avoid starvation for nm_kr_get() - */ - if (unlikely(kr->nkr_stopped)) { - ND("ring %p stopped (%d)", kr, kr->nkr_stopped); - return NM_KR_STOPPED; - } - if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))) - return NM_KR_BUSY; - /* check a second time with lock held */ - if (unlikely(kr->nkr_stopped)) { - ND("ring %p stopped (%d)", kr, kr->nkr_stopped); - nm_kr_put(kr); - return NM_KR_STOPPED; - } - return 0; -} - -static __inline void nm_kr_put(struct netmap_kring *kr) -{ - NM_ATOMIC_CLEAR(&kr->nr_busy); -} -static void nm_kr_get(struct netmap_kring *kr) +static void +nm_kr_get(struct netmap_kring *kr) { while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)) tsleep(kr, 0, "NM_KR_GET", 4); } -static void nm_disable_ring(struct netmap_kring *kr) + +/* + * mark the ring as stopped, and run through the locks + * to make sure other users get to see it. 
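/*
 * [Usage note, not part of the patch] The knobs above are exposed as
 * dev.netmap.* sysctls. For example, to force the emulated (generic)
 * adapter with a larger ring on a NIC without native support, one could
 * set, before binding the port:
 *
 *	sysctl dev.netmap.admode=2		# NETMAP_ADMODE_GENERIC
 *	sysctl dev.netmap.generic_ringsize=4096
 *
 * netmap_generic_mit is expressed in nanoseconds, per the comment on its
 * declaration above.
 */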
+ */ +void +netmap_disable_ring(struct netmap_kring *kr) { kr->nkr_stopped = 1; nm_kr_get(kr); @@ -365,45 +269,51 @@ static void nm_disable_ring(struct netmap_kring *kr) nm_kr_put(kr); } -void netmap_disable_all_rings(struct ifnet *ifp) + +static void +netmap_set_all_rings(struct ifnet *ifp, int stopped) { struct netmap_adapter *na; int i; + u_int ntx, nrx; if (!(ifp->if_capenable & IFCAP_NETMAP)) return; na = NA(ifp); - for (i = 0; i < na->num_tx_rings + 1; i++) { - nm_disable_ring(na->tx_rings + i); - selwakeuppri(&na->tx_rings[i].si, PI_NET); + ntx = netmap_real_tx_rings(na); + nrx = netmap_real_rx_rings(na); + + for (i = 0; i < ntx; i++) { + if (stopped) + netmap_disable_ring(na->tx_rings + i); + else + na->tx_rings[i].nkr_stopped = 0; + na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY); } - for (i = 0; i < na->num_rx_rings + 1; i++) { - nm_disable_ring(na->rx_rings + i); - selwakeuppri(&na->rx_rings[i].si, PI_NET); + + for (i = 0; i < nrx; i++) { + if (stopped) + netmap_disable_ring(na->rx_rings + i); + else + na->rx_rings[i].nkr_stopped = 0; + na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY); } - selwakeuppri(&na->tx_si, PI_NET); - selwakeuppri(&na->rx_si, PI_NET); } -void netmap_enable_all_rings(struct ifnet *ifp) + +void +netmap_disable_all_rings(struct ifnet *ifp) { - struct netmap_adapter *na; - int i; + netmap_set_all_rings(ifp, 1 /* stopped */); +} - if (!(ifp->if_capenable & IFCAP_NETMAP)) - return; - na = NA(ifp); - for (i = 0; i < na->num_tx_rings + 1; i++) { - D("enabling %p", na->tx_rings + i); - na->tx_rings[i].nkr_stopped = 0; - } - for (i = 0; i < na->num_rx_rings + 1; i++) { - D("enabling %p", na->rx_rings + i); - na->rx_rings[i].nkr_stopped = 0; - } +void +netmap_enable_all_rings(struct ifnet *ifp) +{ + netmap_set_all_rings(ifp, 0 /* enabled */); } @@ -432,6 +342,7 @@ nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg) return *v; } + /* * packet-dump function, user-supplied or static buffer. * The destination buffer must be at least 30+4*len @@ -440,7 +351,7 @@ const char * nm_dump_buf(char *p, int len, int lim, char *dst) { static char _dst[8192]; - int i, j, i0; + int i, j, i0; static char hex[] ="0123456789abcdef"; char *o; /* output position */ @@ -477,455 +388,265 @@ nm_dump_buf(char *p, int len, int lim, char *dst) return dst; } -/* - * system parameters (most of them in netmap_kern.h) - * NM_NAME prefix for switch port names, default "vale" - * NM_BDG_MAXPORTS number of ports - * NM_BRIDGES max number of switches in the system. - * XXX should become a sysctl or tunable - * - * Switch ports are named valeX:Y where X is the switch name and Y - * is the port. If Y matches a physical interface name, the port is - * connected to a physical device. - * - * Unlike physical interfaces, switch ports use their own memory region - * for rings and buffers. - * The virtual interfaces use per-queue lock instead of core lock. - * In the tx loop, we aggregate traffic in batches to make all operations - * faster. The batch size is bridge_batch. - */ -#define NM_BDG_MAXRINGS 16 /* XXX unclear how many. 
*/ -#define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ -#define NM_BRIDGE_RINGSIZE 1024 /* in the device */ -#define NM_BDG_HASH 1024 /* forwarding table entries */ -#define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ -#define NM_MULTISEG 64 /* max size of a chain of bufs */ -/* actual size of the tables */ -#define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) -/* NM_FT_NULL terminates a list of slots in the ft */ -#define NM_FT_NULL NM_BDG_BATCH_MAX -#define NM_BRIDGES 8 /* number of bridges */ - /* - * bridge_batch is set via sysctl to the max batch size to be - * used in the bridge. The actual value may be larger as the - * last packet in the block may overflow the size. + * Fetch configuration from the device, to cope with dynamic + * reconfigurations after loading the module. */ -int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ -SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , ""); +int +netmap_update_config(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + u_int txr, txd, rxr, rxd; + txr = txd = rxr = rxd = 0; + if (na->nm_config) { + na->nm_config(na, &txr, &txd, &rxr, &rxd); + } else { + /* take whatever we had at init time */ + txr = na->num_tx_rings; + txd = na->num_tx_desc; + rxr = na->num_rx_rings; + rxd = na->num_rx_desc; + } -/* - * These are used to handle reference counters for bridge ports. - */ -#define ADD_BDG_REF(ifp) refcount_acquire(&NA(ifp)->na_bdg_refcount) -#define DROP_BDG_REF(ifp) refcount_release(&NA(ifp)->na_bdg_refcount) + if (na->num_tx_rings == txr && na->num_tx_desc == txd && + na->num_rx_rings == rxr && na->num_rx_desc == rxd) + return 0; /* nothing changed */ + if (netmap_verbose || na->active_fds > 0) { + D("stored config %s: txring %d x %d, rxring %d x %d", + NM_IFPNAME(ifp), + na->num_tx_rings, na->num_tx_desc, + na->num_rx_rings, na->num_rx_desc); + D("new config %s: txring %d x %d, rxring %d x %d", + NM_IFPNAME(ifp), txr, txd, rxr, rxd); + } + if (na->active_fds == 0) { + D("configuration changed (but fine)"); + na->num_tx_rings = txr; + na->num_tx_desc = txd; + na->num_rx_rings = rxr; + na->num_rx_desc = rxd; + return 0; + } + D("configuration changed while active, this is bad..."); + return 1; +} -/* The bridge references the buffers using the device specific look up table */ -static inline void * -BDG_NMB(struct netmap_mem_d *nmd, struct netmap_slot *slot) +static int +netmap_txsync_compat(struct netmap_kring *kring, int flags) { - struct lut_entry *lut = nmd->pools[NETMAP_BUF_POOL].lut; - uint32_t i = slot->buf_idx; - return (unlikely(i >= nmd->pools[NETMAP_BUF_POOL].objtotal)) ? lut[0].vaddr : lut[i].vaddr; + struct netmap_adapter *na = kring->na; + return na->nm_txsync(na, kring->ring_id, flags); } -static int bdg_netmap_attach(struct netmap_adapter *); -static int bdg_netmap_reg(struct ifnet *ifp, int onoff); -int kern_netmap_regif(struct nmreq *nmr); - -/* - * Each transmit queue accumulates a batch of packets into - * a structure before forwarding. Packets to the same - * destination are put in a list using ft_next as a link field. - * ft_frags and ft_next are valid only on the first fragment. - */ -struct nm_bdg_fwd { /* forwarding entry for a bridge */ - void *ft_buf; /* netmap or indirect buffer */ - uint8_t ft_frags; /* how many fragments (only on 1st frag) */ - uint8_t _ft_port; /* dst port (unused) */ - uint16_t ft_flags; /* flags, e.g. 
indirect */ - uint16_t ft_len; /* src fragment len */ - uint16_t ft_next; /* next packet to same destination */ -}; - -/* - * For each output interface, nm_bdg_q is used to construct a list. - * bq_len is the number of output buffers (we can have coalescing - * during the copy). - */ -struct nm_bdg_q { - uint16_t bq_head; - uint16_t bq_tail; - uint32_t bq_len; /* number of buffers */ -}; - -/* XXX revise this */ -struct nm_hash_ent { - uint64_t mac; /* the top 2 bytes are the epoch */ - uint64_t ports; -}; - -/* - * nm_bridge is a descriptor for a VALE switch. - * Interfaces for a bridge are all in bdg_ports[]. - * The array has fixed size, an empty entry does not terminate - * the search, but lookups only occur on attach/detach so we - * don't mind if they are slow. - * - * The bridge is non blocking on the transmit ports: excess - * packets are dropped if there is no room on the output port. - * - * bdg_lock protects accesses to the bdg_ports array. - * This is a rw lock (or equivalent). - */ -struct nm_bridge { - /* XXX what is the proper alignment/layout ? */ - BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */ - int bdg_namelen; - uint32_t bdg_active_ports; /* 0 means free */ - char bdg_basename[IFNAMSIZ]; - - /* Indexes of active ports (up to active_ports) - * and all other remaining ports. - */ - uint8_t bdg_port_index[NM_BDG_MAXPORTS]; - - struct netmap_adapter *bdg_ports[NM_BDG_MAXPORTS]; - - - /* - * The function to decide the destination port. - * It returns either of an index of the destination port, - * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to - * forward this packet. ring_nr is the source ring index, and the - * function may overwrite this value to forward this packet to a - * different ring index. - * This function must be set by netmap_bdgctl(). - */ - bdg_lookup_fn_t nm_bdg_lookup; - - /* the forwarding table, MAC+ports. - * XXX should be changed to an argument to be passed to - * the lookup function, and allocated on attach - */ - struct nm_hash_ent ht[NM_BDG_HASH]; -}; - - -/* - * XXX in principle nm_bridges could be created dynamically - * Right now we have a static array and deletions are protected - * by an exclusive lock. - */ -struct nm_bridge nm_bridges[NM_BRIDGES]; - - -/* - * A few function to tell which kind of port are we using. - * XXX should we hold a lock ? - * - * nma_is_vp() virtual port - * nma_is_host() port connected to the host stack - * nma_is_hw() port connected to a NIC - */ -int nma_is_vp(struct netmap_adapter *na); -int -nma_is_vp(struct netmap_adapter *na) +static int +netmap_rxsync_compat(struct netmap_kring *kring, int flags) { - return na->nm_register == bdg_netmap_reg; + struct netmap_adapter *na = kring->na; + return na->nm_rxsync(na, kring->ring_id, flags); } -static __inline int -nma_is_host(struct netmap_adapter *na) +static int +netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags) { - return na->nm_register == NULL; + (void)flags; + netmap_txsync_to_host(kring->na); + return 0; } -static __inline int -nma_is_hw(struct netmap_adapter *na) +static int +netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags) { - /* In case of sw adapter, nm_register is NULL */ - return !nma_is_vp(na) && !nma_is_host(na); + (void)flags; + netmap_rxsync_from_host(kring->na, NULL, NULL); + return 0; } -/* - * If the NIC is owned by the kernel - * (i.e., bridge), neither another bridge nor user can use it; - * if the NIC is owned by a user, only users can share it. - * Evaluation must be done under NMG_LOCK(). 
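/*
 * [Sketch, not part of the patch] A driver's optional nm_config callback,
 * as invoked by netmap_update_config() above, only has to report the
 * ring and descriptor counts currently programmed in the hardware.
 * 'struct my_softc' and its fields are hypothetical; the callback
 * signature follows the call site above (the return value is ignored
 * there).
 */
struct my_softc {				/* hypothetical driver state */
	u_int num_tx_queues, num_rx_queues;
	u_int num_tx_desc, num_rx_desc;
};

static int
my_netmap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
	u_int *rxr, u_int *rxd)
{
	struct my_softc *sc = na->ifp->if_softc;

	*txr = sc->num_tx_queues;		/* rings in use */
	*rxr = sc->num_rx_queues;
	*txd = sc->num_tx_desc;			/* slots per ring */
	*rxd = sc->num_rx_desc;
	return 0;
}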
- */ -#define NETMAP_OWNED_BY_KERN(ifp) (!nma_is_vp(NA(ifp)) && NA(ifp)->na_bdg) -#define NETMAP_OWNED_BY_ANY(ifp) \ - (NETMAP_OWNED_BY_KERN(ifp) || (NA(ifp)->refcount > 0)) - -/* - * NA(ifp)->bdg_port port index - */ - -/* - * this is a slightly optimized copy routine which rounds - * to multiple of 64 bytes and is often faster than dealing - * with other odd sizes. We assume there is enough room - * in the source and destination buffers. +/* create the krings array and initialize the fields common to all adapters. + * The array layout is this: * - * XXX only for multiples of 64 bytes, non overlapped. - */ -static inline void -pkt_copy(void *_src, void *_dst, int l) -{ - uint64_t *src = _src; - uint64_t *dst = _dst; - if (unlikely(l >= 1024)) { - memcpy(dst, src, l); - return; - } - for (; likely(l > 0); l-=64) { - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - } -} - - -/* - * locate a bridge among the existing ones. - * MUST BE CALLED WITH NMG_LOCK() + * +----------+ + * na->tx_rings ----->| | \ + * | | } na->num_tx_ring + * | | / + * +----------+ + * | | host tx kring + * na->rx_rings ----> +----------+ + * | | \ + * | | } na->num_rx_rings + * | | / + * +----------+ + * | | host rx kring + * +----------+ + * na->tailroom ----->| | \ + * | | } tailroom bytes + * | | / + * +----------+ * - * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. - * We assume that this is called with a name of at least NM_NAME chars. + * Note: for compatibility, host krings are created even when not needed. + * The tailroom space is currently used by vale ports for allocating leases. */ -static struct nm_bridge * -nm_find_bridge(const char *name, int create) +int +netmap_krings_create(struct netmap_adapter *na, u_int tailroom) { - int i, l, namelen; - struct nm_bridge *b = NULL; + u_int i, len, ndesc; + struct netmap_kring *kring; + u_int ntx, nrx; - NMG_LOCK_ASSERT(); + /* account for the (possibly fake) host rings */ + ntx = na->num_tx_rings + 1; + nrx = na->num_rx_rings + 1; - namelen = strlen(NM_NAME); /* base length */ - l = name ? strlen(name) : 0; /* actual length */ - if (l < namelen) { - D("invalid bridge name %s", name ? name : NULL); - return NULL; + len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom; + + na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); + if (na->tx_rings == NULL) { + D("Cannot allocate krings"); + return ENOMEM; } - for (i = namelen + 1; i < l; i++) { - if (name[i] == ':') { - namelen = i; - break; + na->rx_rings = na->tx_rings + ntx; + + /* + * All fields in krings are 0 except the one initialized below. + * but better be explicit on important kring fields. + */ + ndesc = na->num_tx_desc; + for (i = 0; i < ntx; i++) { /* Transmit rings */ + kring = &na->tx_rings[i]; + bzero(kring, sizeof(*kring)); + kring->na = na; + kring->ring_id = i; + kring->nkr_num_slots = ndesc; + if (i < na->num_tx_rings) { + kring->nm_sync = netmap_txsync_compat; // XXX + } else if (i == na->num_tx_rings) { + kring->nm_sync = netmap_txsync_to_host_compat; } + /* + * IMPORTANT: Always keep one slot empty. 
+ */ + kring->rhead = kring->rcur = kring->nr_hwcur = 0; + kring->rtail = kring->nr_hwtail = ndesc - 1; + snprintf(kring->name, sizeof(kring->name) - 1, "%s TX%d", NM_IFPNAME(na->ifp), i); + ND("ktx %s h %d c %d t %d", + kring->name, kring->rhead, kring->rcur, kring->rtail); + mtx_init(&kring->q_lock, "nm_txq_lock", NULL, MTX_DEF); + init_waitqueue_head(&kring->si); } - if (namelen >= IFNAMSIZ) - namelen = IFNAMSIZ; - ND("--- prefix is '%.*s' ---", namelen, name); - /* lookup the name, remember empty slot if there is one */ - for (i = 0; i < NM_BRIDGES; i++) { - struct nm_bridge *x = nm_bridges + i; - - if (x->bdg_active_ports == 0) { - if (create && b == NULL) - b = x; /* record empty slot */ - } else if (x->bdg_namelen != namelen) { - continue; - } else if (strncmp(name, x->bdg_basename, namelen) == 0) { - ND("found '%.*s' at %d", namelen, name, i); - b = x; - break; + ndesc = na->num_rx_desc; + for (i = 0; i < nrx; i++) { /* Receive rings */ + kring = &na->rx_rings[i]; + bzero(kring, sizeof(*kring)); + kring->na = na; + kring->ring_id = i; + kring->nkr_num_slots = ndesc; + if (i < na->num_rx_rings) { + kring->nm_sync = netmap_rxsync_compat; // XXX + } else if (i == na->num_rx_rings) { + kring->nm_sync = netmap_rxsync_from_host_compat; } + kring->rhead = kring->rcur = kring->nr_hwcur = 0; + kring->rtail = kring->nr_hwtail = 0; + snprintf(kring->name, sizeof(kring->name) - 1, "%s RX%d", NM_IFPNAME(na->ifp), i); + ND("krx %s h %d c %d t %d", + kring->name, kring->rhead, kring->rcur, kring->rtail); + mtx_init(&kring->q_lock, "nm_rxq_lock", NULL, MTX_DEF); + init_waitqueue_head(&kring->si); } - if (i == NM_BRIDGES && b) { /* name not found, can create entry */ - /* initialize the bridge */ - strncpy(b->bdg_basename, name, namelen); - ND("create new bridge %s with ports %d", b->bdg_basename, - b->bdg_active_ports); - b->bdg_namelen = namelen; - b->bdg_active_ports = 0; - for (i = 0; i < NM_BDG_MAXPORTS; i++) - b->bdg_port_index[i] = i; - /* set the default function */ - b->nm_bdg_lookup = netmap_bdg_learning; - /* reset the MAC address table */ - bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); - } - return b; -} - + init_waitqueue_head(&na->tx_si); + init_waitqueue_head(&na->rx_si); -/* - * Free the forwarding tables for rings attached to switch ports. - */ -static void -nm_free_bdgfwd(struct netmap_adapter *na) -{ - int nrings, i; - struct netmap_kring *kring; + na->tailroom = na->rx_rings + nrx; - NMG_LOCK_ASSERT(); - nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; - kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings; - for (i = 0; i < nrings; i++) { - if (kring[i].nkr_ft) { - free(kring[i].nkr_ft, M_DEVBUF); - kring[i].nkr_ft = NULL; /* protect from freeing twice */ - } - } - if (nma_is_hw(na)) - nm_free_bdgfwd(SWNA(na->ifp)); + return 0; } -/* - * Allocate the forwarding tables for the rings attached to the bridge ports. - */ -static int -nm_alloc_bdgfwd(struct netmap_adapter *na) +/* undo the actions performed by netmap_krings_create */ +void +netmap_krings_delete(struct netmap_adapter *na) { - int nrings, l, i, num_dstq; - struct netmap_kring *kring; + struct netmap_kring *kring = na->tx_rings; - NMG_LOCK_ASSERT(); - /* all port:rings + broadcast */ - num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; - l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; - l += sizeof(struct nm_bdg_q) * num_dstq; - l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; - - nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings; - kring = nma_is_vp(na) ? 
na->tx_rings : na->rx_rings; - for (i = 0; i < nrings; i++) { - struct nm_bdg_fwd *ft; - struct nm_bdg_q *dstq; - int j; - - ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); - if (!ft) { - nm_free_bdgfwd(na); - return ENOMEM; - } - dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); - for (j = 0; j < num_dstq; j++) { - dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; - dstq[j].bq_len = 0; - } - kring[i].nkr_ft = ft; + /* we rely on the krings layout described above */ + for ( ; kring != na->tailroom; kring++) { + mtx_destroy(&kring->q_lock); } - if (nma_is_hw(na)) - nm_alloc_bdgfwd(SWNA(na->ifp)); - return 0; + free(na->tx_rings, M_DEVBUF); + na->tx_rings = na->rx_rings = na->tailroom = NULL; } /* - * Fetch configuration from the device, to cope with dynamic - * reconfigurations after loading the module. + * Destructor for NIC ports. They also have an mbuf queue + * on the rings connected to the host so we need to purge + * them first. */ -static int -netmap_update_config(struct netmap_adapter *na) +static void +netmap_hw_krings_delete(struct netmap_adapter *na) { - struct ifnet *ifp = na->ifp; - u_int txr, txd, rxr, rxd; - - txr = txd = rxr = rxd = 0; - if (na->nm_config) { - na->nm_config(ifp, &txr, &txd, &rxr, &rxd); - } else { - /* take whatever we had at init time */ - txr = na->num_tx_rings; - txd = na->num_tx_desc; - rxr = na->num_rx_rings; - rxd = na->num_rx_desc; - } + struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue; - if (na->num_tx_rings == txr && na->num_tx_desc == txd && - na->num_rx_rings == rxr && na->num_rx_desc == rxd) - return 0; /* nothing changed */ - if (netmap_verbose || na->refcount > 0) { - D("stored config %s: txring %d x %d, rxring %d x %d", - ifp->if_xname, - na->num_tx_rings, na->num_tx_desc, - na->num_rx_rings, na->num_rx_desc); - D("new config %s: txring %d x %d, rxring %d x %d", - ifp->if_xname, txr, txd, rxr, rxd); - } - if (na->refcount == 0) { - D("configuration changed (but fine)"); - na->num_tx_rings = txr; - na->num_tx_desc = txd; - na->num_rx_rings = rxr; - na->num_rx_desc = rxd; - return 0; - } - D("configuration changed while active, this is bad..."); - return 1; + ND("destroy sw mbq with len %d", mbq_len(q)); + mbq_purge(q); + mbq_safe_destroy(q); + netmap_krings_delete(na); } -static struct netmap_if * + +static struct netmap_if* netmap_if_new(const char *ifname, struct netmap_adapter *na) { + struct netmap_if *nifp; + if (netmap_update_config(na)) { /* configuration mismatch, report and fail */ return NULL; } - return netmap_mem_if_new(ifname, na); -} + if (na->active_fds) + goto final; -/* Structure associated to each thread which registered an interface. - * - * The first 4 fields of this structure are written by NIOCREGIF and - * read by poll() and NIOC?XSYNC. - * There is low contention among writers (actually, a correct user program - * should have no contention among writers) and among writers and readers, - * so we use a single global lock to protect the structure initialization. - * Since initialization involves the allocation of memory, we reuse the memory - * allocator lock. - * Read access to the structure is lock free. Readers must check that - * np_nifp is not NULL before using the other fields. - * If np_nifp is NULL initialization has not been performed, so they should - * return an error to userlevel. - * - * The ref_done field is used to regulate access to the refcount in the - * memory allocator. The refcount must be incremented at most once for - * each open("/dev/netmap"). 
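/*
 * [Sketch, not part of the patch] Both netmap_krings_delete() and
 * netmap_hw_krings_delete() above rely on the krings layout built by
 * netmap_krings_create(): hardware rings first, then the host ring,
 * then the tailroom area. The host krings can therefore be reached
 * with a plain index, as done elsewhere in this file:
 */
static inline struct netmap_kring *
sketch_host_tx_kring(struct netmap_adapter *na)
{
	return &na->tx_rings[na->num_tx_rings];	/* slot past the last hw TX ring */
}

static inline struct netmap_kring *
sketch_host_rx_kring(struct netmap_adapter *na)
{
	return &na->rx_rings[na->num_rx_rings];	/* slot past the last hw RX ring */
}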
The increment is performed by the first - * function that calls netmap_get_memory() (currently called by - * mmap(), NIOCGINFO and NIOCREGIF). - * If the refcount is incremented, it is then decremented when the - * private structure is destroyed. - */ -struct netmap_priv_d { - struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ + if (na->nm_krings_create(na)) + goto cleanup; - struct ifnet *np_ifp; /* device for which we hold a ref. */ - int np_ringid; /* from the ioctl */ - u_int np_qfirst, np_qlast; /* range of rings to scan */ - uint16_t np_txpoll; + if (netmap_mem_rings_create(na)) + goto cleanup; - struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */ -#ifdef __FreeBSD__ - int np_refcount; /* use with NMG_LOCK held */ -#endif /* __FreeBSD__ */ -}; +final: -/* grab a reference to the memory allocator, if we don't have one already. The - * reference is taken from the netmap_adapter registered with the priv. - * - */ -static int -netmap_get_memory_locked(struct netmap_priv_d* p) -{ - struct netmap_mem_d *nmd; - int error = 0; + nifp = netmap_mem_if_new(ifname, na); + if (nifp == NULL) + goto cleanup; - if (p->np_ifp == NULL) { + return (nifp); + +cleanup: + + if (na->active_fds == 0) { + netmap_mem_rings_delete(na); + na->nm_krings_delete(na); + } + + return NULL; +} + + +/* grab a reference to the memory allocator, if we don't have one already. The + * reference is taken from the netmap_adapter registered with the priv. + * + */ +static int +netmap_get_memory_locked(struct netmap_priv_d* p) +{ + struct netmap_mem_d *nmd; + int error = 0; + + if (p->np_na == NULL) { if (!netmap_mmap_unreg) return ENODEV; /* for compatibility with older versions of the API @@ -934,7 +655,7 @@ netmap_get_memory_locked(struct netmap_priv_d* p) */ nmd = &nm_mem; } else { - nmd = NA(p->np_ifp)->nm_mem; + nmd = p->np_na->nm_mem; } if (p->np_mref == NULL) { error = netmap_mem_finalize(nmd); @@ -950,7 +671,8 @@ netmap_get_memory_locked(struct netmap_priv_d* p) return error; } -static int + +int netmap_get_memory(struct netmap_priv_d* p) { int error; @@ -960,12 +682,14 @@ netmap_get_memory(struct netmap_priv_d* p) return error; } + static int netmap_have_memory_locked(struct netmap_priv_d* p) { return p->np_mref != NULL; } + static void netmap_drop_memory_locked(struct netmap_priv_d* p) { @@ -975,11 +699,12 @@ netmap_drop_memory_locked(struct netmap_priv_d* p) } } + /* * File descriptor's private data destructor. * * Call nm_register(ifp,0) to stop netmap mode on the interface and - * revert to normal operation. We expect that np_ifp has not gone. + * revert to normal operation. We expect that np_na->ifp has not gone. * The second argument is the nifp to work on. In some cases it is * not attached yet to the netmap_priv_d so we need to pass it as * a separate argument. 
@@ -988,16 +713,15 @@ netmap_drop_memory_locked(struct netmap_priv_d* p) static void netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) { - struct ifnet *ifp = priv->np_ifp; - struct netmap_adapter *na = NA(ifp); + struct netmap_adapter *na = priv->np_na; + struct ifnet *ifp = na->ifp; NMG_LOCK_ASSERT(); - na->refcount--; - if (na->refcount <= 0) { /* last instance */ - u_int i; + na->active_fds--; + if (na->active_fds <= 0) { /* last instance */ if (netmap_verbose) - D("deleting last instance for %s", ifp->if_xname); + D("deleting last instance for %s", NM_IFPNAME(ifp)); /* * (TO CHECK) This function is only called * when the last reference to this file descriptor goes @@ -1012,140 +736,47 @@ netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp) * happens if the close() occurs while a concurrent * syscall is running. */ - na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */ + if (ifp) + na->nm_register(na, 0); /* off, clear flags */ /* Wake up any sleeping threads. netmap_poll will * then return POLLERR * XXX The wake up now must happen during *_down(), when * we order all activities to stop. -gl */ - nm_free_bdgfwd(na); - for (i = 0; i < na->num_tx_rings + 1; i++) { - mtx_destroy(&na->tx_rings[i].q_lock); - } - for (i = 0; i < na->num_rx_rings + 1; i++) { - mtx_destroy(&na->rx_rings[i].q_lock); - } /* XXX kqueue(9) needed; these will mirror knlist_init. */ /* knlist_destroy(&na->tx_si.si_note); */ /* knlist_destroy(&na->rx_si.si_note); */ - if (nma_is_hw(na)) - SWNA(ifp)->tx_rings = SWNA(ifp)->rx_rings = NULL; + + /* delete rings and buffers */ + netmap_mem_rings_delete(na); + na->nm_krings_delete(na); } - /* - * netmap_mem_if_delete() deletes the nifp, and if this is - * the last instance also buffers, rings and krings. - */ + /* delete the nifp */ netmap_mem_if_delete(na, nifp); } - -/* we assume netmap adapter exists - * Called with NMG_LOCK held - */ -static void -nm_if_rele(struct ifnet *ifp) +static __inline int +nm_tx_si_user(struct netmap_priv_d *priv) { - int i, is_hw, hw, sw, lim; - struct nm_bridge *b; - struct netmap_adapter *na; - uint8_t tmp[NM_BDG_MAXPORTS]; - - NMG_LOCK_ASSERT(); - /* I can be called not only for get_ifp()-ed references where netmap's - * capability is guaranteed, but also for non-netmap-capable NICs. - */ - if (!NETMAP_CAPABLE(ifp) || !NA(ifp)->na_bdg) { - if_rele(ifp); - return; - } - na = NA(ifp); - b = na->na_bdg; - is_hw = nma_is_hw(na); - - ND("%s has %d references", ifp->if_xname, NA(ifp)->na_bdg_refcount); - - if (!DROP_BDG_REF(ifp)) - return; - - /* - New algorithm: - make a copy of bdg_port_index; - lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port - in the array of bdg_port_index, replacing them with - entries from the bottom of the array; - decrement bdg_active_ports; - acquire BDG_WLOCK() and copy back the array. - */ - - hw = NA(ifp)->bdg_port; - sw = (is_hw && SWNA(ifp)->na_bdg) ? SWNA(ifp)->bdg_port : -1; - lim = b->bdg_active_ports; - - ND("detach %d and %d (lim %d)", hw, sw, lim); - /* make a copy of the list of active ports, update it, - * and then copy back within BDG_WLOCK(). 
- */ - memcpy(tmp, b->bdg_port_index, sizeof(tmp)); - for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) { - if (hw >= 0 && tmp[i] == hw) { - ND("detach hw %d at %d", hw, i); - lim--; /* point to last active port */ - tmp[i] = tmp[lim]; /* swap with i */ - tmp[lim] = hw; /* now this is inactive */ - hw = -1; - } else if (sw >= 0 && tmp[i] == sw) { - ND("detach sw %d at %d", sw, i); - lim--; - tmp[i] = tmp[lim]; - tmp[lim] = sw; - sw = -1; - } else { - i++; - } - } - if (hw >= 0 || sw >= 0) { - D("XXX delete failed hw %d sw %d, should panic...", hw, sw); - } - hw = NA(ifp)->bdg_port; - sw = (is_hw && SWNA(ifp)->na_bdg) ? SWNA(ifp)->bdg_port : -1; - - BDG_WLOCK(b); - b->bdg_ports[hw] = NULL; - na->na_bdg = NULL; - if (sw >= 0) { - b->bdg_ports[sw] = NULL; - SWNA(ifp)->na_bdg = NULL; - } - memcpy(b->bdg_port_index, tmp, sizeof(tmp)); - b->bdg_active_ports = lim; - BDG_WUNLOCK(b); - - ND("now %d active ports", lim); - if (lim == 0) { - ND("marking bridge %s as free", b->bdg_basename); - b->nm_bdg_lookup = NULL; - } + return (priv->np_na != NULL && + (priv->np_txqlast - priv->np_txqfirst > 1)); +} - if (is_hw) { - if_rele(ifp); - } else { - if (na->na_flags & NAF_MEM_OWNER) - netmap_mem_private_delete(na->nm_mem); - bzero(na, sizeof(*na)); - free(na, M_DEVBUF); - bzero(ifp, sizeof(*ifp)); - free(ifp, M_DEVBUF); - } +static __inline int +nm_rx_si_user(struct netmap_priv_d *priv) +{ + return (priv->np_na != NULL && + (priv->np_rxqlast - priv->np_rxqfirst > 1)); } /* * returns 1 if this is the last instance and we can free priv */ -static int +int netmap_dtor_locked(struct netmap_priv_d *priv) { - struct ifnet *ifp = priv->np_ifp; + struct netmap_adapter *na = priv->np_na; #ifdef __FreeBSD__ /* @@ -1156,17 +787,25 @@ netmap_dtor_locked(struct netmap_priv_d *priv) return 0; } #endif /* __FreeBSD__ */ - if (ifp) { - netmap_do_unregif(priv, priv->np_nifp); + if (!na) { + return 1; //XXX is it correct? } + netmap_do_unregif(priv, priv->np_nifp); + priv->np_nifp = NULL; netmap_drop_memory_locked(priv); - if (ifp) { - nm_if_rele(ifp); /* might also destroy *na */ + if (priv->np_na) { + if (nm_tx_si_user(priv)) + na->tx_si_users--; + if (nm_rx_si_user(priv)) + na->rx_si_users--; + netmap_adapter_put(na); + priv->np_na = NULL; } return 1; } -static void + +void netmap_dtor(void *data) { struct netmap_priv_d *priv = data; @@ -1182,190 +821,6 @@ netmap_dtor(void *data) } -#ifdef __FreeBSD__ - -/* - * In order to track whether pages are still mapped, we hook into - * the standard cdev_pager and intercept the constructor and - * destructor. 
- */ - -struct netmap_vm_handle_t { - struct cdev *dev; - struct netmap_priv_d *priv; -}; - -static int -netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, - vm_ooffset_t foff, struct ucred *cred, u_short *color) -{ - struct netmap_vm_handle_t *vmh = handle; - D("handle %p size %jd prot %d foff %jd", - handle, (intmax_t)size, prot, (intmax_t)foff); - dev_ref(vmh->dev); - return 0; -} - - -static void -netmap_dev_pager_dtor(void *handle) -{ - struct netmap_vm_handle_t *vmh = handle; - struct cdev *dev = vmh->dev; - struct netmap_priv_d *priv = vmh->priv; - D("handle %p", handle); - netmap_dtor(priv); - free(vmh, M_DEVBUF); - dev_rel(dev); -} - -static int -netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, - int prot, vm_page_t *mres) -{ - struct netmap_vm_handle_t *vmh = object->handle; - struct netmap_priv_d *priv = vmh->priv; - vm_paddr_t paddr; - vm_page_t page; - vm_memattr_t memattr; - vm_pindex_t pidx; - - ND("object %p offset %jd prot %d mres %p", - object, (intmax_t)offset, prot, mres); - memattr = object->memattr; - pidx = OFF_TO_IDX(offset); - paddr = netmap_mem_ofstophys(priv->np_mref, offset); - if (paddr == 0) - return VM_PAGER_FAIL; - - if (((*mres)->flags & PG_FICTITIOUS) != 0) { - /* - * If the passed in result page is a fake page, update it with - * the new physical address. - */ - page = *mres; - vm_page_updatefake(page, paddr, memattr); - } else { - /* - * Replace the passed in reqpage page with our own fake page and - * free up the all of the original pages. - */ -#ifndef VM_OBJECT_WUNLOCK /* FreeBSD < 10.x */ -#define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK -#define VM_OBJECT_WLOCK VM_OBJECT_LOCK -#endif /* VM_OBJECT_WUNLOCK */ - - VM_OBJECT_WUNLOCK(object); - page = vm_page_getfake(paddr, memattr); - VM_OBJECT_WLOCK(object); - vm_page_lock(*mres); - vm_page_free(*mres); - vm_page_unlock(*mres); - *mres = page; - vm_page_insert(page, object, pidx); - } - page->valid = VM_PAGE_BITS_ALL; - return (VM_PAGER_OK); -} - - -static struct cdev_pager_ops netmap_cdev_pager_ops = { - .cdev_pg_ctor = netmap_dev_pager_ctor, - .cdev_pg_dtor = netmap_dev_pager_dtor, - .cdev_pg_fault = netmap_dev_pager_fault, -}; - - -static int -netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, - vm_size_t objsize, vm_object_t *objp, int prot) -{ - int error; - struct netmap_vm_handle_t *vmh; - struct netmap_priv_d *priv; - vm_object_t obj; - - D("cdev %p foff %jd size %jd objp %p prot %d", cdev, - (intmax_t )*foff, (intmax_t )objsize, objp, prot); - - vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (vmh == NULL) - return ENOMEM; - vmh->dev = cdev; - - NMG_LOCK(); - error = devfs_get_cdevpriv((void**)&priv); - if (error) - goto err_unlock; - vmh->priv = priv; - priv->np_refcount++; - NMG_UNLOCK(); - - error = netmap_get_memory(priv); - if (error) - goto err_deref; - - obj = cdev_pager_allocate(vmh, OBJT_DEVICE, - &netmap_cdev_pager_ops, objsize, prot, - *foff, NULL); - if (obj == NULL) { - D("cdev_pager_allocate failed"); - error = EINVAL; - goto err_deref; - } - - *objp = obj; - return 0; - -err_deref: - NMG_LOCK(); - priv->np_refcount--; -err_unlock: - NMG_UNLOCK(); -// err: - free(vmh, M_DEVBUF); - return error; -} - - -// XXX can we remove this ? 
-static int -netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) -{ - if (netmap_verbose) - D("dev %p fflag 0x%x devtype %d td %p", - dev, fflag, devtype, td); - return 0; -} - - -static int -netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) -{ - struct netmap_priv_d *priv; - int error; - - (void)dev; - (void)oflags; - (void)devtype; - (void)td; - - // XXX wait or nowait ? - priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (priv == NULL) - return ENOMEM; - - error = devfs_set_cdevpriv(priv, netmap_dtor); - if (error) - return error; - - priv->np_refcount = 1; - - return 0; -} -#endif /* __FreeBSD__ */ /* @@ -1389,144 +844,107 @@ netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) /* * pass a chain of buffers to the host stack as coming from 'dst' + * We do not need to lock because the queue is private. */ static void -netmap_send_up(struct ifnet *dst, struct mbuf *head) +netmap_send_up(struct ifnet *dst, struct mbq *q) { struct mbuf *m; /* send packets up, outside the lock */ - while ((m = head) != NULL) { - head = head->m_nextpkt; - m->m_nextpkt = NULL; + while ((m = mbq_dequeue(q)) != NULL) { if (netmap_verbose & NM_VERB_HOST) D("sending up pkt %p size %d", m, MBUF_LEN(m)); NM_SEND_UP(dst, m); } + mbq_destroy(q); } -struct mbq { - struct mbuf *head; - struct mbuf *tail; - int count; -}; - /* * put a copy of the buffers marked NS_FORWARD into an mbuf chain. - * Run from hwcur to cur - reserved + * Take packets from hwcur to ring->head marked NS_FORWARD (or forced) + * and pass them up. Drop remaining packets in the unlikely event + * of an mbuf shortage. */ static void netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force) { - /* Take packets from hwcur to cur-reserved and pass them up. - * In case of no buffers we give up. At the end of the loop, - * the queue is drained in all cases. - * XXX handle reserved - */ - u_int lim = kring->nkr_num_slots - 1; - struct mbuf *m, *tail = q->tail; - u_int k = kring->ring->cur, n = kring->ring->reserved; - struct netmap_mem_d *nmd = kring->na->nm_mem; - - /* compute the final position, ring->cur - ring->reserved */ - if (n > 0) { - if (k < n) - k += kring->nkr_num_slots; - k += n; - } - for (n = kring->nr_hwcur; n != k;) { + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->ring->head; + u_int n; + struct netmap_adapter *na = kring->na; + + for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) { + struct mbuf *m; struct netmap_slot *slot = &kring->ring->slot[n]; - n = nm_next(n, lim); if ((slot->flags & NS_FORWARD) == 0 && !force) continue; - if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(nmd)) { - D("bad pkt at %d len %d", n, slot->len); + if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { + RD(5, "bad pkt at %d len %d", n, slot->len); continue; } slot->flags &= ~NS_FORWARD; // XXX needed ? - /* XXX adapt to the case of a multisegment packet */ - m = m_devget(BDG_NMB(nmd, slot), slot->len, 0, kring->na->ifp, NULL); + /* XXX TODO: adapt to the case of a multisegment packet */ + m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL); if (m == NULL) break; - if (tail) - tail->m_nextpkt = m; - else - q->head = m; - tail = m; - q->count++; - m->m_nextpkt = NULL; + mbq_enqueue(q, m); } - q->tail = tail; } /* - * The host ring has packets from nr_hwcur to (cur - reserved) - * to be sent down to the NIC. 
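/*
 * [Sketch, not part of the patch] The mbq_* helpers (from the new
 * netmap_mbq.c) implement the small mbuf queue used above. The typical
 * pattern, condensed here from netmap_grab_packets()/netmap_send_up(),
 * is: init, fill on the producer side, drain and destroy on the
 * consumer side.
 */
static void
sketch_mbq_usage(struct ifnet *dst, struct netmap_kring *kring)
{
	struct mbq q;
	struct mbuf *m;

	mbq_init(&q);
	netmap_grab_packets(kring, &q, 1 /* force */);	/* producer: fill the queue */
	while ((m = mbq_dequeue(&q)) != NULL)		/* consumer: drain it */
		NM_SEND_UP(dst, m);
	mbq_destroy(&q);
}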
- * We need to use the queue lock on the source (host RX ring) - * to protect against netmap_transmit. - * If the user is well behaved we do not need to acquire locks - * on the destination(s), - * so we only need to make sure that there are no panics because - * of user errors. - * XXX verify - * - * We scan the tx rings, which have just been - * flushed so nr_hwcur == cur. Pushing packets down means - * increment cur and decrement avail. - * XXX to be verified + * Send to the NIC rings packets marked NS_FORWARD between + * kring->nr_hwcur and kring->rhead + * Called under kring->rx_queue.lock on the sw rx ring, */ -static void +static u_int netmap_sw_to_nic(struct netmap_adapter *na) { struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; - struct netmap_kring *k1 = &na->tx_rings[0]; - u_int i, howmany, src_lim, dst_lim; - - /* XXX we should also check that the carrier is on */ - if (kring->nkr_stopped) - return; + struct netmap_slot *rxslot = kring->ring->slot; + u_int i, rxcur = kring->nr_hwcur; + u_int const head = kring->rhead; + u_int const src_lim = kring->nkr_num_slots - 1; + u_int sent = 0; + + /* scan rings to find space, then fill as much as possible */ + for (i = 0; i < na->num_tx_rings; i++) { + struct netmap_kring *kdst = &na->tx_rings[i]; + struct netmap_ring *rdst = kdst->ring; + u_int const dst_lim = kdst->nkr_num_slots - 1; + + /* XXX do we trust ring or kring->rcur,rtail ? */ + for (; rxcur != head && !nm_ring_empty(rdst); + rxcur = nm_next(rxcur, src_lim) ) { + struct netmap_slot *src, *dst, tmp; + u_int dst_cur = rdst->cur; - mtx_lock(&kring->q_lock); + src = &rxslot[rxcur]; + if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd) + continue; - if (kring->nkr_stopped) - goto out; + sent++; - howmany = kring->nr_hwavail; /* XXX otherwise cur - reserved - nr_hwcur */ + dst = &rdst->slot[dst_cur]; - src_lim = kring->nkr_num_slots - 1; - for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) { - ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail); - dst_lim = k1->nkr_num_slots - 1; - while (howmany > 0 && k1->ring->avail > 0) { - struct netmap_slot *src, *dst, tmp; - src = &kring->ring->slot[kring->nr_hwcur]; - dst = &k1->ring->slot[k1->ring->cur]; tmp = *src; + src->buf_idx = dst->buf_idx; src->flags = NS_BUF_CHANGED; dst->buf_idx = tmp.buf_idx; dst->len = tmp.len; dst->flags = NS_BUF_CHANGED; - ND("out len %d buf %d from %d to %d", - dst->len, dst->buf_idx, - kring->nr_hwcur, k1->ring->cur); - - kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim); - howmany--; - kring->nr_hwavail--; - k1->ring->cur = nm_next(k1->ring->cur, dst_lim); - k1->ring->avail--; + + rdst->cur = nm_next(dst_cur, dst_lim); } - kring->ring->cur = kring->nr_hwcur; // XXX - k1++; // XXX why? + /* if (sent) XXX txsync ? */ } -out: - mtx_unlock(&kring->q_lock); + return sent; } @@ -1536,360 +954,486 @@ out: * can be among multiple user threads erroneously calling * this routine concurrently. 
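/*
 * [Sketch, not part of the patch] Userspace side of the transparent
 * forwarding performed by netmap_sw_to_nic() above: a client that wants
 * a packet received on the host ring to be pushed out on the NIC marks
 * the slot with NS_FORWARD before releasing it (the ring must have
 * NR_FORWARD set in ring->flags, or dev.netmap.fwd enabled). Schematic
 * fragment, written against the userspace ring layout:
 */
static void
sketch_forward_slot(struct netmap_ring *host_rx)
{
	uint32_t i = host_rx->cur;

	host_rx->slot[i].flags |= NS_FORWARD;	/* ask the kernel to forward it */
	/* release the slot: advance head and cur past it */
	host_rx->head = host_rx->cur = (i + 1 == host_rx->num_slots) ? 0 : i + 1;
}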
*/ -static void +void netmap_txsync_to_host(struct netmap_adapter *na) { struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings]; struct netmap_ring *ring = kring->ring; - u_int k, lim = kring->nkr_num_slots - 1; - struct mbq q = { NULL, NULL, 0 }; - - if (nm_kr_tryget(kring)) { - D("ring %p busy (user error)", kring); - return; - } - k = ring->cur; - if (k > lim) { - D("invalid ring index in stack TX kring %p", kring); - netmap_ring_reinit(kring); - nm_kr_put(kring); - return; - } + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; + struct mbq q; - /* Take packets from hwcur to cur and pass them up. + /* Take packets from hwcur to head and pass them up. + * force head = cur since netmap_grab_packets() stops at head * In case of no buffers we give up. At the end of the loop, * the queue is drained in all cases. */ - netmap_grab_packets(kring, &q, 1); - kring->nr_hwcur = k; - kring->nr_hwavail = ring->avail = lim; - - nm_kr_put(kring); - netmap_send_up(na->ifp, q.head); -} - + mbq_init(&q); + ring->cur = head; + netmap_grab_packets(kring, &q, 1 /* force */); + ND("have %d pkts in queue", mbq_len(&q)); + kring->nr_hwcur = head; + kring->nr_hwtail = head + lim; + if (kring->nr_hwtail > lim) + kring->nr_hwtail -= lim + 1; + nm_txsync_finalize(kring); -/* - * This is the 'txsync' handler to send from a software ring to the - * host stack. - */ -/* SWNA(ifp)->txrings[0] is always NA(ifp)->txrings[NA(ifp)->num_txrings] */ -static int -netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int flags) -{ - (void)ring_nr; - (void)flags; - if (netmap_verbose > 255) - RD(5, "sync to host %s ring %d", ifp->if_xname, ring_nr); - netmap_txsync_to_host(NA(ifp)); - return 0; + netmap_send_up(na->ifp, &q); } /* * rxsync backend for packets coming from the host stack. - * They have been put in the queue by netmap_transmit() so we - * need to protect access to the kring using a lock. + * They have been put in kring->rx_queue by netmap_transmit(). + * We protect access to the kring using kring->rx_queue.lock * * This routine also does the selrecord if called from the poll handler * (we know because td != NULL). * * NOTE: on linux, selrecord() is defined as a macro and uses pwait * as an additional hidden argument. 
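/*
 * [Note, not part of the patch] The nr_hwtail computation in
 * netmap_txsync_to_host() above is simply "one slot before head" modulo
 * the ring size, i.e. nm_prev(head, lim): after the sync, with
 * nr_hwcur == head and nr_hwtail == head - 1, the host TX kring reports
 * all lim slots as free again (keeping the usual one empty slot), since
 * every packet up to head has been copied into mbufs (or dropped) and
 * handed to the host stack.
 */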
+ * returns the number of packets delivered to tx queues in + * transparent mode, or a negative value if error */ -static void +int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait) { struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings]; struct netmap_ring *ring = kring->ring; - u_int j, n, lim = kring->nkr_num_slots; - u_int k = ring->cur, resvd = ring->reserved; + u_int nm_i, n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; + int ret = 0; + struct mbq *q = &kring->rx_queue; (void)pwait; /* disable unused warnings */ + (void)td; - if (kring->nkr_stopped) /* check a first time without lock */ - return; + mtx_lock(&q->lock); - /* XXX as an optimization we could reuse na->core_lock */ - mtx_lock(&kring->q_lock); + /* First part: import newly received packets */ + n = mbq_len(q); + if (n) { /* grab packets from the queue */ + struct mbuf *m; + uint32_t stop_i; - if (kring->nkr_stopped) /* check again with lock held */ - goto unlock_out; + nm_i = kring->nr_hwtail; + stop_i = nm_prev(nm_i, lim); + while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) { + int len = MBUF_LEN(m); + struct netmap_slot *slot = &ring->slot[nm_i]; - if (k >= lim) { - netmap_ring_reinit(kring); - goto unlock_out; - } - /* new packets are already set in nr_hwavail */ - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... + m_copydata(m, 0, len, BDG_NMB(na, slot)); + ND("nm %d len %d", nm_i, len); + if (netmap_verbose) + D("%s", nm_dump_buf(BDG_NMB(na, slot),len, 128, NULL)); + + slot->len = len; + slot->flags = kring->nkr_slot_flags; + nm_i = nm_next(nm_i, lim); } - k = (k >= resvd) ? k - resvd : k + lim - resvd; - } - if (j != k) { - n = k >= j ? k - j : k + lim - j; - kring->nr_hwavail -= n; - kring->nr_hwcur = k; - } - k = ring->avail = kring->nr_hwavail - resvd; - if (k == 0 && td) + kring->nr_hwtail = nm_i; + } + + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* something was released */ + if (netmap_fwd || kring->ring->flags & NR_FORWARD) + ret = netmap_sw_to_nic(na); + kring->nr_hwcur = head; + } + + nm_rxsync_finalize(kring); + + /* access copies of cur,tail in the kring */ + if (kring->rcur == kring->rtail && td) /* no bufs available */ selrecord(td, &kring->si); - if (k && (netmap_verbose & NM_VERB_HOST)) - D("%d pkts from stack", k); -unlock_out: - mtx_unlock(&kring->q_lock); + mtx_unlock(&q->lock); + return ret; } -/* - * MUST BE CALLED UNDER NMG_LOCK() +/* Get a netmap adapter for the port. * - * get a refcounted reference to an interface. - * This is always called in the execution of an ioctl(). + * If it is possible to satisfy the request, return 0 + * with *na containing the netmap adapter found. + * Otherwise return an error code, with *na containing NULL. * - * Return ENXIO if the interface does not exist, EINVAL if netmap - * is not supported by the interface. - * If successful, hold a reference. + * When the port is attached to a bridge, we always return + * EBUSY. + * Otherwise, if the port is already bound to a file descriptor, + * then we unconditionally return the existing adapter into *na. 
+ * In all the other cases, we return (into *na) either native, + * generic or NULL, according to the following table: * - * When the NIC is attached to a bridge, reference is managed - * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as - * virtual ports. Hence, on the final DROP_BDG_REF(), the NIC - * is detached from the bridge, then ifp's refcount is dropped (this - * is equivalent to that ifp is destroyed in case of virtual ports. + * native_support + * active_fds dev.netmap.admode YES NO + * ------------------------------------------------------- + * >0 * NA(ifp) NA(ifp) + * + * 0 NETMAP_ADMODE_BEST NATIVE GENERIC + * 0 NETMAP_ADMODE_NATIVE NATIVE NULL + * 0 NETMAP_ADMODE_GENERIC GENERIC GENERIC * - * This function uses if_rele() when we want to prevent the NIC from - * being detached from the bridge in error handling. But once refcount - * is acquired by this function, it must be released using nm_if_rele(). */ -static int -get_ifp(struct nmreq *nmr, struct ifnet **ifp, int create) -{ - const char *name = nmr->nr_name; - int namelen = strlen(name); - struct ifnet *iter = NULL; - int no_prefix = 0; - /* first try to see if this is a bridge port. */ - struct nm_bridge *b; - struct netmap_adapter *na; - int i, j, cand = -1, cand2 = -1; - int needed; +int +netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na) +{ + /* generic support */ + int i = netmap_admode; /* Take a snapshot. */ + int error = 0; + struct netmap_adapter *prev_na; + struct netmap_generic_adapter *gna; - NMG_LOCK_ASSERT(); - *ifp = NULL; /* default */ - if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) { - no_prefix = 1; /* no VALE prefix */ - goto no_bridge_port; - } + *na = NULL; /* default */ - b = nm_find_bridge(name, create); - if (b == NULL) { - D("no bridges available for '%s'", name); - return (ENXIO); - } + /* reset in case of invalid value */ + if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST) + i = netmap_admode = NETMAP_ADMODE_BEST; - /* Now we are sure that name starts with the bridge's name, - * lookup the port in the bridge. We need to scan the entire - * list. It is not important to hold a WLOCK on the bridge - * during the search because NMG_LOCK already guarantees - * that there are no other possible writers. - */ + if (NETMAP_CAPABLE(ifp)) { + /* If an adapter already exists, but is + * attached to a vale port, we report that the + * port is busy. + */ + if (NETMAP_OWNED_BY_KERN(NA(ifp))) + return EBUSY; - /* lookup in the local list of ports */ - for (j = 0; j < b->bdg_active_ports; j++) { - i = b->bdg_port_index[j]; - na = b->bdg_ports[i]; - // KASSERT(na != NULL); - iter = na->ifp; - /* XXX make sure the name only contains one : */ - if (!strcmp(iter->if_xname, name) /* virtual port */ || - (namelen > b->bdg_namelen && !strcmp(iter->if_xname, - name + b->bdg_namelen + 1)) /* NIC */) { - ADD_BDG_REF(iter); - ND("found existing if %s refs %d", name, - NA(iter)->na_bdg_refcount); - *ifp = iter; - /* we are done, this is surely netmap capable */ + /* If an adapter already exists, return it if + * there are active file descriptors or if + * netmap is not forced to use generic + * adapters. + */ + if (NA(ifp)->active_fds > 0 || + i != NETMAP_ADMODE_GENERIC) { + *na = NA(ifp); return 0; } } - /* not found, should we create it? 
*/ - if (!create) - return ENXIO; - /* yes we should, see if we have space to attach entries */ - needed = 2; /* in some cases we only need 1 */ - if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) { - D("bridge full %d, cannot create new port", b->bdg_active_ports); - return EINVAL; - } - /* record the next two ports available, but do not allocate yet */ - cand = b->bdg_port_index[b->bdg_active_ports]; - cand2 = b->bdg_port_index[b->bdg_active_ports + 1]; - ND("+++ bridge %s port %s used %d avail %d %d", - b->bdg_basename, name, b->bdg_active_ports, cand, cand2); - /* - * try see if there is a matching NIC with this name - * (after the bridge's name) + /* If there isn't native support and netmap is not allowed + * to use generic adapters, we cannot satisfy the request. */ - iter = ifunit_ref(name + b->bdg_namelen + 1); - if (!iter) { /* this is a virtual port */ - /* Create a temporary NA with arguments, then - * bdg_netmap_attach() will allocate the real one - * and attach it to the ifp - */ - struct netmap_adapter tmp_na; - int error; + if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE) + return EOPNOTSUPP; - if (nmr->nr_cmd) { - /* nr_cmd must be 0 for a virtual port */ - return EINVAL; - } - bzero(&tmp_na, sizeof(tmp_na)); - /* bound checking */ - tmp_na.num_tx_rings = nmr->nr_tx_rings; - nm_bound_var(&tmp_na.num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); - nmr->nr_tx_rings = tmp_na.num_tx_rings; // write back - tmp_na.num_rx_rings = nmr->nr_rx_rings; - nm_bound_var(&tmp_na.num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); - nmr->nr_rx_rings = tmp_na.num_rx_rings; // write back - nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, - 1, NM_BDG_MAXSLOTS, NULL); - tmp_na.num_tx_desc = nmr->nr_tx_slots; - nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, - 1, NM_BDG_MAXSLOTS, NULL); - tmp_na.num_rx_desc = nmr->nr_rx_slots; - - /* create a struct ifnet for the new port. - * need M_NOWAIT as we are under nma_lock - */ - iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | M_ZERO); - if (!iter) - return ENOMEM; - - strcpy(iter->if_xname, name); - tmp_na.ifp = iter; - /* bdg_netmap_attach creates a struct netmap_adapter */ - error = bdg_netmap_attach(&tmp_na); - if (error) { - D("error %d", error); - free(iter, M_DEVBUF); - return error; - } - cand2 = -1; /* only need one port */ - } else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */ - /* make sure the NIC is not already in use */ - if (NETMAP_OWNED_BY_ANY(iter)) { - D("NIC %s busy, cannot attach to bridge", - iter->if_xname); - if_rele(iter); /* don't detach from bridge */ - return EINVAL; - } - if (nmr->nr_arg1 != NETMAP_BDG_HOST) - cand2 = -1; /* only need one port */ - } else { /* not a netmap-capable NIC */ - if_rele(iter); /* don't detach from bridge */ - return EINVAL; - } - na = NA(iter); - - BDG_WLOCK(b); - na->bdg_port = cand; - ND("NIC %p to bridge port %d", NA(iter), cand); - /* bind the port to the bridge (virtual ports are not active) */ - b->bdg_ports[cand] = na; - na->na_bdg = b; - b->bdg_active_ports++; - if (cand2 >= 0) { - /* also bind the host stack to the bridge */ - b->bdg_ports[cand2] = SWNA(iter); - SWNA(iter)->bdg_port = cand2; - SWNA(iter)->na_bdg = b; - b->bdg_active_ports++; - ND("host %p to bridge port %d", SWNA(iter), cand2); - } - ADD_BDG_REF(iter); // XXX one or two ? - ND("if %s refs %d", name, NA(iter)->na_bdg_refcount); - BDG_WUNLOCK(b); - *ifp = iter; - return 0; + /* Otherwise, create a generic adapter and return it, + * saving the previously used netmap adapter, if any. 
+ * + * Note that here 'prev_na', if not NULL, MUST be a + * native adapter, and CANNOT be a generic one. This is + * true because generic adapters are created on demand, and + * destroyed when not used anymore. Therefore, if the adapter + * currently attached to an interface 'ifp' is generic, it + * must be that + * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))). + * Consequently, if NA(ifp) is generic, we will enter one of + * the branches above. This ensures that we never override + * a generic adapter with another generic adapter. + */ + prev_na = NA(ifp); + error = generic_netmap_attach(ifp); + if (error) + return error; -no_bridge_port: - *ifp = iter; - if (! *ifp) - *ifp = ifunit_ref(name); - if (*ifp == NULL) - return (ENXIO); - - if (NETMAP_CAPABLE(*ifp)) { - /* Users cannot use the NIC attached to a bridge directly */ - if (no_prefix && NETMAP_OWNED_BY_KERN(*ifp)) { - if_rele(*ifp); /* don't detach from bridge */ - return EINVAL; - } else - return 0; /* valid pointer, we hold the refcount */ + *na = NA(ifp); + gna = (struct netmap_generic_adapter*)NA(ifp); + gna->prev = prev_na; /* save old na */ + if (prev_na != NULL) { + ifunit_ref(ifp->if_xname); + // XXX add a refcount ? + netmap_adapter_get(prev_na); } - nm_if_rele(*ifp); - return EINVAL; // not NETMAP capable + ND("Created generic NA %p (prev %p)", gna, gna->prev); + + return 0; } /* - * Error routine called when txsync/rxsync detects an error. - * Can't do much more than resetting cur = hwcur, avail = hwavail. - * Return 1 on reinit. + * MUST BE CALLED UNDER NMG_LOCK() * - * This routine is only called by the upper half of the kernel. - * It only reads hwcur (which is changed only by the upper half, too) - * and hwavail (which may be changed by the lower half, but only on - * a tx ring and only to increase it, so any error will be recovered - * on the next call). For the above, we don't strictly need to call - * it under lock. + * Get a refcounted reference to a netmap adapter attached + * to the interface specified by nmr. + * This is always called in the execution of an ioctl(). + * + * Return ENXIO if the interface specified by the request does + * not exist, ENOTSUP if netmap is not supported by the interface, + * EBUSY if the interface is already attached to a bridge, + * EINVAL if parameters are invalid, ENOMEM if needed resources + * could not be allocated. + * If successful, hold a reference to the netmap adapter. + * + * No reference is kept on the real interface, which may then + * disappear at any time. 
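
The adapter-selection table above is driven by the dev.netmap.admode sysctl, so the choice between native and emulated (generic) support can be exercised without touching the application. A minimal userspace sketch, assuming the FreeBSD sysctlbyname(3) interface and that NETMAP_ADMODE_GENERIC has the value 2 as in this version's kernel enum (error handling omitted):

#include <sys/types.h>
#include <sys/sysctl.h>

/* Force newly bound, idle interfaces onto the emulated adapter. */
static int
force_generic_adapter(void)
{
        int mode = 2;   /* NETMAP_ADMODE_GENERIC in this version of netmap_kern.h */

        return sysctlbyname("dev.netmap.admode", NULL, NULL, &mode, sizeof(mode));
}

Per the table, interfaces that already have active file descriptors keep whatever adapter they are currently using.
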
*/ int -netmap_ring_reinit(struct netmap_kring *kring) +netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create) { - struct netmap_ring *ring = kring->ring; - u_int i, lim = kring->nkr_num_slots - 1; - int errors = 0; + struct ifnet *ifp = NULL; + int error = 0; + struct netmap_adapter *ret = NULL; - // XXX KASSERT nm_kr_tryget - RD(10, "called for %s", kring->na->ifp->if_xname); - if (ring->cur > lim) - errors++; - for (i = 0; i <= lim; i++) { - u_int idx = ring->slot[i].buf_idx; - u_int len = ring->slot[i].len; - if (idx < 2 || idx >= netmap_total_buffers) { - if (!errors++) - D("bad buffer at slot %d idx %d len %d ", i, idx, len); - ring->slot[i].buf_idx = 0; - ring->slot[i].len = 0; - } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { - ring->slot[i].len = 0; - if (!errors++) - D("bad len %d at slot %d idx %d", - len, i, idx); - } - } - if (errors) { - int pos = kring - kring->na->tx_rings; - int n = kring->na->num_tx_rings + 1; + *na = NULL; /* default return value */ - RD(10, "total %d errors", errors); - errors++; - RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d", - kring->na->ifp->if_xname, - pos < n ? "TX" : "RX", pos < n ? pos : pos - n, - ring->cur, kring->nr_hwcur, - ring->avail, kring->nr_hwavail); - ring->cur = kring->nr_hwcur; - ring->avail = kring->nr_hwavail; + /* first try to see if this is a bridge port. */ + NMG_LOCK_ASSERT(); + + error = netmap_get_pipe_na(nmr, na, create); + if (error || *na != NULL) + return error; + + error = netmap_get_bdg_na(nmr, na, create); + if (error) + return error; + + if (*na != NULL) /* valid match in netmap_get_bdg_na() */ + goto pipes; + + ifp = ifunit_ref(nmr->nr_name); + if (ifp == NULL) { + return ENXIO; + } + + error = netmap_get_hw_na(ifp, &ret); + if (error) + goto out; + + /* Users cannot use the NIC attached to a bridge directly */ + if (NETMAP_OWNED_BY_KERN(ret)) { + error = EBUSY; + goto out; + } + *na = ret; + netmap_adapter_get(ret); + +pipes: + error = netmap_pipe_alloc(*na, nmr); + +out: + if (error && ret != NULL) + netmap_adapter_put(ret); + + if (ifp) + if_rele(ifp); + + return error; +} + + +/* + * validate parameters on entry for *_txsync() + * Returns ring->cur if ok, or something >= kring->nkr_num_slots + * in case of error. + * + * rhead, rcur and rtail=hwtail are stored from previous round. + * hwcur is the next packet to send to the ring. + * + * We want + * hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail + * + * hwcur, rhead, rtail and hwtail are reliable + */ +u_int +nm_txsync_prologue(struct netmap_kring *kring) +{ + struct netmap_ring *ring = kring->ring; + u_int head = ring->head; /* read only once */ + u_int cur = ring->cur; /* read only once */ + u_int n = kring->nkr_num_slots; + + ND(5, "%s kcur %d ktail %d head %d cur %d tail %d", + kring->name, + kring->nr_hwcur, kring->nr_hwtail, + ring->head, ring->cur, ring->tail); +#if 1 /* kernel sanity checks; but we can trust the kring. */ + if (kring->nr_hwcur >= n || kring->rhead >= n || + kring->rtail >= n || kring->nr_hwtail >= n) + goto error; +#endif /* kernel sanity checks */ + /* + * user sanity checks. We only use 'cur', + * A, B, ... are possible positions for cur: + * + * 0 A cur B tail C n-1 + * 0 D tail E cur F n-1 + * + * B, F, D are valid. 
A, C, E are wrong + */ + if (kring->rtail >= kring->rhead) { + /* want rhead <= head <= rtail */ + if (head < kring->rhead || head > kring->rtail) + goto error; + /* and also head <= cur <= rtail */ + if (cur < head || cur > kring->rtail) + goto error; + } else { /* here rtail < rhead */ + /* we need head outside rtail .. rhead */ + if (head > kring->rtail && head < kring->rhead) + goto error; + + /* two cases now: head <= rtail or head >= rhead */ + if (head <= kring->rtail) { + /* want head <= cur <= rtail */ + if (cur < head || cur > kring->rtail) + goto error; + } else { /* head >= rhead */ + /* cur must be outside rtail..head */ + if (cur > kring->rtail && cur < head) + goto error; + } + } + if (ring->tail != kring->rtail) { + RD(5, "tail overwritten was %d need %d", + ring->tail, kring->rtail); + ring->tail = kring->rtail; + } + kring->rhead = head; + kring->rcur = cur; + return head; + +error: + RD(5, "%s kring error: hwcur %d rcur %d hwtail %d cur %d tail %d", + kring->name, + kring->nr_hwcur, + kring->rcur, kring->nr_hwtail, + cur, ring->tail); + return n; +} + + +/* + * validate parameters on entry for *_rxsync() + * Returns ring->head if ok, kring->nkr_num_slots on error. + * + * For a valid configuration, + * hwcur <= head <= cur <= tail <= hwtail + * + * We only consider head and cur. + * hwcur and hwtail are reliable. + * + */ +u_int +nm_rxsync_prologue(struct netmap_kring *kring) +{ + struct netmap_ring *ring = kring->ring; + uint32_t const n = kring->nkr_num_slots; + uint32_t head, cur; + + ND("%s kc %d kt %d h %d c %d t %d", + kring->name, + kring->nr_hwcur, kring->nr_hwtail, + ring->head, ring->cur, ring->tail); + /* + * Before storing the new values, we should check they do not + * move backwards. However: + * - head is not an issue because the previous value is hwcur; + * - cur could in principle go back, however it does not matter + * because we are processing a brand new rxsync() + */ + cur = kring->rcur = ring->cur; /* read only once */ + head = kring->rhead = ring->head; /* read only once */ +#if 1 /* kernel sanity checks */ + if (kring->nr_hwcur >= n || kring->nr_hwtail >= n) + goto error; +#endif /* kernel sanity checks */ + /* user sanity checks */ + if (kring->nr_hwtail >= kring->nr_hwcur) { + /* want hwcur <= rhead <= hwtail */ + if (head < kring->nr_hwcur || head > kring->nr_hwtail) + goto error; + /* and also rhead <= rcur <= hwtail */ + if (cur < head || cur > kring->nr_hwtail) + goto error; + } else { + /* we need rhead outside hwtail..hwcur */ + if (head < kring->nr_hwcur && head > kring->nr_hwtail) + goto error; + /* two cases now: head <= hwtail or head >= hwcur */ + if (head <= kring->nr_hwtail) { + /* want head <= cur <= hwtail */ + if (cur < head || cur > kring->nr_hwtail) + goto error; + } else { + /* cur must be outside hwtail..head */ + if (cur < head && cur > kring->nr_hwtail) + goto error; + } + } + if (ring->tail != kring->rtail) { + RD(5, "%s tail overwritten was %d need %d", + kring->name, + ring->tail, kring->rtail); + ring->tail = kring->rtail; + } + return head; + +error: + RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d", + kring->nr_hwcur, + kring->rcur, kring->nr_hwtail, + kring->rhead, kring->rcur, ring->tail); + return n; +} + + +/* + * Error routine called when txsync/rxsync detects an error. + * Can't do much more than resetting head =cur = hwcur, tail = hwtail + * Return 1 on reinit. + * + * This routine is only called by the upper half of the kernel. 
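
On the application side, the invariant checked by the prologue reduces to a simple rule: move head and cur forward only, and never past tail. A hedged sketch of a transmit batch that respects it, assuming the ring helpers from the updated net/netmap_user.h (nm_ring_next(), NETMAP_BUF()); fill_frame() is a hypothetical helper that writes one packet into a buffer and returns its length:

#include <stdint.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <net/netmap_user.h>

extern uint16_t fill_frame(char *buf, uint32_t bufsize);        /* hypothetical */

static void
tx_batch(int fd, struct netmap_ring *ring)
{
        uint32_t cur = ring->cur;

        while (cur != ring->tail) {             /* slots cur..tail-1 are free */
                struct netmap_slot *slot = &ring->slot[cur];

                slot->len = fill_frame(NETMAP_BUF(ring, slot->buf_idx),
                    ring->nr_buf_size);
                cur = nm_ring_next(ring, cur);
        }
        /* head <= cur <= tail still holds, so the prologue accepts it */
        ring->head = ring->cur = cur;
        ioctl(fd, NIOCTXSYNC, NULL);            /* or rely on poll() */
}
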
+ * It only reads hwcur (which is changed only by the upper half, too) + * and hwtail (which may be changed by the lower half, but only on + * a tx ring and only to increase it, so any error will be recovered + * on the next call). For the above, we don't strictly need to call + * it under lock. + */ +int +netmap_ring_reinit(struct netmap_kring *kring) +{ + struct netmap_ring *ring = kring->ring; + u_int i, lim = kring->nkr_num_slots - 1; + int errors = 0; + + // XXX KASSERT nm_kr_tryget + RD(10, "called for %s", NM_IFPNAME(kring->na->ifp)); + // XXX probably wrong to trust userspace + kring->rhead = ring->head; + kring->rcur = ring->cur; + kring->rtail = ring->tail; + + if (ring->cur > lim) + errors++; + if (ring->head > lim) + errors++; + if (ring->tail > lim) + errors++; + for (i = 0; i <= lim; i++) { + u_int idx = ring->slot[i].buf_idx; + u_int len = ring->slot[i].len; + if (idx < 2 || idx >= netmap_total_buffers) { + RD(5, "bad index at slot %d idx %d len %d ", i, idx, len); + ring->slot[i].buf_idx = 0; + ring->slot[i].len = 0; + } else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) { + ring->slot[i].len = 0; + RD(5, "bad len at slot %d idx %d len %d", i, idx, len); + } + } + if (errors) { + RD(10, "total %d errors", errors); + RD(10, "%s reinit, cur %d -> %d tail %d -> %d", + kring->name, + ring->cur, kring->nr_hwcur, + ring->tail, kring->nr_hwtail); + ring->head = kring->rhead = kring->nr_hwcur; + ring->cur = kring->rcur = kring->nr_hwcur; + ring->tail = kring->rtail = kring->nr_hwtail; } return (errors ? 1 : 0); } @@ -1900,63 +1444,106 @@ netmap_ring_reinit(struct netmap_kring *kring) * for all rings is the same as a single ring. */ static int -netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid) -{ - struct ifnet *ifp = priv->np_ifp; - struct netmap_adapter *na = NA(ifp); - u_int i = ringid & NETMAP_RING_MASK; - /* initially (np_qfirst == np_qlast) we don't want to lock */ - u_int lim = na->num_rx_rings; - - if (na->num_tx_rings > lim) - lim = na->num_tx_rings; - if ( (ringid & NETMAP_HW_RING) && i >= lim) { - D("invalid ring id %d", i); - return (EINVAL); - } - priv->np_ringid = ringid; - if (ringid & NETMAP_SW_RING) { - priv->np_qfirst = NETMAP_SW_RING; - priv->np_qlast = 0; - } else if (ringid & NETMAP_HW_RING) { - priv->np_qfirst = i; - priv->np_qlast = i + 1; - } else { - priv->np_qfirst = 0; - priv->np_qlast = NETMAP_HW_RING ; +netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags) +{ + struct netmap_adapter *na = priv->np_na; + u_int j, i = ringid & NETMAP_RING_MASK; + u_int reg = flags & NR_REG_MASK; + + if (reg == NR_REG_DEFAULT) { + /* convert from old ringid to flags */ + if (ringid & NETMAP_SW_RING) { + reg = NR_REG_SW; + } else if (ringid & NETMAP_HW_RING) { + reg = NR_REG_ONE_NIC; + } else { + reg = NR_REG_ALL_NIC; + } + D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg); + } + switch (reg) { + case NR_REG_ALL_NIC: + case NR_REG_PIPE_MASTER: + case NR_REG_PIPE_SLAVE: + priv->np_txqfirst = 0; + priv->np_txqlast = na->num_tx_rings; + priv->np_rxqfirst = 0; + priv->np_rxqlast = na->num_rx_rings; + ND("%s %d %d", "ALL/PIPE", + priv->np_rxqfirst, priv->np_rxqlast); + break; + case NR_REG_SW: + case NR_REG_NIC_SW: + if (!(na->na_flags & NAF_HOST_RINGS)) { + D("host rings not supported"); + return EINVAL; + } + priv->np_txqfirst = (reg == NR_REG_SW ? + na->num_tx_rings : 0); + priv->np_txqlast = na->num_tx_rings + 1; + priv->np_rxqfirst = (reg == NR_REG_SW ? 
+ na->num_rx_rings : 0); + priv->np_rxqlast = na->num_rx_rings + 1; + ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW", + priv->np_rxqfirst, priv->np_rxqlast); + break; + case NR_REG_ONE_NIC: + if (i >= na->num_tx_rings && i >= na->num_rx_rings) { + D("invalid ring id %d", i); + return EINVAL; + } + /* if not enough rings, use the first one */ + j = i; + if (j >= na->num_tx_rings) + j = 0; + priv->np_txqfirst = j; + priv->np_txqlast = j + 1; + j = i; + if (j >= na->num_rx_rings) + j = 0; + priv->np_rxqfirst = j; + priv->np_rxqlast = j + 1; + break; + default: + D("invalid regif type %d", reg); + return EINVAL; } priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1; - if (netmap_verbose) { - if (ringid & NETMAP_SW_RING) - D("ringid %s set to SW RING", ifp->if_xname); - else if (ringid & NETMAP_HW_RING) - D("ringid %s set to HW RING %d", ifp->if_xname, - priv->np_qfirst); - else - D("ringid %s set to all %d HW RINGS", ifp->if_xname, lim); - } + priv->np_flags = (flags & ~NR_REG_MASK) | reg; + if (nm_tx_si_user(priv)) + na->tx_si_users++; + if (nm_rx_si_user(priv)) + na->rx_si_users++; + if (netmap_verbose) { + D("%s: tx [%d,%d) rx [%d,%d) id %d", + NM_IFPNAME(na->ifp), + priv->np_txqfirst, + priv->np_txqlast, + priv->np_rxqfirst, + priv->np_rxqlast, + i); + } return 0; } - /* * possibly move the interface to netmap-mode. * If success it returns a pointer to netmap_if, otherwise NULL. * This must be called with NMG_LOCK held. */ -static struct netmap_if * -netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp, - uint16_t ringid, int *err) +struct netmap_if * +netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, + uint16_t ringid, uint32_t flags, int *err) { - struct netmap_adapter *na = NA(ifp); + struct ifnet *ifp = na->ifp; struct netmap_if *nifp = NULL; - int error, need_mem; + int error, need_mem = 0; NMG_LOCK_ASSERT(); /* ring configuration may have changed, fetch from the card */ netmap_update_config(na); - priv->np_ifp = ifp; /* store the reference */ - error = netmap_set_ringid(priv, ringid); + priv->np_na = na; /* store the reference */ + error = netmap_set_ringid(priv, ringid, flags); if (error) goto out; /* ensure allocators are ready */ @@ -1967,57 +1554,40 @@ netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp, if (error) goto out; } - nifp = netmap_if_new(ifp->if_xname, na); + nifp = netmap_if_new(NM_IFPNAME(ifp), na); if (nifp == NULL) { /* allocation failed */ /* we should drop the allocator, but only * if we were the ones who grabbed it */ - if (need_mem) - netmap_drop_memory_locked(priv); error = ENOMEM; goto out; } - na->refcount++; + na->active_fds++; if (ifp->if_capenable & IFCAP_NETMAP) { /* was already set */ } else { - u_int i; /* Otherwise set the card in netmap mode * and make it use the shared buffers. * - * If the interface is attached to a bridge, lock it. - */ - if (NETMAP_OWNED_BY_KERN(ifp)) - BDG_WLOCK(NA(ifp)->na_bdg); - for (i = 0 ; i < na->num_tx_rings + 1; i++) - mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock", - NULL, MTX_DEF); - for (i = 0 ; i < na->num_rx_rings + 1; i++) { - mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock", - NULL, MTX_DEF); - } - if (nma_is_hw(na)) { - SWNA(ifp)->tx_rings = &na->tx_rings[na->num_tx_rings]; - SWNA(ifp)->rx_rings = &na->rx_rings[na->num_rx_rings]; - } - /* * do not core lock because the race is harmless here, * there cannot be any traffic to netmap_transmit() */ - error = na->nm_register(ifp, 1); /* mode on */ - // XXX do we need to nm_alloc_bdgfwd() in all cases ? 
- if (!error) - error = nm_alloc_bdgfwd(na); + na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut; + ND("%p->na_lut == %p", na, na->na_lut); + na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal; + error = na->nm_register(na, 1); /* mode on */ if (error) { netmap_do_unregif(priv, nifp); nifp = NULL; } - if (NETMAP_OWNED_BY_KERN(ifp)) - BDG_WUNLOCK(NA(ifp)->na_bdg); - } out: *err = error; + if (error) { + priv->np_na = NULL; + if (need_mem) + netmap_drop_memory_locked(priv); + } if (nifp != NULL) { /* * advertise that the interface is ready bt setting ni_nifp. @@ -2030,251 +1600,6 @@ out: return nifp; } -/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */ -static int -nm_bdg_attach(struct nmreq *nmr) -{ - struct ifnet *ifp; - struct netmap_if *nifp; - struct netmap_priv_d *npriv; - int error; - - npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); - if (npriv == NULL) - return ENOMEM; - NMG_LOCK(); - error = get_ifp(nmr, &ifp, 1 /* create if not exists */); - if (error) /* no device, or another bridge or user owns the device */ - goto unlock_exit; - /* get_ifp() sets na_bdg if this is a physical interface - * that we can attach to a switch. - */ - if (!NETMAP_OWNED_BY_KERN(ifp)) { - /* got reference to a virtual port or direct access to a NIC. - * perhaps specified no bridge prefix or wrong NIC name - */ - error = EINVAL; - goto unref_exit; - } - - if (NA(ifp)->refcount > 0) { /* already registered */ - error = EBUSY; - DROP_BDG_REF(ifp); - goto unlock_exit; - } - - nifp = netmap_do_regif(npriv, ifp, nmr->nr_ringid, &error); - if (!nifp) { - goto unref_exit; - } - - NA(ifp)->na_kpriv = npriv; - NMG_UNLOCK(); - ND("registered %s to netmap-mode", ifp->if_xname); - return 0; - -unref_exit: - nm_if_rele(ifp); -unlock_exit: - NMG_UNLOCK(); - bzero(npriv, sizeof(*npriv)); - free(npriv, M_DEVBUF); - return error; -} - -static int -nm_bdg_detach(struct nmreq *nmr) -{ - struct ifnet *ifp; - int error; - int last_instance; - - NMG_LOCK(); - error = get_ifp(nmr, &ifp, 0 /* don't create */); - if (error) { /* no device, or another bridge or user owns the device */ - goto unlock_exit; - } - /* XXX do we need to check this ? */ - if (!NETMAP_OWNED_BY_KERN(ifp)) { - /* got reference to a virtual port or direct access to a NIC. - * perhaps specified no bridge's prefix or wrong NIC's name - */ - error = EINVAL; - goto unref_exit; - } - - if (NA(ifp)->refcount == 0) { /* not registered */ - error = EINVAL; - goto unref_exit; - } - - DROP_BDG_REF(ifp); /* the one from get_ifp */ - last_instance = netmap_dtor_locked(NA(ifp)->na_kpriv); /* unregister */ - NMG_UNLOCK(); - if (!last_instance) { - D("--- error, trying to detach an entry with active mmaps"); - error = EINVAL; - } else { - struct netmap_priv_d *npriv = NA(ifp)->na_kpriv; - NA(ifp)->na_kpriv = NULL; - - bzero(npriv, sizeof(*npriv)); - free(npriv, M_DEVBUF); - } - return error; - -unref_exit: - nm_if_rele(ifp); -unlock_exit: - NMG_UNLOCK(); - return error; -} - - -/* Initialize necessary fields of sw adapter located in right after hw's - * one. sw adapter attaches a pair of sw rings of the netmap-mode NIC. - * It is always activated and deactivated at the same tie with the hw's one. - * Thus we don't need refcounting on the sw adapter. - * Regardless of NIC's feature we use separate lock so that anybody can lock - * me independently from the hw adapter. 
- * Make sure nm_register is NULL to be handled as FALSE in nma_is_hw - */ -static void -netmap_attach_sw(struct ifnet *ifp) -{ - struct netmap_adapter *hw_na = NA(ifp); - struct netmap_adapter *na = SWNA(ifp); - - na->ifp = ifp; - na->num_rx_rings = na->num_tx_rings = 1; - na->num_tx_desc = hw_na->num_tx_desc; - na->num_rx_desc = hw_na->num_rx_desc; - na->nm_txsync = netmap_bdg_to_host; - /* we use the same memory allocator as the - * the hw adapter */ - na->nm_mem = hw_na->nm_mem; -} - - -/* exported to kernel callers, e.g. OVS ? - * Entry point. - * Called without NMG_LOCK. - */ -int -netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) -{ - struct nm_bridge *b; - struct netmap_adapter *na; - struct ifnet *iter; - char *name = nmr->nr_name; - int cmd = nmr->nr_cmd, namelen = strlen(name); - int error = 0, i, j; - - switch (cmd) { - case NETMAP_BDG_ATTACH: - error = nm_bdg_attach(nmr); - break; - - case NETMAP_BDG_DETACH: - error = nm_bdg_detach(nmr); - break; - - case NETMAP_BDG_LIST: - /* this is used to enumerate bridges and ports */ - if (namelen) { /* look up indexes of bridge and port */ - if (strncmp(name, NM_NAME, strlen(NM_NAME))) { - error = EINVAL; - break; - } - NMG_LOCK(); - b = nm_find_bridge(name, 0 /* don't create */); - if (!b) { - error = ENOENT; - NMG_UNLOCK(); - break; - } - - error = ENOENT; - for (j = 0; j < b->bdg_active_ports; j++) { - i = b->bdg_port_index[j]; - na = b->bdg_ports[i]; - if (na == NULL) { - D("---AAAAAAAAARGH-------"); - continue; - } - iter = na->ifp; - /* the former and the latter identify a - * virtual port and a NIC, respectively - */ - if (!strcmp(iter->if_xname, name) || - (namelen > b->bdg_namelen && - !strcmp(iter->if_xname, - name + b->bdg_namelen + 1))) { - /* bridge index */ - nmr->nr_arg1 = b - nm_bridges; - nmr->nr_arg2 = i; /* port index */ - error = 0; - break; - } - } - NMG_UNLOCK(); - } else { - /* return the first non-empty entry starting from - * bridge nr_arg1 and port nr_arg2. - * - * Users can detect the end of the same bridge by - * seeing the new and old value of nr_arg1, and can - * detect the end of all the bridge by error != 0 - */ - i = nmr->nr_arg1; - j = nmr->nr_arg2; - - NMG_LOCK(); - for (error = ENOENT; i < NM_BRIDGES; i++) { - b = nm_bridges + i; - if (j >= b->bdg_active_ports) { - j = 0; /* following bridges scan from 0 */ - continue; - } - nmr->nr_arg1 = i; - nmr->nr_arg2 = j; - j = b->bdg_port_index[j]; - na = b->bdg_ports[j]; - iter = na->ifp; - strncpy(name, iter->if_xname, (size_t)IFNAMSIZ); - error = 0; - break; - } - NMG_UNLOCK(); - } - break; - - case NETMAP_BDG_LOOKUP_REG: - /* register a lookup function to the given bridge. - * nmr->nr_name may be just bridge's name (including ':' - * if it is not just NM_NAME). - */ - if (!func) { - error = EINVAL; - break; - } - NMG_LOCK(); - b = nm_find_bridge(name, 0 /* don't create */); - if (!b) { - error = EINVAL; - } else { - b->nm_bdg_lookup = func; - } - NMG_UNLOCK(); - break; - - default: - D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd); - error = EINVAL; - break; - } - return error; -} /* @@ -2284,13 +1609,12 @@ netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) * - NIOCGINFO * - SIOCGIFADDR just for convenience * - NIOCREGIF - * - NIOCUNREGIF * - NIOCTXSYNC * - NIOCRXSYNC * * Return 0 on success, errno otherwise. 
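
For context on the ioctls handled below, this is roughly the sequence a userspace client goes through with the new request layout. A hedged sketch with error handling omitted; NETMAP_API, the nmreq fields and NR_REG_ALL_NIC come from the updated net/netmap.h, and NETMAP_IF() from net/netmap_user.h:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <net/netmap.h>
#include <net/netmap_user.h>

static struct netmap_if *
bind_port(const char *ifname, int *fdp)
{
        struct nmreq req;
        void *mem;
        int fd = open("/dev/netmap", O_RDWR);

        memset(&req, 0, sizeof(req));
        strncpy(req.nr_name, ifname, sizeof(req.nr_name) - 1);
        req.nr_version = NETMAP_API;
        req.nr_flags = NR_REG_ALL_NIC;  /* NR_REG_ONE_NIC + nr_ringid picks one ring */
        /* req.nr_ringid |= NETMAP_NO_TX_POLL;  -- optional: no implicit txsync in poll() */
        ioctl(fd, NIOCREGIF, &req);     /* fills nr_memsize and nr_offset */

        mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        *fdp = fd;
        return NETMAP_IF(mem, req.nr_offset);   /* NIOCTXSYNC/NIOCRXSYNC can now be issued on fd */
}
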
*/ -static int +int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { @@ -2299,27 +1623,27 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, struct nmreq *nmr = (struct nmreq *) data; struct netmap_adapter *na = NULL; int error; - u_int i, lim; + u_int i, qfirst, qlast; struct netmap_if *nifp; struct netmap_kring *krings; (void)dev; /* UNUSED */ (void)fflag; /* UNUSED */ -#ifdef linux -#define devfs_get_cdevpriv(pp) \ - ({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; \ - (*pp ? 0 : ENOENT); }) - -/* devfs_set_cdevpriv cannot fail on linux */ -#define devfs_set_cdevpriv(p, fn) \ - ({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); }) - - -#define devfs_clear_cdevpriv() do { \ - netmap_dtor(priv); ((struct file *)td)->private_data = 0; \ - } while (0) -#endif /* linux */ + if (cmd == NIOCGINFO || cmd == NIOCREGIF) { + /* truncate name */ + nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; + if (nmr->nr_version != NETMAP_API) { + D("API mismatch for %s got %d need %d", + nmr->nr_name, + nmr->nr_version, NETMAP_API); + nmr->nr_version = NETMAP_API; + } + if (nmr->nr_version < NETMAP_MIN_API || + nmr->nr_version > NETMAP_MAX_API) { + return EINVAL; + } + } CURVNET_SET(TD_TO_VNET(td)); error = devfs_get_cdevpriv((void **)&priv); @@ -2330,16 +1654,8 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, return (error == ENOENT ? ENXIO : error); } - nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0'; /* truncate name */ switch (cmd) { case NIOCGINFO: /* return capabilities etc */ - if (nmr->nr_version != NETMAP_API) { - D("API mismatch got %d have %d", - nmr->nr_version, NETMAP_API); - nmr->nr_version = NETMAP_API; - error = EINVAL; - break; - } if (nmr->nr_cmd == NETMAP_BDG_LIST) { error = netmap_bdg_ctl(nmr, NULL); break; @@ -2353,14 +1669,14 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, if (nmr->nr_name[0] != '\0') { /* get a refcount */ - error = get_ifp(nmr, &ifp, 1 /* create */); + error = netmap_get_na(nmr, &na, 1 /* create */); if (error) break; - na = NA(ifp); /* retrieve the netmap adapter */ - nmd = na->nm_mem; /* and its memory allocator */ + nmd = na->nm_mem; /* get memory allocator */ } - - error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags); + + error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags, + &nmr->nr_arg2); if (error) break; if (na == NULL) /* only memory info */ @@ -2372,23 +1688,16 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, nmr->nr_tx_rings = na->num_tx_rings; nmr->nr_rx_slots = na->num_rx_desc; nmr->nr_tx_slots = na->num_tx_desc; - if (memflags & NETMAP_MEM_PRIVATE) - nmr->nr_ringid |= NETMAP_PRIV_MEM; + netmap_adapter_put(na); } while (0); - if (ifp) - nm_if_rele(ifp); /* return the refcount */ NMG_UNLOCK(); break; case NIOCREGIF: - if (nmr->nr_version != NETMAP_API) { - nmr->nr_version = NETMAP_API; - error = EINVAL; - break; - } /* possibly attach/detach NIC and VALE switch */ i = nmr->nr_cmd; - if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) { + if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH + || i == NETMAP_BDG_VNET_HDR) { error = netmap_bdg_ctl(nmr, NULL); break; } else if (i != 0) { @@ -2402,53 +1711,58 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, do { u_int memflags; - if (priv->np_ifp != NULL) { /* thread already registered */ - error = netmap_set_ringid(priv, nmr->nr_ringid); + if (priv->np_na != NULL) { /* thread already registered */ + error = EBUSY; break; } /* find the interface and a reference */ - error = 
get_ifp(nmr, &ifp, 1 /* create */); /* keep reference */ + error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */ if (error) break; - if (NETMAP_OWNED_BY_KERN(ifp)) { - nm_if_rele(ifp); + ifp = na->ifp; + if (NETMAP_OWNED_BY_KERN(na)) { + netmap_adapter_put(na); error = EBUSY; break; } - nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error); + nifp = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags, &error); if (!nifp) { /* reg. failed, release priv and ref */ - nm_if_rele(ifp); /* return the refcount */ - priv->np_ifp = NULL; + netmap_adapter_put(na); priv->np_nifp = NULL; break; } + priv->np_td = td; // XXX kqueue, debugging only /* return the offset of the netmap_if object */ - na = NA(ifp); /* retrieve netmap adapter */ nmr->nr_rx_rings = na->num_rx_rings; nmr->nr_tx_rings = na->num_tx_rings; nmr->nr_rx_slots = na->num_rx_desc; nmr->nr_tx_slots = na->num_tx_desc; - error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags); + error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags, + &nmr->nr_arg2); if (error) { - nm_if_rele(ifp); + netmap_adapter_put(na); break; } if (memflags & NETMAP_MEM_PRIVATE) { - nmr->nr_ringid |= NETMAP_PRIV_MEM; *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM; } + priv->np_txsi = (priv->np_txqlast - priv->np_txqfirst > 1) ? + &na->tx_si : &na->tx_rings[priv->np_txqfirst].si; + priv->np_rxsi = (priv->np_rxqlast - priv->np_rxqfirst > 1) ? + &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si; + + if (nmr->nr_arg3) { + D("requested %d extra buffers", nmr->nr_arg3); + nmr->nr_arg3 = netmap_extra_alloc(na, + &nifp->ni_bufs_head, nmr->nr_arg3); + D("got %d extra buffers", nmr->nr_arg3); + } nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp); } while (0); NMG_UNLOCK(); break; - case NIOCUNREGIF: - // XXX we have no data here ? - D("deprecated, data is %p", nmr); - error = EINVAL; - break; - case NIOCTXSYNC: case NIOCRXSYNC: nifp = priv->np_nifp; @@ -2459,30 +1773,32 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, } rmb(); /* make sure following reads are not from cache */ - ifp = priv->np_ifp; /* we have a reference */ + na = priv->np_na; /* we have a reference */ - if (ifp == NULL) { - D("Internal error: nifp != NULL && ifp == NULL"); + if (na == NULL) { + D("Internal error: nifp != NULL && na == NULL"); error = ENXIO; break; } - na = NA(ifp); /* retrieve netmap adapter */ - if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */ - if (cmd == NIOCTXSYNC) - netmap_txsync_to_host(na); - else - netmap_rxsync_from_host(na, NULL, NULL); + ifp = na->ifp; + if (ifp == NULL) { + RD(1, "the ifp is gone"); + error = ENXIO; break; } - /* find the last ring to scan */ - lim = priv->np_qlast; - if (lim == NETMAP_HW_RING) - lim = (cmd == NIOCTXSYNC) ? - na->num_tx_rings : na->num_rx_rings; - - krings = (cmd == NIOCTXSYNC) ? 
na->tx_rings : na->rx_rings; - for (i = priv->np_qfirst; i < lim; i++) { + + if (cmd == NIOCTXSYNC) { + krings = na->tx_rings; + qfirst = priv->np_txqfirst; + qlast = priv->np_txqlast; + } else { + krings = na->rx_rings; + qfirst = priv->np_rxqfirst; + qlast = priv->np_rxqlast; + } + + for (i = qfirst; i < qlast; i++) { struct netmap_kring *kring = krings + i; if (nm_kr_tryget(kring)) { error = EBUSY; @@ -2493,13 +1809,17 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, D("pre txsync ring %d cur %d hwcur %d", i, kring->ring->cur, kring->nr_hwcur); - na->nm_txsync(ifp, i, NAF_FORCE_RECLAIM); + if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { + netmap_ring_reinit(kring); + } else { + kring->nm_sync(kring, NAF_FORCE_RECLAIM); + } if (netmap_verbose & NM_VERB_TXSYNC) D("post txsync ring %d cur %d hwcur %d", i, kring->ring->cur, kring->nr_hwcur); } else { - na->nm_rxsync(ifp, i, NAF_FORCE_READ); + kring->nm_sync(kring, NAF_FORCE_READ); microtime(&na->rx_rings[i].ring->ts); } nm_kr_put(kring); @@ -2508,6 +1828,11 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, break; #ifdef __FreeBSD__ + case FIONBIO: + case FIOASYNC: + ND("FIONBIO/FIOASYNC are no-ops"); + break; + case BIOCIMMEDIATE: case BIOCGHDRCMPLT: case BIOCSHDRCMPLT: @@ -2521,15 +1846,17 @@ netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, bzero(&so, sizeof(so)); NMG_LOCK(); - error = get_ifp(nmr, &ifp, 0 /* don't create */); /* keep reference */ + error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */ if (error) { + netmap_adapter_put(na); NMG_UNLOCK(); break; } + ifp = na->ifp; so.so_vnet = ifp->if_vnet; // so->so_proto not null. error = ifioctl(&so, cmd, data, td); - nm_if_rele(ifp); + netmap_adapter_put(na); NMG_UNLOCK(); break; } @@ -2560,7 +1887,7 @@ out: * The first one is remapped to pwait as selrecord() uses the name as an * hidden argument. */ -static int +int netmap_poll(struct cdev *dev, int events, struct thread *td) { struct netmap_priv_d *priv = NULL; @@ -2568,15 +1895,33 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) struct ifnet *ifp; struct netmap_kring *kring; u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0; - u_int lim_tx, lim_rx, host_forwarded = 0; - struct mbq q = { NULL, NULL, 0 }; + struct mbq q; /* packets from hw queues to host stack */ void *pwait = dev; /* linux compatibility */ + int is_kevent = 0; - int retry_tx = 1; + /* + * In order to avoid nested locks, we need to "double check" + * txsync and rxsync if we decide to do a selrecord(). + * retry_tx (and retry_rx, later) prevent looping forever. + */ + int retry_tx = 1, retry_rx = 1; (void)pwait; + mbq_init(&q); - if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL) + /* + * XXX kevent has curthread->tp_fop == NULL, + * so devfs_get_cdevpriv() fails. We circumvent this by passing + * priv as the first argument, which is also useful to avoid + * the selrecord() which are not necessary in that case. + */ + if (devfs_get_cdevpriv((void **)&priv) != 0) { + is_kevent = 1; + if (netmap_verbose) + D("called from kevent"); + priv = (struct netmap_priv_d *)dev; + } + if (priv == NULL) return POLLERR; if (priv->np_nifp == NULL) { @@ -2585,53 +1930,26 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) } rmb(); /* make sure following reads are not from cache */ - ifp = priv->np_ifp; - // XXX check for deleting() ? 
+ na = priv->np_na; + ifp = na->ifp; + // check for deleted + if (ifp == NULL) { + RD(1, "the ifp is gone"); + return POLLERR; + } + if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) return POLLERR; if (netmap_verbose & 0x8000) - D("device %s events 0x%x", ifp->if_xname, events); + D("device %s events 0x%x", NM_IFPNAME(ifp), events); want_tx = events & (POLLOUT | POLLWRNORM); want_rx = events & (POLLIN | POLLRDNORM); - na = NA(ifp); /* retrieve netmap adapter */ - - lim_tx = na->num_tx_rings; - lim_rx = na->num_rx_rings; - - if (priv->np_qfirst == NETMAP_SW_RING) { - /* handle the host stack ring */ - if (priv->np_txpoll || want_tx) { - /* push any packets up, then we are always ready */ - netmap_txsync_to_host(na); - revents |= want_tx; - } - if (want_rx) { - kring = &na->rx_rings[lim_rx]; - if (kring->ring->avail == 0) - netmap_rxsync_from_host(na, td, dev); - if (kring->ring->avail > 0) { - revents |= want_rx; - } - } - return (revents); - } - - /* if we are in transparent mode, check also the host rx ring */ - kring = &na->rx_rings[lim_rx]; - if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all - && want_rx - && (netmap_fwd || kring->ring->flags & NR_FORWARD) ) { - if (kring->ring->avail == 0) - netmap_rxsync_from_host(na, td, dev); - if (kring->ring->avail > 0) - revents |= want_rx; - } /* - * check_all is set if the card has more than one queue AND - * the client is polling all of them. If true, we sleep on + * check_all_{tx|rx} are set if the card has more than one queue AND + * the file descriptor is bound to all of them. If so, we sleep on * the "global" selinfo, otherwise we sleep on individual selinfo * (FreeBSD only allows two selinfo's per file descriptor). * The interrupt routine in the driver wake one or the other @@ -2642,104 +1960,113 @@ netmap_poll(struct cdev *dev, int events, struct thread *td) * there are pending packets to send. The latter can be disabled * passing NETMAP_NO_TX_POLL in the NIOCREG call. */ - check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1); - check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1); - - if (priv->np_qlast != NETMAP_HW_RING) { - lim_tx = lim_rx = priv->np_qlast; - } + check_all_tx = nm_tx_si_user(priv); + check_all_rx = nm_rx_si_user(priv); /* - * We start with a lock free round which is good if we have - * data available. If this fails, then lock and call the sync + * We start with a lock free round which is cheap if we have + * slots available. If this fails, then lock and call the sync * routines. */ - for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) { + for (i = priv->np_rxqfirst; want_rx && i < priv->np_rxqlast; i++) { kring = &na->rx_rings[i]; - if (kring->ring->avail > 0) { + /* XXX compare ring->cur and kring->tail */ + if (!nm_ring_empty(kring->ring)) { revents |= want_rx; want_rx = 0; /* also breaks the loop */ } } - for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) { + for (i = priv->np_txqfirst; want_tx && i < priv->np_txqlast; i++) { kring = &na->tx_rings[i]; - if (kring->ring->avail > 0) { + /* XXX compare ring->cur and kring->tail */ + if (!nm_ring_empty(kring->ring)) { revents |= want_tx; want_tx = 0; /* also breaks the loop */ } } /* - * If we to push packets out (priv->np_txpoll) or want_tx is - * still set, we do need to run the txsync calls (on all rings, - * to avoid that the tx rings stall). + * If we want to push packets out (priv->np_txpoll) or + * want_tx is still set, we must issue txsync calls + * (on all rings, to avoid that the tx rings stall). 
+ * XXX should also check cur != hwcur on the tx rings. + * Fortunately, normal tx mode has np_txpoll set. */ if (priv->np_txpoll || want_tx) { - /* If we really want to be woken up (want_tx), - * do a selrecord, either on the global or on - * the private structure. Then issue the txsync - * so there is no race in the selrecord/selwait + /* + * The first round checks if anyone is ready, if not + * do a selrecord and another round to handle races. + * want_tx goes to 0 if any space is found, and is + * used to skip rings with no pending transmissions. */ flush_tx: - for (i = priv->np_qfirst; i < lim_tx; i++) { + for (i = priv->np_txqfirst; i < priv->np_txqlast; i++) { + int found = 0; + kring = &na->tx_rings[i]; - /* - * Skip this ring if want_tx == 0 - * (we have already done a successful sync on - * a previous ring) AND kring->cur == kring->hwcur - * (there are no pending transmissions for this ring). - */ if (!want_tx && kring->ring->cur == kring->nr_hwcur) continue; - /* make sure only one user thread is doing this */ + /* only one thread does txsync */ if (nm_kr_tryget(kring)) { - ND("ring %p busy is %d", kring, (int)kring->nr_busy); - revents |= POLLERR; - goto out; + if (netmap_verbose) + RD(2, "%p lost race on txring %d, ok", + priv, i); + continue; } - - if (netmap_verbose & NM_VERB_TXSYNC) - D("send %d on %s %d", - kring->ring->cur, ifp->if_xname, i); - if (na->nm_txsync(ifp, i, 0)) + if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) { + netmap_ring_reinit(kring); revents |= POLLERR; - - /* Check avail/call selrecord only if called with POLLOUT */ - if (want_tx) { - if (kring->ring->avail > 0) { - /* stop at the first ring. We don't risk - * starvation. - */ - revents |= want_tx; - want_tx = 0; - } + } else { + if (kring->nm_sync(kring, 0)) + revents |= POLLERR; } + + /* + * If we found new slots, notify potential + * listeners on the same ring. + * Since we just did a txsync, look at the copies + * of cur,tail in the kring. + */ + found = kring->rcur != kring->rtail; nm_kr_put(kring); + if (found) { /* notify other listeners */ + revents |= want_tx; + want_tx = 0; + na->nm_notify(na, i, NR_TX, 0); + } } - if (want_tx && retry_tx) { + if (want_tx && retry_tx && !is_kevent) { selrecord(td, check_all_tx ? - &na->tx_si : &na->tx_rings[priv->np_qfirst].si); + &na->tx_si : &na->tx_rings[priv->np_txqfirst].si); retry_tx = 0; goto flush_tx; } } /* - * now if want_rx is still set we need to lock and rxsync. + * If want_rx is still set scan receive rings. * Do it on all rings because otherwise we starve. */ if (want_rx) { - int retry_rx = 1; + int send_down = 0; /* transparent mode */ + /* two rounds here to for race avoidance */ do_retry_rx: - for (i = priv->np_qfirst; i < lim_rx; i++) { + for (i = priv->np_rxqfirst; i < priv->np_rxqlast; i++) { + int found = 0; + kring = &na->rx_rings[i]; if (nm_kr_tryget(kring)) { - revents |= POLLERR; - goto out; + if (netmap_verbose) + RD(2, "%p lost race on rxring %d, ok", + priv, i); + continue; } - /* XXX NR_FORWARD should only be read on + /* + * transparent mode support: collect packets + * from the rxring(s). 
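
Seen from userspace, the receive half of this poll logic pairs with a loop like the following. A hedged sketch assuming the descriptor and rings were set up as in the earlier binding example and using the net/netmap_user.h helpers; consume_frame() is a hypothetical callback:

#include <poll.h>
#include <stdint.h>
#include <sys/types.h>
#include <net/netmap_user.h>

extern void consume_frame(const char *buf, uint16_t len);       /* hypothetical */

static void
rx_loop(int fd, struct netmap_if *nifp, u_int nrings)
{
        struct pollfd pfd = { .fd = fd, .events = POLLIN };

        for (;;) {
                u_int i;

                poll(&pfd, 1, -1);      /* kernel runs rxsync on the bound rings */
                for (i = 0; i < nrings; i++) {
                        struct netmap_ring *ring = NETMAP_RXRING(nifp, i);

                        while (!nm_ring_empty(ring)) {  /* cur != tail */
                                struct netmap_slot *slot = &ring->slot[ring->cur];

                                consume_frame(NETMAP_BUF(ring, slot->buf_idx),
                                    slot->len);
                                ring->head = ring->cur =
                                    nm_ring_next(ring, ring->cur);
                        }
                }
        }
}
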
+ * XXX NR_FORWARD should only be read on * physical or NIC ports */ if (netmap_fwd ||kring->ring->flags & NR_FORWARD) { @@ -2748,50 +2075,142 @@ do_retry_rx: netmap_grab_packets(kring, &q, netmap_fwd); } - if (na->nm_rxsync(ifp, i, 0)) + if (kring->nm_sync(kring, 0)) revents |= POLLERR; if (netmap_no_timestamp == 0 || kring->ring->flags & NR_TIMESTAMP) { microtime(&kring->ring->ts); } - - if (kring->ring->avail > 0) { + /* after an rxsync we can use kring->rcur, rtail */ + found = kring->rcur != kring->rtail; + nm_kr_put(kring); + if (found) { revents |= want_rx; retry_rx = 0; + na->nm_notify(na, i, NR_RX, 0); } - nm_kr_put(kring); } - if (retry_rx) { - retry_rx = 0; + + /* transparent mode XXX only during first pass ? */ + if (na->na_flags & NAF_HOST_RINGS) { + kring = &na->rx_rings[na->num_rx_rings]; + if (check_all_rx + && (netmap_fwd || kring->ring->flags & NR_FORWARD)) { + /* XXX fix to use kring fields */ + if (nm_ring_empty(kring->ring)) + send_down = netmap_rxsync_from_host(na, td, dev); + if (!nm_ring_empty(kring->ring)) + revents |= want_rx; + } + } + + if (retry_rx && !is_kevent) selrecord(td, check_all_rx ? - &na->rx_si : &na->rx_rings[priv->np_qfirst].si); - goto do_retry_rx; + &na->rx_si : &na->rx_rings[priv->np_rxqfirst].si); + if (send_down > 0 || retry_rx) { + retry_rx = 0; + if (send_down) + goto flush_tx; /* and retry_rx */ + else + goto do_retry_rx; } } - /* forward host to the netmap ring. - * I am accessing nr_hwavail without lock, but netmap_transmit - * can only increment it, so the operation is safe. + /* + * Transparent mode: marked bufs on rx rings between + * kring->nr_hwcur and ring->head + * are passed to the other endpoint. + * + * In this mode we also scan the sw rxring, which in + * turn passes packets up. + * + * XXX Transparent mode at the moment requires to bind all + * rings to a single file descriptor. */ - kring = &na->rx_rings[lim_rx]; - if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all - && (netmap_fwd || kring->ring->flags & NR_FORWARD) - && kring->nr_hwavail > 0 && !host_forwarded) { - netmap_sw_to_nic(na); - host_forwarded = 1; /* prevent another pass */ - want_rx = 0; - goto flush_tx; - } if (q.head) - netmap_send_up(na->ifp, q.head); - -out: + netmap_send_up(na->ifp, &q); return (revents); } -/*------- driver support routines ------*/ + +/*-------------------- driver support routines -------------------*/ + +static int netmap_hw_krings_create(struct netmap_adapter *); + +static int +netmap_notify(struct netmap_adapter *na, u_int n_ring, + enum txrx tx, int flags) +{ + struct netmap_kring *kring; + + if (tx == NR_TX) { + kring = na->tx_rings + n_ring; + OS_selwakeup(&kring->si, PI_NET); + if (na->tx_si_users > 0) + OS_selwakeup(&na->tx_si, PI_NET); + } else { + kring = na->rx_rings + n_ring; + OS_selwakeup(&kring->si, PI_NET); + if (na->rx_si_users > 0) + OS_selwakeup(&na->rx_si, PI_NET); + } + return 0; +} + + +// XXX check handling of failures +int +netmap_attach_common(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + + if (na->num_tx_rings == 0 || na->num_rx_rings == 0) { + D("%s: invalid rings tx %d rx %d", + ifp->if_xname, na->num_tx_rings, na->num_rx_rings); + return EINVAL; + } + WNA(ifp) = na; + + /* the following is only needed for na that use the host port. + * XXX do we have something similar for linux ? 
+ */ +#ifdef __FreeBSD__ + na->if_input = ifp->if_input; /* for netmap_send_up */ +#endif /* __FreeBSD__ */ + + NETMAP_SET_CAPABLE(ifp); + if (na->nm_krings_create == NULL) { + na->nm_krings_create = netmap_hw_krings_create; + na->nm_krings_delete = netmap_hw_krings_delete; + } + if (na->nm_notify == NULL) + na->nm_notify = netmap_notify; + na->active_fds = 0; + + if (na->nm_mem == NULL) + na->nm_mem = &nm_mem; + return 0; +} + + +void +netmap_detach_common(struct netmap_adapter *na) +{ + if (na->ifp) + WNA(na->ifp) = NULL; /* XXX do we need this? */ + + if (na->tx_rings) { /* XXX should not happen */ + D("freeing leftover tx_rings"); + na->nm_krings_delete(na); + } + netmap_pipe_dealloc(na); + if (na->na_flags & NAF_MEM_OWNER) + netmap_mem_private_delete(na->nm_mem); + bzero(na, sizeof(*na)); + free(na, M_DEVBUF); +} /* @@ -2801,61 +2220,94 @@ out: * of hardware rings): * krings 0..N-1 are for the hardware queues. * kring N is for the host stack queue - * kring N+1 is only used for the selinfo for all queues. + * kring N+1 is only used for the selinfo for all queues. // XXX still true ? * Return 0 on success, ENOMEM otherwise. - * - * By default the receive and transmit adapter ring counts are both initialized - * to num_queues. na->num_tx_rings can be set for cards with different tx/rx - * setups. */ int -netmap_attach(struct netmap_adapter *arg, u_int num_queues) +netmap_attach(struct netmap_adapter *arg) { - struct netmap_adapter *na = NULL; + struct netmap_hw_adapter *hwna = NULL; + // XXX when is arg == NULL ? struct ifnet *ifp = arg ? arg->ifp : NULL; - size_t len; if (arg == NULL || ifp == NULL) goto fail; - /* a VALE port uses two endpoints */ - len = nma_is_vp(arg) ? sizeof(*na) : sizeof(*na) * 2; - na = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); - if (na == NULL) + hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (hwna == NULL) goto fail; - WNA(ifp) = na; - *na = *arg; /* copy everything, trust the driver to not pass junk */ - NETMAP_SET_CAPABLE(ifp); - if (na->num_tx_rings == 0) - na->num_tx_rings = num_queues; - na->num_rx_rings = num_queues; - na->refcount = na->na_single = na->na_multi = 0; - /* Core lock initialized here, others after netmap_if_new. */ - mtx_init(&na->core_lock, "netmap core lock", MTX_NETWORK_LOCK, MTX_DEF); + hwna->up = *arg; + hwna->up.na_flags |= NAF_HOST_RINGS; + if (netmap_attach_common(&hwna->up)) { + free(hwna, M_DEVBUF); + goto fail; + } + netmap_adapter_get(&hwna->up); + #ifdef linux if (ifp->netdev_ops) { - ND("netdev_ops %p", ifp->netdev_ops); /* prepare a clone of the netdev ops */ #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28) - na->nm_ndo.ndo_start_xmit = ifp->netdev_ops; + hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops; #else - na->nm_ndo = *ifp->netdev_ops; + hwna->nm_ndo = *ifp->netdev_ops; #endif } - na->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; + hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit; #endif /* linux */ - na->nm_mem = arg->nm_mem ? arg->nm_mem : &nm_mem; - if (!nma_is_vp(arg)) - netmap_attach_sw(ifp); - D("success for %s", ifp->if_xname); + + D("success for %s", NM_IFPNAME(ifp)); return 0; fail: - D("fail, arg %p ifp %p na %p", arg, ifp, na); + D("fail, arg %p ifp %p na %p", arg, ifp, hwna); netmap_detach(ifp); - return (na ? EINVAL : ENOMEM); + return (hwna ? 
EINVAL : ENOMEM); } +void +NM_DBG(netmap_adapter_get)(struct netmap_adapter *na) +{ + if (!na) { + return; + } + + refcount_acquire(&na->na_refcount); +} + + +/* returns 1 iff the netmap_adapter is destroyed */ +int +NM_DBG(netmap_adapter_put)(struct netmap_adapter *na) +{ + if (!na) + return 1; + + if (!refcount_release(&na->na_refcount)) + return 0; + + if (na->nm_dtor) + na->nm_dtor(na); + + netmap_detach_common(na); + + return 1; +} + +int +netmap_hw_krings_create(struct netmap_adapter *na) +{ + int ret = netmap_krings_create(na, 0); + if (ret == 0) { + /* initialize the mbq for the sw rx ring */ + mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue); + ND("initialized sw rx queue %d", na->num_rx_rings); + } + return ret; +} + + + /* * Free the allocated memory linked to the given ``netmap_adapter`` * object. @@ -2868,124 +2320,91 @@ netmap_detach(struct ifnet *ifp) if (!na) return; - mtx_destroy(&na->core_lock); - - if (na->tx_rings) { /* XXX should not happen */ - D("freeing leftover tx_rings"); - free(na->tx_rings, M_DEVBUF); + NMG_LOCK(); + netmap_disable_all_rings(ifp); + if (!netmap_adapter_put(na)) { + /* someone is still using the adapter, + * tell them that the interface is gone + */ + na->ifp = NULL; + /* give them a chance to notice */ + netmap_enable_all_rings(ifp); } - if (na->na_flags & NAF_MEM_OWNER) - netmap_mem_private_delete(na->nm_mem); - bzero(na, sizeof(*na)); - WNA(ifp) = NULL; - free(na, M_DEVBUF); + NMG_UNLOCK(); } -int -nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, - struct netmap_adapter *na, u_int ring_nr); - - /* * Intercept packets from the network stack and pass them * to netmap as incoming packets on the 'software' ring. + * + * We only store packets in a bounded mbq and then copy them + * in the relevant rxsync routine. + * * We rely on the OS to make sure that the ifp and na do not go * away (typically the caller checks for IFF_DRV_RUNNING or the like). * In nm_register() or whenever there is a reinitialization, - * we make sure to access the core lock and per-ring locks - * so that IFCAP_NETMAP is visible here. + * we make sure to make the mode change visible here. 
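
On the driver side, the reworked attach path above is reached through netmap_attach(), which now takes only the template adapter: the ring counts must be filled in by the driver itself. A hedged sketch of a driver attach routine under these assumptions; the foo_* names and the foo_softc layout are hypothetical, and the callback prototypes follow the nm_txsync/nm_rxsync/nm_register signatures used by this version of netmap_kern.h:

#include <dev/netmap/netmap_kern.h>

struct foo_softc {                      /* hypothetical per-device state */
        struct ifnet    *ifp;
        u_int           num_queues, num_tx_desc, num_rx_desc;
};

int foo_netmap_txsync(struct netmap_adapter *, u_int, int);
int foo_netmap_rxsync(struct netmap_adapter *, u_int, int);
int foo_netmap_reg(struct netmap_adapter *, int);

static void
foo_netmap_attach(struct foo_softc *sc)
{
        struct netmap_adapter na;

        bzero(&na, sizeof(na));
        na.ifp = sc->ifp;
        na.num_tx_desc = sc->num_tx_desc;
        na.num_rx_desc = sc->num_rx_desc;
        na.num_tx_rings = na.num_rx_rings = sc->num_queues;
        na.nm_txsync = foo_netmap_txsync;
        na.nm_rxsync = foo_netmap_rxsync;
        na.nm_register = foo_netmap_reg;
        netmap_attach(&na);     /* the core copies na into its own netmap_hw_adapter */
}
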
*/ int netmap_transmit(struct ifnet *ifp, struct mbuf *m) { struct netmap_adapter *na = NA(ifp); struct netmap_kring *kring; - u_int i, len = MBUF_LEN(m); - u_int error = EBUSY, lim; - struct netmap_slot *slot; + u_int len = MBUF_LEN(m); + u_int error = ENOBUFS; + struct mbq *q; + int space; // XXX [Linux] we do not need this lock // if we follow the down/configure/up protocol -gl // mtx_lock(&na->core_lock); + if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) { - /* interface not in netmap mode anymore */ + D("%s not in netmap mode anymore", NM_IFPNAME(ifp)); error = ENXIO; goto done; } kring = &na->rx_rings[na->num_rx_rings]; - lim = kring->nkr_num_slots - 1; - if (netmap_verbose & NM_VERB_HOST) - D("%s packet %d len %d from the stack", ifp->if_xname, - kring->nr_hwcur + kring->nr_hwavail, len); + q = &kring->rx_queue; + // XXX reconsider long packets if we handle fragments if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */ - D("%s from_host, drop packet size %d > %d", ifp->if_xname, + D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp), len, NETMAP_BDG_BUF_SIZE(na->nm_mem)); goto done; } - if (SWNA(ifp)->na_bdg) { - struct nm_bdg_fwd *ft; - char *dst; - - na = SWNA(ifp); /* we operate on the host port */ - ft = na->rx_rings[0].nkr_ft; - dst = BDG_NMB(na->nm_mem, &na->rx_rings[0].ring->slot[0]); - - /* use slot 0 in the ft, there is nothing queued here */ - /* XXX we can save the copy calling m_copydata in nm_bdg_flush, - * need a special flag for this. - */ - m_copydata(m, 0, (int)len, dst); - ft->ft_flags = 0; - ft->ft_len = len; - ft->ft_buf = dst; - ft->ft_next = NM_FT_NULL; - ft->ft_frags = 1; - if (netmap_verbose & NM_VERB_HOST) - RD(5, "pkt %p size %d to bridge port %d", - dst, len, na->bdg_port); - nm_bdg_flush(ft, 1, na, 0); - na = NA(ifp); /* back to the regular object/lock */ - error = 0; - goto done; - } - /* protect against other instances of netmap_transmit, - * and userspace invocations of rxsync(). - * XXX could reuse core_lock + /* protect against rxsync_from_host(), netmap_sw_to_nic() + * and maybe other instances of netmap_transmit (the latter + * not possible on Linux). + * Also avoid overflowing the queue. */ - // XXX [Linux] there can be no other instances of netmap_transmit - // on this same ring, but we still need this lock to protect - // concurrent access from netmap_sw_to_nic() -gl - mtx_lock(&kring->q_lock); - if (kring->nr_hwavail >= lim) { - if (netmap_verbose) - D("stack ring %s full\n", ifp->if_xname); + mtx_lock(&q->lock); + + space = kring->nr_hwtail - kring->nr_hwcur; + if (space < 0) + space += kring->nkr_num_slots; + if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX + RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p", + NM_IFPNAME(ifp), kring->nr_hwcur, kring->nr_hwtail, mbq_len(q), + len, m); } else { - /* compute the insert position */ - i = nm_kr_rxpos(kring); - slot = &kring->ring->slot[i]; - m_copydata(m, 0, (int)len, BDG_NMB(na->nm_mem, slot)); - slot->len = len; - slot->flags = kring->nkr_slot_flags; - kring->nr_hwavail++; - if (netmap_verbose & NM_VERB_HOST) - D("wake up host ring %s %d", na->ifp->if_xname, na->num_rx_rings); - selwakeuppri(&kring->si, PI_NET); + mbq_enqueue(q, m); + ND(10, "%s %d bufs in queue len %d m %p", + NM_IFPNAME(ifp), mbq_len(q), len, m); + /* notify outside the lock */ + m = NULL; error = 0; } - mtx_unlock(&kring->q_lock); + mtx_unlock(&q->lock); done: - // mtx_unlock(&na->core_lock); - - /* release the mbuf in either cases of success or failure. 
As an - * alternative, put the mbuf in a free list and free the list - * only when really necessary. - */ - m_freem(m); + if (m) + m_freem(m); + /* unconditionally wake up listeners */ + na->nm_notify(na, na->num_rx_rings, NR_RX, 0); return (error); } @@ -2994,7 +2413,7 @@ done: /* * netmap_reset() is called by the driver routines when reinitializing * a ring. The driver is in charge of locking to protect the kring. - * If netmap mode is not set just return NULL. + * If native netmap mode is not set just return NULL. */ struct netmap_slot * netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, @@ -3024,26 +2443,32 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, if (n >= na->num_tx_rings) return NULL; kring = na->tx_rings + n; + // XXX check whether we should use hwcur or rcur new_hwofs = kring->nr_hwcur - new_cur; } else { if (n >= na->num_rx_rings) return NULL; kring = na->rx_rings + n; - new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur; + new_hwofs = kring->nr_hwtail - new_cur; } lim = kring->nkr_num_slots - 1; if (new_hwofs > lim) new_hwofs -= lim + 1; /* Always set the new offset value and realign the ring. */ - D("%s hwofs %d -> %d, hwavail %d -> %d", - tx == NR_TX ? "TX" : "RX", + if (netmap_verbose) + D("%s %s%d hwofs %d -> %d, hwtail %d -> %d", + NM_IFPNAME(na->ifp), + tx == NR_TX ? "TX" : "RX", n, kring->nkr_hwofs, new_hwofs, - kring->nr_hwavail, - tx == NR_TX ? lim : kring->nr_hwavail); + kring->nr_hwtail, + tx == NR_TX ? lim : kring->nr_hwtail); kring->nkr_hwofs = new_hwofs; - if (tx == NR_TX) - kring->nr_hwavail = lim; + if (tx == NR_TX) { + kring->nr_hwtail = kring->nr_hwcur + lim; + if (kring->nr_hwtail > lim) + kring->nr_hwtail -= lim + 1; + } #if 0 // def linux /* XXX check that the mappings are correct */ @@ -3060,137 +2485,59 @@ netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, * We do the wakeup here, but the ring is not yet reconfigured. * However, we are under lock so there are no races. */ - selwakeuppri(&kring->si, PI_NET); - selwakeuppri(tx == NR_TX ? &na->tx_si : &na->rx_si, PI_NET); + na->nm_notify(na, n, tx, 0); return kring->ring->slot; } /* - * Grab packets from a kring, move them into the ft structure - * associated to the tx (input) port. Max one instance per port, - * filtered on input (ioctl, poll or XXX). - * Returns the next position in the ring. - */ -static int -nm_bdg_preflush(struct netmap_adapter *na, u_int ring_nr, - struct netmap_kring *kring, u_int end) -{ - struct netmap_ring *ring = kring->ring; - struct nm_bdg_fwd *ft; - u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; - u_int ft_i = 0; /* start from 0 */ - u_int frags = 1; /* how many frags ? */ - struct nm_bridge *b = na->na_bdg; - - /* To protect against modifications to the bridge we acquire a - * shared lock, waiting if we can sleep (if the source port is - * attached to a user process) or with a trylock otherwise (NICs). - */ - ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); - if (na->na_flags & NAF_BDG_MAYSLEEP) - BDG_RLOCK(b); - else if (!BDG_RTRYLOCK(b)) - return 0; - ND(5, "rlock acquired for %d packets", ((j > end ? 
lim+1 : 0) + end) - j); - ft = kring->nkr_ft; - - for (; likely(j != end); j = nm_next(j, lim)) { - struct netmap_slot *slot = &ring->slot[j]; - char *buf; - - ft[ft_i].ft_len = slot->len; - ft[ft_i].ft_flags = slot->flags; - - ND("flags is 0x%x", slot->flags); - /* this slot goes into a list so initialize the link field */ - ft[ft_i].ft_next = NM_FT_NULL; - buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? - (void *)(uintptr_t)slot->ptr : BDG_NMB(na->nm_mem, slot); - prefetch(buf); - ++ft_i; - if (slot->flags & NS_MOREFRAG) { - frags++; - continue; - } - if (unlikely(netmap_verbose && frags > 1)) - RD(5, "%d frags at %d", frags, ft_i - frags); - ft[ft_i - frags].ft_frags = frags; - frags = 1; - if (unlikely((int)ft_i >= bridge_batch)) - ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); - } - if (frags > 1) { - D("truncate incomplete fragment at %d (%d frags)", ft_i, frags); - // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG - ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG; - ft[ft_i - frags].ft_frags = frags - 1; - } - if (ft_i) - ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); - BDG_RUNLOCK(b); - return j; -} - - -/* - * Pass packets from nic to the bridge. - * XXX TODO check locking: this is called from the interrupt - * handler so we should make sure that the interface is not - * disconnected while passing down an interrupt. + * Dispatch rx/tx interrupts to the netmap rings. + * + * "work_done" is non-null on the RX path, NULL for the TX path. + * We rely on the OS to make sure that there is only one active + * instance per queue, and that there is appropriate locking. * - * Note, no user process can access this NIC so we can ignore - * the info in the 'ring'. + * The 'notify' routine depends on what the ring is attached to. + * - for a netmap file descriptor, do a selwakeup on the individual + * waitqueue, plus one on the global one if needed + * - for a switch, call the proper forwarding routine + * - XXX more ? */ -static void -netmap_nic_to_bdg(struct ifnet *ifp, u_int ring_nr) +void +netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done) { struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring = &na->rx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - u_int j, k; - - /* make sure that only one thread is ever in here, - * after which we can unlock. Probably unnecessary XXX. - */ - if (nm_kr_tryget(kring)) - return; - /* fetch packets that have arrived. - * XXX maybe do this in a loop ? - */ - if (na->nm_rxsync(ifp, ring_nr, 0)) - goto put_out; - if (kring->nr_hwavail == 0 && netmap_verbose) { - D("how strange, interrupt with no packets on %s", - ifp->if_xname); - goto put_out; - } - k = nm_kr_rxpos(kring); + struct netmap_kring *kring; - j = nm_bdg_preflush(na, ring_nr, kring, k); + q &= NETMAP_RING_MASK; - /* we consume everything, but we cannot update kring directly - * because the nic may have destroyed the info in the NIC ring. - * So we need to call rxsync again to restore it. - */ - ring->cur = j; - ring->avail = 0; - na->nm_rxsync(ifp, ring_nr, 0); + if (netmap_verbose) { + RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); + } -put_out: - nm_kr_put(kring); - return; + if (work_done) { /* RX path */ + if (q >= na->num_rx_rings) + return; // not a physical queue + kring = na->rx_rings + q; + kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? 
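+		/* Wake up whoever is attached to this ring: nm_notify() does a
+		 * selwakeup for user file descriptors, or invokes the forwarding
+		 * routine when the ring belongs to a VALE port.
+		 */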
+ na->nm_notify(na, q, NR_RX, 0); + *work_done = 1; /* do not fire napi again */ + } else { /* TX path */ + if (q >= na->num_tx_rings) + return; // not a physical queue + kring = na->tx_rings + q; + na->nm_notify(na, q, NR_TX, 0); + } } /* * Default functions to handle rx/tx interrupts from a physical device. * "work_done" is non-null on the RX path, NULL for the TX path. - * We rely on the OS to make sure that there is only one active - * instance per queue, and that there is appropriate locking. * * If the card is not in netmap mode, simply return 0, * so that the caller proceeds with regular processing. + * Otherwise call netmap_common_irq() and return 1. * * If the card is connected to a netmap file descriptor, * do a selwakeup on the individual queue, plus one on the global one @@ -3203,871 +2550,66 @@ put_out: int netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done) { - struct netmap_adapter *na; - struct netmap_kring *kring; - + // XXX could we check NAF_NATIVE_ON ? if (!(ifp->if_capenable & IFCAP_NETMAP)) return 0; - q &= NETMAP_RING_MASK; - - if (netmap_verbose) - RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q); - na = NA(ifp); - if (na->na_flags & NAF_SKIP_INTR) { + if (NA(ifp)->na_flags & NAF_SKIP_INTR) { ND("use regular interrupt"); return 0; } - if (work_done) { /* RX path */ - if (q >= na->num_rx_rings) - return 0; // not a physical queue - kring = na->rx_rings + q; - kring->nr_kflags |= NKR_PENDINTR; // XXX atomic ? - if (na->na_bdg != NULL) { - netmap_nic_to_bdg(ifp, q); - } else { - selwakeuppri(&kring->si, PI_NET); - if (na->num_rx_rings > 1 /* or multiple listeners */ ) - selwakeuppri(&na->rx_si, PI_NET); - } - *work_done = 1; /* do not fire napi again */ - } else { /* TX path */ - if (q >= na->num_tx_rings) - return 0; // not a physical queue - kring = na->tx_rings + q; - selwakeuppri(&kring->si, PI_NET); - if (na->num_tx_rings > 1 /* or multiple listeners */ ) - selwakeuppri(&na->tx_si, PI_NET); - } + netmap_common_irq(ifp, q, work_done); return 1; } -#ifdef linux /* linux-specific routines */ - - /* - * Remap linux arguments into the FreeBSD call. - * - pwait is the poll table, passed as 'dev'; - * If pwait == NULL someone else already woke up before. We can report - * events but they are filtered upstream. - * If pwait != NULL, then pwait->key contains the list of events. - * - events is computed from pwait as above. - * - file is passed as 'td'; - */ -static u_int -linux_netmap_poll(struct file * file, struct poll_table_struct *pwait) -{ -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) - int events = POLLIN | POLLOUT; /* XXX maybe... */ -#elif LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) - int events = pwait ? pwait->key : POLLIN | POLLOUT; -#else /* in 3.4.0 field 'key' was renamed to '_key' */ - int events = pwait ? 
pwait->_key : POLLIN | POLLOUT; -#endif - return netmap_poll((void *)pwait, events, (void *)file); -} - - -static int -linux_netmap_mmap(struct file *f, struct vm_area_struct *vma) -{ - int error = 0; - unsigned long off, va; - vm_ooffset_t pa; - struct netmap_priv_d *priv = f->private_data; - /* - * vma->vm_start: start of mapping user address space - * vma->vm_end: end of the mapping user address space - * vma->vm_pfoff: offset of first page in the device - */ - - // XXX security checks - - error = netmap_get_memory(priv); - ND("get_memory returned %d", error); - if (error) - return -error; - - if ((vma->vm_start & ~PAGE_MASK) || (vma->vm_end & ~PAGE_MASK)) { - ND("vm_start = %lx vm_end = %lx", vma->vm_start, vma->vm_end); - return -EINVAL; - } - - for (va = vma->vm_start, off = vma->vm_pgoff; - va < vma->vm_end; - va += PAGE_SIZE, off++) - { - pa = netmap_mem_ofstophys(priv->np_mref, off << PAGE_SHIFT); - if (pa == 0) - return -EINVAL; - - ND("va %lx pa %p", va, pa); - error = remap_pfn_range(vma, va, pa >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot); - if (error) - return error; - } - return 0; -} - - -/* - * This one is probably already protected by the netif lock XXX - */ -static netdev_tx_t -linux_netmap_start_xmit(struct sk_buff *skb, struct net_device *dev) -{ - netmap_transmit(dev, skb); - return (NETDEV_TX_OK); -} - - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) // XXX was 37 -#define LIN_IOCTL_NAME .ioctl -int -linux_netmap_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long data /* arg */) -#else -#define LIN_IOCTL_NAME .unlocked_ioctl -long -linux_netmap_ioctl(struct file *file, u_int cmd, u_long data /* arg */) -#endif -{ - int ret; - struct nmreq nmr; - bzero(&nmr, sizeof(nmr)); - - if (cmd == NIOCTXSYNC || cmd == NIOCRXSYNC) { - data = 0; /* no argument required here */ - } - if (data && copy_from_user(&nmr, (void *)data, sizeof(nmr) ) != 0) - return -EFAULT; - ret = netmap_ioctl(NULL, cmd, (caddr_t)&nmr, 0, (void *)file); - if (data && copy_to_user((void*)data, &nmr, sizeof(nmr) ) != 0) - return -EFAULT; - return -ret; -} - - -static int -netmap_release(struct inode *inode, struct file *file) -{ - (void)inode; /* UNUSED */ - if (file->private_data) - netmap_dtor(file->private_data); - return (0); -} - - -static int -linux_netmap_open(struct inode *inode, struct file *file) -{ - struct netmap_priv_d *priv; - (void)inode; /* UNUSED */ - - priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, - M_NOWAIT | M_ZERO); - if (priv == NULL) - return -ENOMEM; - - file->private_data = priv; - - return (0); -} - - -static struct file_operations netmap_fops = { - .owner = THIS_MODULE, - .open = linux_netmap_open, - .mmap = linux_netmap_mmap, - LIN_IOCTL_NAME = linux_netmap_ioctl, - .poll = linux_netmap_poll, - .release = netmap_release, -}; - - -static struct miscdevice netmap_cdevsw = { /* same name as FreeBSD */ - MISC_DYNAMIC_MINOR, - "netmap", - &netmap_fops, -}; - -static int netmap_init(void); -static void netmap_fini(void); - - -/* Errors have negative values on linux */ -static int linux_netmap_init(void) -{ - return -netmap_init(); -} - -module_init(linux_netmap_init); -module_exit(netmap_fini); -/* export certain symbols to other modules */ -EXPORT_SYMBOL(netmap_attach); // driver attach routines -EXPORT_SYMBOL(netmap_detach); // driver detach routines -EXPORT_SYMBOL(netmap_ring_reinit); // ring init on error -EXPORT_SYMBOL(netmap_buffer_lut); -EXPORT_SYMBOL(netmap_total_buffers); // index check -EXPORT_SYMBOL(netmap_buffer_base); -EXPORT_SYMBOL(netmap_reset); 
// ring init routines -EXPORT_SYMBOL(netmap_buf_size); -EXPORT_SYMBOL(netmap_rx_irq); // default irq handler -EXPORT_SYMBOL(netmap_no_pendintr); // XXX mitigation - should go away -EXPORT_SYMBOL(netmap_bdg_ctl); // bridge configuration routine -EXPORT_SYMBOL(netmap_bdg_learning); // the default lookup function -EXPORT_SYMBOL(netmap_disable_all_rings); -EXPORT_SYMBOL(netmap_enable_all_rings); - - -MODULE_AUTHOR("http://info.iet.unipi.it/~luigi/netmap/"); -MODULE_DESCRIPTION("The netmap packet I/O framework"); -MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */ - -#else /* __FreeBSD__ */ - - -static struct cdevsw netmap_cdevsw = { - .d_version = D_VERSION, - .d_name = "netmap", - .d_open = netmap_open, - .d_mmap_single = netmap_mmap_single, - .d_ioctl = netmap_ioctl, - .d_poll = netmap_poll, - .d_close = netmap_close, -}; -#endif /* __FreeBSD__ */ - -/* - *---- support for virtual bridge ----- - */ - -/* ----- FreeBSD if_bridge hash function ------- */ - -/* - * The following hash function is adapted from "Hash Functions" by Bob Jenkins - * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). + * Module loader and unloader * - * http://www.burtleburtle.net/bob/hash/spooky.html + * netmap_init() creates the /dev/netmap device and initializes + * all global variables. Returns 0 on success, errno on failure + * (but there is no chance) + * + * netmap_fini() destroys everything. */ -#define mix(a, b, c) \ -do { \ - a -= b; a -= c; a ^= (c >> 13); \ - b -= c; b -= a; b ^= (a << 8); \ - c -= a; c -= b; c ^= (b >> 13); \ - a -= b; a -= c; a ^= (c >> 12); \ - b -= c; b -= a; b ^= (a << 16); \ - c -= a; c -= b; c ^= (b >> 5); \ - a -= b; a -= c; a ^= (c >> 3); \ - b -= c; b -= a; b ^= (a << 10); \ - c -= a; c -= b; c ^= (b >> 15); \ -} while (/*CONSTCOND*/0) - -static __inline uint32_t -nm_bridge_rthash(const uint8_t *addr) -{ - uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key - - b += addr[5] << 8; - b += addr[4]; - a += addr[3] << 24; - a += addr[2] << 16; - a += addr[1] << 8; - a += addr[0]; - - mix(a, b, c); -#define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) - return (c & BRIDGE_RTHASH_MASK); -} - -#undef mix - -static int -bdg_netmap_reg(struct ifnet *ifp, int onoff) -{ - /* the interface is already attached to the bridge, - * so we only need to toggle IFCAP_NETMAP. - */ - if (onoff) { - ifp->if_capenable |= IFCAP_NETMAP; - } else { - ifp->if_capenable &= ~IFCAP_NETMAP; - } - return 0; -} +static struct cdev *netmap_dev; /* /dev/netmap character device. */ +extern struct cdevsw netmap_cdevsw; -/* - * Lookup function for a learning bridge. - * Update the hash table with the source address, - * and then returns the destination port index, and the - * ring in *dst_ring (at the moment, always use ring 0) - */ -u_int -netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring, - struct netmap_adapter *na) +void +netmap_fini(void) { - struct nm_hash_ent *ht = na->na_bdg->ht; - uint32_t sh, dh; - u_int dst, mysrc = na->bdg_port; - uint64_t smac, dmac; - - if (buf_len < 14) { - D("invalid buf length %d", buf_len); - return NM_BDG_NOPORT; - } - dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; - smac = le64toh(*(uint64_t *)(buf + 4)); - smac >>= 16; - - /* - * The hash is somewhat expensive, there might be some - * worthwhile optimizations here. - */ - if ((buf[6] & 1) == 0) { /* valid src */ - uint8_t *s = buf+6; - sh = nm_bridge_rthash(s); // XXX hash of source - /* update source port forwarding entry */ - ht[sh].mac = smac; /* XXX expire ? 
*/ - ht[sh].ports = mysrc; - if (netmap_verbose) - D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", - s[0], s[1], s[2], s[3], s[4], s[5], mysrc); - } - dst = NM_BDG_BROADCAST; - if ((buf[0] & 1) == 0) { /* unicast */ - dh = nm_bridge_rthash(buf); // XXX hash of dst - if (ht[dh].mac == dmac) { /* found dst */ - dst = ht[dh].ports; - } - /* XXX otherwise return NM_BDG_UNKNOWN ? */ - } - *dst_ring = 0; - return dst; + // XXX destroy_bridges() ? + if (netmap_dev) + destroy_dev(netmap_dev); + netmap_mem_fini(); + NMG_LOCK_DESTROY(); + printf("netmap: unloaded module.\n"); } -/* - * This flush routine supports only unicast and broadcast but a large - * number of ports, and lets us replace the learn and dispatch functions. - */ int -nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_adapter *na, - u_int ring_nr) -{ - struct nm_bdg_q *dst_ents, *brddst; - uint16_t num_dsts = 0, *dsts; - struct nm_bridge *b = na->na_bdg; - u_int i, j, me = na->bdg_port; - - /* - * The work area (pointed by ft) is followed by an array of - * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS - * queues per port plus one for the broadcast traffic. - * Then we have an array of destination indexes. - */ - dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); - dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); - - /* first pass: find a destination for each packet in the batch */ - for (i = 0; likely(i < n); i += ft[i].ft_frags) { - uint8_t dst_ring = ring_nr; /* default, same ring as origin */ - uint16_t dst_port, d_i; - struct nm_bdg_q *d; - - ND("slot %d frags %d", i, ft[i].ft_frags); - dst_port = b->nm_bdg_lookup(ft[i].ft_buf, ft[i].ft_len, - &dst_ring, na); - if (netmap_verbose > 255) - RD(5, "slot %d port %d -> %d", i, me, dst_port); - if (dst_port == NM_BDG_NOPORT) - continue; /* this packet is identified to be dropped */ - else if (unlikely(dst_port > NM_BDG_MAXPORTS)) - continue; - else if (dst_port == NM_BDG_BROADCAST) - dst_ring = 0; /* broadcasts always go to ring 0 */ - else if (unlikely(dst_port == me || - !b->bdg_ports[dst_port])) - continue; - - /* get a position in the scratch pad */ - d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; - d = dst_ents + d_i; - - /* append the first fragment to the list */ - if (d->bq_head == NM_FT_NULL) { /* new destination */ - d->bq_head = d->bq_tail = i; - /* remember this position to be scanned later */ - if (dst_port != NM_BDG_BROADCAST) - dsts[num_dsts++] = d_i; - } else { - ft[d->bq_tail].ft_next = i; - d->bq_tail = i; - } - d->bq_len += ft[i].ft_frags; - } - - /* - * Broadcast traffic goes to ring 0 on all destinations. - * So we need to add these rings to the list of ports to scan. - * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is - * expensive. We should keep a compact list of active destinations - * so we could shorten this loop. 
- */ - brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; - if (brddst->bq_head != NM_FT_NULL) { - for (j = 0; likely(j < b->bdg_active_ports); j++) { - uint16_t d_i; - i = b->bdg_port_index[j]; - if (unlikely(i == me)) - continue; - d_i = i * NM_BDG_MAXRINGS; - if (dst_ents[d_i].bq_head == NM_FT_NULL) - dsts[num_dsts++] = d_i; - } - } - - ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts); - /* second pass: scan destinations (XXX will be modular somehow) */ - for (i = 0; i < num_dsts; i++) { - struct ifnet *dst_ifp; - struct netmap_adapter *dst_na; - struct netmap_kring *kring; - struct netmap_ring *ring; - u_int dst_nr, is_vp, lim, j, sent = 0, d_i, next, brd_next; - u_int needed, howmany; - int retry = netmap_txsync_retry; - struct nm_bdg_q *d; - uint32_t my_start = 0, lease_idx = 0; - int nrings; - - d_i = dsts[i]; - ND("second pass %d port %d", i, d_i); - d = dst_ents + d_i; - // XXX fix the division - dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; - /* protect from the lookup function returning an inactive - * destination port - */ - if (unlikely(dst_na == NULL)) - goto cleanup; - if (dst_na->na_flags & NAF_SW_ONLY) - goto cleanup; - dst_ifp = dst_na->ifp; - /* - * The interface may be in !netmap mode in two cases: - * - when na is attached but not activated yet; - * - when na is being deactivated but is still attached. - */ - if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) { - ND("not in netmap mode!"); - goto cleanup; - } - - /* there is at least one either unicast or broadcast packet */ - brd_next = brddst->bq_head; - next = d->bq_head; - /* we need to reserve this many slots. If fewer are - * available, some packets will be dropped. - * Packets may have multiple fragments, so we may not use - * there is a chance that we may not use all of the slots - * we have claimed, so we will need to handle the leftover - * ones when we regain the lock. - */ - needed = d->bq_len + brddst->bq_len; - - is_vp = nma_is_vp(dst_na); - ND(5, "pass 2 dst %d is %x %s", - i, d_i, is_vp ? "virtual" : "nic/host"); - dst_nr = d_i & (NM_BDG_MAXRINGS-1); - if (is_vp) { /* virtual port */ - nrings = dst_na->num_rx_rings; - } else { - nrings = dst_na->num_tx_rings; - } - if (dst_nr >= nrings) - dst_nr = dst_nr % nrings; - kring = is_vp ? &dst_na->rx_rings[dst_nr] : - &dst_na->tx_rings[dst_nr]; - ring = kring->ring; - lim = kring->nkr_num_slots - 1; - -retry: - - /* reserve the buffers in the queue and an entry - * to report completion, and drop lock. - * XXX this might become a helper function. - */ - mtx_lock(&kring->q_lock); - if (kring->nkr_stopped) { - mtx_unlock(&kring->q_lock); - goto cleanup; - } - /* on physical interfaces, do a txsync to recover - * slots for packets already transmitted. - * XXX maybe we could be optimistic and rely on a retry - * in case of failure. - */ - if (nma_is_hw(dst_na)) { - dst_na->nm_txsync(dst_ifp, dst_nr, 0); - } - my_start = j = kring->nkr_hwlease; - howmany = nm_kr_space(kring, is_vp); - if (needed < howmany) - howmany = needed; - lease_idx = nm_kr_lease(kring, howmany, is_vp); - mtx_unlock(&kring->q_lock); - - /* only retry if we need more than available slots */ - if (retry && needed <= howmany) - retry = 0; - - /* copy to the destination queue */ - while (howmany > 0) { - struct netmap_slot *slot; - struct nm_bdg_fwd *ft_p, *ft_end; - u_int cnt; - - /* find the queue from which we pick next packet. - * NM_FT_NULL is always higher than valid indexes - * so we never dereference it if the other list - * has packets (and if both are empty we never - * get here). 
- */ - if (next < brd_next) { - ft_p = ft + next; - next = ft_p->ft_next; - } else { /* insert broadcast */ - ft_p = ft + brd_next; - brd_next = ft_p->ft_next; - } - cnt = ft_p->ft_frags; // cnt > 0 - if (unlikely(cnt > howmany)) - break; /* no more space */ - howmany -= cnt; - if (netmap_verbose && cnt > 1) - RD(5, "rx %d frags to %d", cnt, j); - ft_end = ft_p + cnt; - do { - void *dst, *src = ft_p->ft_buf; - size_t len = (ft_p->ft_len + 63) & ~63; - - slot = &ring->slot[j]; - dst = BDG_NMB(dst_na->nm_mem, slot); - /* round to a multiple of 64 */ - - ND("send %d %d bytes at %s:%d", - i, ft_p->ft_len, dst_ifp->if_xname, j); - if (ft_p->ft_flags & NS_INDIRECT) { - if (copyin(src, dst, len)) { - // invalid user pointer, pretend len is 0 - ft_p->ft_len = 0; - } - } else { - //memcpy(dst, src, len); - pkt_copy(src, dst, (int)len); - } - slot->len = ft_p->ft_len; - slot->flags = (cnt << 8)| NS_MOREFRAG; - j = nm_next(j, lim); - ft_p++; - sent++; - } while (ft_p != ft_end); - slot->flags = (cnt << 8); /* clear flag on last entry */ - /* are we done ? */ - if (next == NM_FT_NULL && brd_next == NM_FT_NULL) - break; - } - { - /* current position */ - uint32_t *p = kring->nkr_leases; /* shorthand */ - uint32_t update_pos; - int still_locked = 1; - - mtx_lock(&kring->q_lock); - if (unlikely(howmany > 0)) { - /* not used all bufs. If i am the last one - * i can recover the slots, otherwise must - * fill them with 0 to mark empty packets. - */ - ND("leftover %d bufs", howmany); - if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) { - /* yes i am the last one */ - ND("roll back nkr_hwlease to %d", j); - kring->nkr_hwlease = j; - } else { - while (howmany-- > 0) { - ring->slot[j].len = 0; - ring->slot[j].flags = 0; - j = nm_next(j, lim); - } - } - } - p[lease_idx] = j; /* report I am done */ - - update_pos = is_vp ? nm_kr_rxpos(kring) : ring->cur; - - if (my_start == update_pos) { - /* all slots before my_start have been reported, - * so scan subsequent leases to see if other ranges - * have been completed, and to a selwakeup or txsync. - */ - while (lease_idx != kring->nkr_lease_idx && - p[lease_idx] != NR_NOSLOT) { - j = p[lease_idx]; - p[lease_idx] = NR_NOSLOT; - lease_idx = nm_next(lease_idx, lim); - } - /* j is the new 'write' position. j != my_start - * means there are new buffers to report - */ - if (likely(j != my_start)) { - if (is_vp) { - uint32_t old_avail = kring->nr_hwavail; - - kring->nr_hwavail = (j >= kring->nr_hwcur) ? - j - kring->nr_hwcur : - j + lim + 1 - kring->nr_hwcur; - if (kring->nr_hwavail < old_avail) { - D("avail shrink %d -> %d", - old_avail, kring->nr_hwavail); - } - still_locked = 0; - mtx_unlock(&kring->q_lock); - selwakeuppri(&kring->si, PI_NET); - } else { - ring->cur = j; - /* XXX update avail ? */ - still_locked = 0; - dst_na->nm_txsync(dst_ifp, dst_nr, 0); - mtx_unlock(&kring->q_lock); - - /* retry to send more packets */ - if (nma_is_hw(dst_na) && retry--) - goto retry; - } - } - } - if (still_locked) - mtx_unlock(&kring->q_lock); - } -cleanup: - d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */ - d->bq_len = 0; - } - brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */ - brddst->bq_len = 0; - return 0; -} - - -/* - * main dispatch routine for the bridge. - * We already know that only one thread is running this. - * we must run nm_bdg_preflush without lock. 
- */ -static int -bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags) -{ - struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring = &na->tx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - u_int j, k, lim = kring->nkr_num_slots - 1; - - k = ring->cur; - if (k > lim) - return netmap_ring_reinit(kring); - - if (bridge_batch <= 0) { /* testing only */ - j = k; // used all - goto done; - } - if (bridge_batch > NM_BDG_BATCH) - bridge_batch = NM_BDG_BATCH; - - j = nm_bdg_preflush(na, ring_nr, kring, k); - if (j != k) - D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail); - /* k-j modulo ring size is the number of slots processed */ - if (k < j) - k += kring->nkr_num_slots; - kring->nr_hwavail = lim - (k - j); - -done: - kring->nr_hwcur = j; - ring->avail = kring->nr_hwavail; - if (netmap_verbose) - D("%s ring %d flags %d", ifp->if_xname, ring_nr, flags); - return 0; -} - - -/* - * user process reading from a VALE switch. - * Already protected against concurrent calls from userspace, - * but we must acquire the queue's lock to protect against - * writers on the same queue. - */ -static int -bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags) -{ - struct netmap_adapter *na = NA(ifp); - struct netmap_kring *kring = &na->rx_rings[ring_nr]; - struct netmap_ring *ring = kring->ring; - u_int j, lim = kring->nkr_num_slots - 1; - u_int k = ring->cur, resvd = ring->reserved; - int n; - - mtx_lock(&kring->q_lock); - if (k > lim) { - D("ouch dangerous reset!!!"); - n = netmap_ring_reinit(kring); - goto done; - } - - /* skip past packets that userspace has released */ - j = kring->nr_hwcur; /* netmap ring index */ - if (resvd > 0) { - if (resvd + ring->avail >= lim + 1) { - D("XXX invalid reserve/avail %d %d", resvd, ring->avail); - ring->reserved = resvd = 0; // XXX panic... - } - k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd; - } - - if (j != k) { /* userspace has released some packets. */ - n = k - j; - if (n < 0) - n += kring->nkr_num_slots; - ND("userspace releases %d packets", n); - for (n = 0; likely(j != k); n++) { - struct netmap_slot *slot = &ring->slot[j]; - void *addr = BDG_NMB(na->nm_mem, slot); - - if (addr == netmap_buffer_base) { /* bad buf */ - D("bad buffer index %d, ignore ?", - slot->buf_idx); - } - slot->flags &= ~NS_BUF_CHANGED; - j = nm_next(j, lim); - } - kring->nr_hwavail -= n; - kring->nr_hwcur = k; - } - /* tell userspace that there are new packets */ - ring->avail = kring->nr_hwavail - resvd; - n = 0; -done: - mtx_unlock(&kring->q_lock); - return n; -} - - -static int -bdg_netmap_attach(struct netmap_adapter *arg) -{ - struct netmap_adapter na; - - ND("attaching virtual bridge"); - bzero(&na, sizeof(na)); - - na.ifp = arg->ifp; - na.na_flags = NAF_BDG_MAYSLEEP | NAF_MEM_OWNER; - na.num_tx_rings = arg->num_tx_rings; - na.num_rx_rings = arg->num_rx_rings; - na.num_tx_desc = arg->num_tx_desc; - na.num_rx_desc = arg->num_rx_desc; - na.nm_txsync = bdg_netmap_txsync; - na.nm_rxsync = bdg_netmap_rxsync; - na.nm_register = bdg_netmap_reg; - na.nm_mem = netmap_mem_private_new(arg->ifp->if_xname, - na.num_tx_rings, na.num_tx_desc, - na.num_rx_rings, na.num_rx_desc); - return netmap_attach(&na, na.num_tx_rings); -} - - -static struct cdev *netmap_dev; /* /dev/netmap character device. */ - - -/* - * Module loader. - * - * Create the /dev/netmap device and initialize all global - * variables. - * - * Return 0 on success, errno on failure. 
- */ -static int netmap_init(void) { - int i, error; + int error; NMG_LOCK_INIT(); error = netmap_mem_init(); - if (error != 0) { - printf("netmap: unable to initialize the memory allocator.\n"); - return (error); - } - printf("netmap: loaded module\n"); + if (error != 0) + goto fail; + /* XXX could use make_dev_credv() to get error number */ netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660, "netmap"); + if (!netmap_dev) + goto fail; - bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */ - for (i = 0; i < NM_BRIDGES; i++) - BDG_RWINIT(&nm_bridges[i]); - return (error); -} - - -/* - * Module unloader. - * - * Free all the memory, and destroy the ``/dev/netmap`` device. - */ -static void -netmap_fini(void) -{ - destroy_dev(netmap_dev); - netmap_mem_fini(); - NMG_LOCK_DESTROY(); - printf("netmap: unloaded module.\n"); -} - - -#ifdef __FreeBSD__ -/* - * Kernel entry point. - * - * Initialize/finalize the module and return. - * - * Return 0 on success, errno on failure. - */ -static int -netmap_loader(__unused struct module *module, int event, __unused void *arg) -{ - int error = 0; - - switch (event) { - case MOD_LOAD: - error = netmap_init(); - break; - - case MOD_UNLOAD: - netmap_fini(); - break; - - default: - error = EOPNOTSUPP; - break; - } - - return (error); + netmap_init_bridges(); + printf("netmap: loaded module\n"); + return (0); +fail: + netmap_fini(); + return (EINVAL); /* may be incorrect */ } - - -DEV_MODULE(netmap, netmap_loader, NULL); -#endif /* __FreeBSD__ */ diff --git a/sys/dev/netmap/netmap_freebsd.c b/sys/dev/netmap/netmap_freebsd.c new file mode 100644 index 000000000..a8e287c6d --- /dev/null +++ b/sys/dev/netmap/netmap_freebsd.c @@ -0,0 +1,655 @@ +/* + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* $FreeBSD$ */ + +#include +#include +#include +#include /* defines used in kernel.h */ +#include /* POLLIN, POLLOUT */ +#include /* types used in module initialization */ +#include /* DEV_MODULE */ +#include + +#include + +#include /* vtophys */ +#include /* vtophys */ +#include +#include +#include +#include +#include + + +#include +#include /* sockaddrs */ +#include +#include +#include +#include /* bus_dmamap_* */ +#include /* in6_cksum_pseudo() */ +#include /* in_pseudo(), in_cksum_hdr() */ + +#include +#include +#include + + +/* ======================== FREEBSD-SPECIFIC ROUTINES ================== */ + +rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum) +{ + /* TODO XXX please use the FreeBSD implementation for this. */ + uint16_t *words = (uint16_t *)data; + int nw = len / 2; + int i; + + for (i = 0; i < nw; i++) + cur_sum += be16toh(words[i]); + + if (len & 1) + cur_sum += (data[len-1] << 8); + + return cur_sum; +} + +/* Fold a raw checksum: 'cur_sum' is in host byte order, while the + * return value is in network byte order. + */ +uint16_t nm_csum_fold(rawsum_t cur_sum) +{ + /* TODO XXX please use the FreeBSD implementation for this. */ + while (cur_sum >> 16) + cur_sum = (cur_sum & 0xFFFF) + (cur_sum >> 16); + + return htobe16((~cur_sum) & 0xFFFF); +} + +uint16_t nm_csum_ipv4(struct nm_iphdr *iph) +{ +#if 0 + return in_cksum_hdr((void *)iph); +#else + return nm_csum_fold(nm_csum_raw((uint8_t*)iph, sizeof(struct nm_iphdr), 0)); +#endif +} + +void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, + size_t datalen, uint16_t *check) +{ + uint16_t pseudolen = datalen + iph->protocol; + + /* Compute and insert the pseudo-header cheksum. */ + *check = in_pseudo(iph->saddr, iph->daddr, + htobe16(pseudolen)); + /* Compute the checksum on TCP/UDP header + payload + * (includes the pseudo-header). + */ + *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); +} + +void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, + size_t datalen, uint16_t *check) +{ +#ifdef INET6 + *check = in6_cksum_pseudo((void*)ip6h, datalen, ip6h->nexthdr, 0); + *check = nm_csum_fold(nm_csum_raw(data, datalen, 0)); +#else + static int notsupported = 0; + if (!notsupported) { + notsupported = 1; + D("inet6 segmentation not supported"); + } +#endif +} + + +/* + * Intercept the rx routine in the standard device driver. + * Second argument is non-zero to intercept, 0 to restore + */ +int +netmap_catch_rx(struct netmap_adapter *na, int intercept) +{ + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + struct ifnet *ifp = na->ifp; + + if (intercept) { + if (gna->save_if_input) { + D("cannot intercept again"); + return EINVAL; /* already set */ + } + gna->save_if_input = ifp->if_input; + ifp->if_input = generic_rx_handler; + } else { + if (!gna->save_if_input){ + D("cannot restore"); + return EINVAL; /* not saved */ + } + ifp->if_input = gna->save_if_input; + gna->save_if_input = NULL; + } + + return 0; +} + + +/* + * Intercept the packet steering routine in the tx path, + * so that we can decide which queue is used for an mbuf. + * Second argument is non-zero to intercept, 0 to restore. + * On freebsd we just intercept if_transmit. 
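+ * When intercepting, the original ifp->if_transmit pointer is saved
+ * in na->if_transmit and replaced with netmap_transmit(), so mbufs
+ * coming from the host stack are captured by netmap instead of being
+ * handed to the driver; restoring simply puts the saved pointer back.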
+ */ +void +netmap_catch_tx(struct netmap_generic_adapter *gna, int enable) +{ + struct netmap_adapter *na = &gna->up.up; + struct ifnet *ifp = na->ifp; + + if (enable) { + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_transmit; + } else { + ifp->if_transmit = na->if_transmit; + } +} + + +/* + * Transmit routine used by generic_netmap_txsync(). Returns 0 on success + * and non-zero on error (which may be packet drops or other errors). + * addr and len identify the netmap buffer, m is the (preallocated) + * mbuf to use for transmissions. + * + * We should add a reference to the mbuf so the m_freem() at the end + * of the transmission does not consume resources. + * + * On FreeBSD, and on multiqueue cards, we can force the queue using + * if ((m->m_flags & M_FLOWID) != 0) + * i = m->m_pkthdr.flowid % adapter->num_queues; + * else + * i = curcpu % adapter->num_queues; + * + */ +int +generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, + void *addr, u_int len, u_int ring_nr) +{ + int ret; + + m->m_len = m->m_pkthdr.len = 0; + + // copy data to the mbuf + m_copyback(m, 0, len, addr); + // inc refcount. We are alone, so we can skip the atomic + atomic_fetchadd_int(m->m_ext.ref_cnt, 1); + m->m_flags |= M_FLOWID; + m->m_pkthdr.flowid = ring_nr; + m->m_pkthdr.rcvif = ifp; /* used for tx notification */ + ret = NA(ifp)->if_transmit(ifp, m); + return ret; +} + + +/* + * The following two functions are empty until we have a generic + * way to extract the info from the ifp + */ +int +generic_find_num_desc(struct ifnet *ifp, unsigned int *tx, unsigned int *rx) +{ + D("called"); + return 0; +} + + +void +generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq) +{ + D("called"); + *txq = netmap_generic_rings; + *rxq = netmap_generic_rings; +} + + +void netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na) +{ + ND("called"); + mit->mit_pending = 0; + mit->mit_na = na; +} + + +void netmap_mitigation_start(struct nm_generic_mit *mit) +{ + ND("called"); +} + + +void netmap_mitigation_restart(struct nm_generic_mit *mit) +{ + ND("called"); +} + + +int netmap_mitigation_active(struct nm_generic_mit *mit) +{ + ND("called"); + return 0; +} + + +void netmap_mitigation_cleanup(struct nm_generic_mit *mit) +{ + ND("called"); +} + + +/* + * In order to track whether pages are still mapped, we hook into + * the standard cdev_pager and intercept the constructor and + * destructor. 
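+ * The constructor and destructor only manage references on the device
+ * and on the per-descriptor state, while the fault handler translates
+ * offsets within the shared memory region into physical pages through
+ * netmap_mem_ofstophys().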
+ */ + +struct netmap_vm_handle_t { + struct cdev *dev; + struct netmap_priv_d *priv; +}; + + +static int +netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, + vm_ooffset_t foff, struct ucred *cred, u_short *color) +{ + struct netmap_vm_handle_t *vmh = handle; + + if (netmap_verbose) + D("handle %p size %jd prot %d foff %jd", + handle, (intmax_t)size, prot, (intmax_t)foff); + dev_ref(vmh->dev); + return 0; +} + + +static void +netmap_dev_pager_dtor(void *handle) +{ + struct netmap_vm_handle_t *vmh = handle; + struct cdev *dev = vmh->dev; + struct netmap_priv_d *priv = vmh->priv; + + if (netmap_verbose) + D("handle %p", handle); + netmap_dtor(priv); + free(vmh, M_DEVBUF); + dev_rel(dev); +} + + +static int +netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset, + int prot, vm_page_t *mres) +{ + struct netmap_vm_handle_t *vmh = object->handle; + struct netmap_priv_d *priv = vmh->priv; + vm_paddr_t paddr; + vm_page_t page; + vm_memattr_t memattr; + vm_pindex_t pidx; + + ND("object %p offset %jd prot %d mres %p", + object, (intmax_t)offset, prot, mres); + memattr = object->memattr; + pidx = OFF_TO_IDX(offset); + paddr = netmap_mem_ofstophys(priv->np_mref, offset); + if (paddr == 0) + return VM_PAGER_FAIL; + + if (((*mres)->flags & PG_FICTITIOUS) != 0) { + /* + * If the passed in result page is a fake page, update it with + * the new physical address. + */ + page = *mres; + vm_page_updatefake(page, paddr, memattr); + } else { + /* + * Replace the passed in reqpage page with our own fake page and + * free up the all of the original pages. + */ +#ifndef VM_OBJECT_WUNLOCK /* FreeBSD < 10.x */ +#define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK +#define VM_OBJECT_WLOCK VM_OBJECT_LOCK +#endif /* VM_OBJECT_WUNLOCK */ + + VM_OBJECT_WUNLOCK(object); + page = vm_page_getfake(paddr, memattr); + VM_OBJECT_WLOCK(object); + vm_page_lock(*mres); + vm_page_free(*mres); + vm_page_unlock(*mres); + *mres = page; + vm_page_insert(page, object, pidx); + } + page->valid = VM_PAGE_BITS_ALL; + return (VM_PAGER_OK); +} + + +static struct cdev_pager_ops netmap_cdev_pager_ops = { + .cdev_pg_ctor = netmap_dev_pager_ctor, + .cdev_pg_dtor = netmap_dev_pager_dtor, + .cdev_pg_fault = netmap_dev_pager_fault, +}; + + +static int +netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff, + vm_size_t objsize, vm_object_t *objp, int prot) +{ + int error; + struct netmap_vm_handle_t *vmh; + struct netmap_priv_d *priv; + vm_object_t obj; + + if (netmap_verbose) + D("cdev %p foff %jd size %jd objp %p prot %d", cdev, + (intmax_t )*foff, (intmax_t )objsize, objp, prot); + + vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (vmh == NULL) + return ENOMEM; + vmh->dev = cdev; + + NMG_LOCK(); + error = devfs_get_cdevpriv((void**)&priv); + if (error) + goto err_unlock; + vmh->priv = priv; + priv->np_refcount++; + NMG_UNLOCK(); + + error = netmap_get_memory(priv); + if (error) + goto err_deref; + + obj = cdev_pager_allocate(vmh, OBJT_DEVICE, + &netmap_cdev_pager_ops, objsize, prot, + *foff, NULL); + if (obj == NULL) { + D("cdev_pager_allocate failed"); + error = EINVAL; + goto err_deref; + } + + *objp = obj; + return 0; + +err_deref: + NMG_LOCK(); + priv->np_refcount--; +err_unlock: + NMG_UNLOCK(); +// err: + free(vmh, M_DEVBUF); + return error; +} + + +// XXX can we remove this ? 
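+/* netmap_close() is essentially a no-op: per-descriptor state is
+ * released by netmap_dtor(), which netmap_open() below registers with
+ * devfs_set_cdevpriv() and which runs when the descriptor is finally
+ * closed.
+ */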
+static int +netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td) +{ + if (netmap_verbose) + D("dev %p fflag 0x%x devtype %d td %p", + dev, fflag, devtype, td); + return 0; +} + + +static int +netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td) +{ + struct netmap_priv_d *priv; + int error; + + (void)dev; + (void)oflags; + (void)devtype; + (void)td; + + // XXX wait or nowait ? + priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (priv == NULL) + return ENOMEM; + + error = devfs_set_cdevpriv(priv, netmap_dtor); + if (error) + return error; + + priv->np_refcount = 1; + + return 0; +} + +/******************** kqueue support ****************/ + +/* + * The OS_selwakeup also needs to issue a KNOTE_UNLOCKED. + * We use a non-zero argument to distinguish the call from the one + * in kevent_scan() which instead also needs to run netmap_poll(). + * The knote uses a global mutex for the time being. We might + * try to reuse the one in the si, but it is not allocated + * permanently so it might be a bit tricky. + * + * The *kqfilter function registers one or another f_event + * depending on read or write mode. + * In the call to f_event() td_fpop is NULL so any child function + * calling devfs_get_cdevpriv() would fail - and we need it in + * netmap_poll(). As a workaround we store priv into kn->kn_hook + * and pass it as first argument to netmap_poll(), which then + * uses the failure to tell that we are called from f_event() + * and do not need the selrecord(). + */ + +void freebsd_selwakeup(struct selinfo *si, int pri); + +void +freebsd_selwakeup(struct selinfo *si, int pri) +{ + if (netmap_verbose) + D("on knote %p", &si->si_note); + selwakeuppri(si, pri); + /* use a non-zero hint to tell the notification from the + * call done in kqueue_scan() which uses 0 + */ + KNOTE_UNLOCKED(&si->si_note, 0x100 /* notification */); +} + +static void +netmap_knrdetach(struct knote *kn) +{ + struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; + struct selinfo *si = priv->np_rxsi; + + D("remove selinfo %p", si); + knlist_remove(&si->si_note, kn, 0); +} + +static void +netmap_knwdetach(struct knote *kn) +{ + struct netmap_priv_d *priv = (struct netmap_priv_d *)kn->kn_hook; + struct selinfo *si = priv->np_txsi; + + D("remove selinfo %p", si); + knlist_remove(&si->si_note, kn, 0); +} + +/* + * callback from notifies (generated externally) and our + * calls to kevent(). The former we just return 1 (ready) + * since we do not know better. + * In the latter we call netmap_poll and return 0/1 accordingly. + */ +static int +netmap_knrw(struct knote *kn, long hint, int events) +{ + struct netmap_priv_d *priv; + int revents; + + if (hint != 0) { + ND(5, "call from notify"); + return 1; /* assume we are ready */ + } + priv = kn->kn_hook; + /* the notification may come from an external thread, + * in which case we do not want to run the netmap_poll + * This should be filtered above, but check just in case. + */ + if (curthread != priv->np_td) { /* should not happen */ + RD(5, "curthread changed %p %p", curthread, priv->np_td); + return 1; + } else { + revents = netmap_poll((void *)priv, events, curthread); + return (events & revents) ? 
1 : 0; + } +} + +static int +netmap_knread(struct knote *kn, long hint) +{ + return netmap_knrw(kn, hint, POLLIN); +} + +static int +netmap_knwrite(struct knote *kn, long hint) +{ + return netmap_knrw(kn, hint, POLLOUT); +} + +static struct filterops netmap_rfiltops = { + .f_isfd = 1, + .f_detach = netmap_knrdetach, + .f_event = netmap_knread, +}; + +static struct filterops netmap_wfiltops = { + .f_isfd = 1, + .f_detach = netmap_knwdetach, + .f_event = netmap_knwrite, +}; + + +/* + * This is called when a thread invokes kevent() to record + * a change in the configuration of the kqueue(). + * The 'priv' should be the same as in the netmap device. + */ +static int +netmap_kqfilter(struct cdev *dev, struct knote *kn) +{ + struct netmap_priv_d *priv; + int error; + struct netmap_adapter *na; + struct selinfo *si; + int ev = kn->kn_filter; + + if (ev != EVFILT_READ && ev != EVFILT_WRITE) { + D("bad filter request %d", ev); + return 1; + } + error = devfs_get_cdevpriv((void**)&priv); + if (error) { + D("device not yet setup"); + return 1; + } + na = priv->np_na; + if (na == NULL) { + D("no netmap adapter for this file descriptor"); + return 1; + } + /* the si is indicated in the priv */ + si = (ev == EVFILT_WRITE) ? priv->np_txsi : priv->np_rxsi; + // XXX lock(priv) ? + kn->kn_fop = (ev == EVFILT_WRITE) ? + &netmap_wfiltops : &netmap_rfiltops; + kn->kn_hook = priv; + knlist_add(&si->si_note, kn, 1); + // XXX unlock(priv) + ND("register %p %s td %p priv %p kn %p np_nifp %p kn_fp/fpop %s", + na, na->ifp->if_xname, curthread, priv, kn, + priv->np_nifp, + kn->kn_fp == curthread->td_fpop ? "match" : "MISMATCH"); + return 0; +} + +struct cdevsw netmap_cdevsw = { + .d_version = D_VERSION, + .d_name = "netmap", + .d_open = netmap_open, + .d_mmap_single = netmap_mmap_single, + .d_ioctl = netmap_ioctl, + .d_poll = netmap_poll, + .d_kqfilter = netmap_kqfilter, + .d_close = netmap_close, +}; +/*--- end of kqueue support ----*/ + +/* + * Kernel entry point. + * + * Initialize/finalize the module and return. + * + * Return 0 on success, errno on failure. + */ +static int +netmap_loader(__unused struct module *module, int event, __unused void *arg) +{ + int error = 0; + + switch (event) { + case MOD_LOAD: + error = netmap_init(); + break; + + case MOD_UNLOAD: + netmap_fini(); + break; + + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + + +DEV_MODULE(netmap, netmap_loader, NULL); diff --git a/sys/dev/netmap/netmap_generic.c b/sys/dev/netmap/netmap_generic.c new file mode 100644 index 000000000..63253b6b0 --- /dev/null +++ b/sys/dev/netmap/netmap_generic.c @@ -0,0 +1,806 @@ +/* + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This module implements netmap support on top of standard, + * unmodified device drivers. + * + * A NIOCREGIF request is handled here if the device does not + * have native support. TX and RX rings are emulated as follows: + * + * NIOCREGIF + * We preallocate a block of TX mbufs (roughly as many as + * tx descriptors; the number is not critical) to speed up + * operation during transmissions. The refcount on most of + * these buffers is artificially bumped up so we can recycle + * them more easily. Also, the destructor is intercepted + * so we use it as an interrupt notification to wake up + * processes blocked on a poll(). + * + * For each receive ring we allocate one "struct mbq" + * (an mbuf tailq plus a spinlock). We intercept packets + * (through if_input) + * on the receive path and put them in the mbq from which + * netmap receive routines can grab them. + * + * TX: + * in the generic_txsync() routine, netmap buffers are copied + * (or linked, in a future) to the preallocated mbufs + * and pushed to the transmit queue. Some of these mbufs + * (those with NS_REPORT, or otherwise every half ring) + * have the refcount=1, others have refcount=2. + * When the destructor is invoked, we take that as + * a notification that all mbufs up to that one in + * the specific ring have been completed, and generate + * the equivalent of a transmit interrupt. + * + * RX: + * + */ + +#ifdef __FreeBSD__ + +#include /* prerequisite */ +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include /* PROT_EXEC */ +#include +#include /* sockaddrs */ +#include +#include +#include +#include /* bus_dmamap_* in netmap_kern.h */ + +// XXX temporary - D() defined here +#include +#include +#include + +#define rtnl_lock() D("rtnl_lock called"); +#define rtnl_unlock() D("rtnl_unlock called"); +#define MBUF_TXQ(m) ((m)->m_pkthdr.flowid) +#define MBUF_RXQ(m) ((m)->m_pkthdr.flowid) +#define smp_mb() + +/* + * mbuf wrappers + */ + +/* + * we allocate an EXT_PACKET + */ +#define netmap_get_mbuf(len) m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR|M_NOFREE) + +/* mbuf destructor, also need to change the type to EXT_EXTREF, + * add an M_NOFREE flag, and then clear the flag and + * chain into uma_zfree(zone_pack, mf) + * (or reinstall the buffer ?) + */ +#define SET_MBUF_DESTRUCTOR(m, fn) do { \ + (m)->m_ext.ext_free = (void *)fn; \ + (m)->m_ext.ext_type = EXT_EXTREF; \ +} while (0) + + +#define GET_MBUF_REFCNT(m) ((m)->m_ext.ref_cnt ? *(m)->m_ext.ref_cnt : -1) + + + +#else /* linux */ + +#include "bsd_glue.h" + +#include /* rtnl_[un]lock() */ +#include /* struct ethtool_ops, get_ringparam */ +#include + +//#define RATE /* Enables communication statistics. */ + +//#define REG_RESET + +#endif /* linux */ + + +/* Common headers. 
*/ +#include +#include +#include + + + +/* ======================== usage stats =========================== */ + +#ifdef RATE +#define IFRATE(x) x +struct rate_stats { + unsigned long txpkt; + unsigned long txsync; + unsigned long txirq; + unsigned long rxpkt; + unsigned long rxirq; + unsigned long rxsync; +}; + +struct rate_context { + unsigned refcount; + struct timer_list timer; + struct rate_stats new; + struct rate_stats old; +}; + +#define RATE_PRINTK(_NAME_) \ + printk( #_NAME_ " = %lu Hz\n", (cur._NAME_ - ctx->old._NAME_)/RATE_PERIOD); +#define RATE_PERIOD 2 +static void rate_callback(unsigned long arg) +{ + struct rate_context * ctx = (struct rate_context *)arg; + struct rate_stats cur = ctx->new; + int r; + + RATE_PRINTK(txpkt); + RATE_PRINTK(txsync); + RATE_PRINTK(txirq); + RATE_PRINTK(rxpkt); + RATE_PRINTK(rxsync); + RATE_PRINTK(rxirq); + printk("\n"); + + ctx->old = cur; + r = mod_timer(&ctx->timer, jiffies + + msecs_to_jiffies(RATE_PERIOD * 1000)); + if (unlikely(r)) + D("[v1000] Error: mod_timer()"); +} + +static struct rate_context rate_ctx; + +#else /* !RATE */ +#define IFRATE(x) +#endif /* !RATE */ + + +/* =============== GENERIC NETMAP ADAPTER SUPPORT ================= */ +#define GENERIC_BUF_SIZE netmap_buf_size /* Size of the mbufs in the Tx pool. */ + +/* + * Wrapper used by the generic adapter layer to notify + * the poller threads. Differently from netmap_rx_irq(), we check + * only IFCAP_NETMAP instead of NAF_NATIVE_ON to enable the irq. + */ +static void +netmap_generic_irq(struct ifnet *ifp, u_int q, u_int *work_done) +{ + if (unlikely(!(ifp->if_capenable & IFCAP_NETMAP))) + return; + + netmap_common_irq(ifp, q, work_done); +} + + +/* Enable/disable netmap mode for a generic network interface. */ +static int +generic_netmap_register(struct netmap_adapter *na, int enable) +{ + struct ifnet *ifp = na->ifp; + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + struct mbuf *m; + int error; + int i, r; + + if (!na) + return EINVAL; + +#ifdef REG_RESET + error = ifp->netdev_ops->ndo_stop(ifp); + if (error) { + return error; + } +#endif /* REG_RESET */ + + if (enable) { /* Enable netmap mode. */ + /* Init the mitigation support. */ + gna->mit = malloc(na->num_rx_rings * sizeof(struct nm_generic_mit), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (!gna->mit) { + D("mitigation allocation failed"); + error = ENOMEM; + goto out; + } + for (r=0; rnum_rx_rings; r++) + netmap_mitigation_init(&gna->mit[r], na); + + /* Initialize the rx queue, as generic_rx_handler() can + * be called as soon as netmap_catch_rx() returns. + */ + for (r=0; rnum_rx_rings; r++) { + mbq_safe_init(&na->rx_rings[r].rx_queue); + } + + /* + * Preallocate packet buffers for the tx rings. + */ + for (r=0; rnum_tx_rings; r++) + na->tx_rings[r].tx_pool = NULL; + for (r=0; rnum_tx_rings; r++) { + na->tx_rings[r].tx_pool = malloc(na->num_tx_desc * sizeof(struct mbuf *), + M_DEVBUF, M_NOWAIT | M_ZERO); + if (!na->tx_rings[r].tx_pool) { + D("tx_pool allocation failed"); + error = ENOMEM; + goto free_tx_pools; + } + for (i=0; inum_tx_desc; i++) + na->tx_rings[r].tx_pool[i] = NULL; + for (i=0; inum_tx_desc; i++) { + m = netmap_get_mbuf(GENERIC_BUF_SIZE); + if (!m) { + D("tx_pool[%d] allocation failed", i); + error = ENOMEM; + goto free_tx_pools; + } + na->tx_rings[r].tx_pool[i] = m; + } + } + rtnl_lock(); + /* Prepare to intercept incoming traffic. 
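+		 * Once netmap_catch_rx() installs the hook, generic_rx_handler()
+		 * may run at any time and enqueue mbufs into the per-ring
+		 * rx_queue(s) initialized above.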
*/ + error = netmap_catch_rx(na, 1); + if (error) { + D("netdev_rx_handler_register() failed (%d)", error); + goto register_handler; + } + ifp->if_capenable |= IFCAP_NETMAP; + + /* Make netmap control the packet steering. */ + netmap_catch_tx(gna, 1); + + rtnl_unlock(); + +#ifdef RATE + if (rate_ctx.refcount == 0) { + D("setup_timer()"); + memset(&rate_ctx, 0, sizeof(rate_ctx)); + setup_timer(&rate_ctx.timer, &rate_callback, (unsigned long)&rate_ctx); + if (mod_timer(&rate_ctx.timer, jiffies + msecs_to_jiffies(1500))) { + D("Error: mod_timer()"); + } + } + rate_ctx.refcount++; +#endif /* RATE */ + + } else if (na->tx_rings[0].tx_pool) { + /* Disable netmap mode. We enter here only if the previous + generic_netmap_register(na, 1) was successfull. + If it was not, na->tx_rings[0].tx_pool was set to NULL by the + error handling code below. */ + rtnl_lock(); + + ifp->if_capenable &= ~IFCAP_NETMAP; + + /* Release packet steering control. */ + netmap_catch_tx(gna, 0); + + /* Do not intercept packets on the rx path. */ + netmap_catch_rx(na, 0); + + rtnl_unlock(); + + /* Free the mbufs going to the netmap rings */ + for (r=0; rnum_rx_rings; r++) { + mbq_safe_purge(&na->rx_rings[r].rx_queue); + mbq_safe_destroy(&na->rx_rings[r].rx_queue); + } + + for (r=0; rnum_rx_rings; r++) + netmap_mitigation_cleanup(&gna->mit[r]); + free(gna->mit, M_DEVBUF); + + for (r=0; rnum_tx_rings; r++) { + for (i=0; inum_tx_desc; i++) { + m_freem(na->tx_rings[r].tx_pool[i]); + } + free(na->tx_rings[r].tx_pool, M_DEVBUF); + } + +#ifdef RATE + if (--rate_ctx.refcount == 0) { + D("del_timer()"); + del_timer(&rate_ctx.timer); + } +#endif + } + +#ifdef REG_RESET + error = ifp->netdev_ops->ndo_open(ifp); + if (error) { + goto free_tx_pools; + } +#endif + + return 0; + +register_handler: + rtnl_unlock(); +free_tx_pools: + for (r=0; rnum_tx_rings; r++) { + if (na->tx_rings[r].tx_pool == NULL) + continue; + for (i=0; inum_tx_desc; i++) + if (na->tx_rings[r].tx_pool[i]) + m_freem(na->tx_rings[r].tx_pool[i]); + free(na->tx_rings[r].tx_pool, M_DEVBUF); + na->tx_rings[r].tx_pool = NULL; + } + for (r=0; rnum_rx_rings; r++) { + netmap_mitigation_cleanup(&gna->mit[r]); + mbq_safe_destroy(&na->rx_rings[r].rx_queue); + } + free(gna->mit, M_DEVBUF); +out: + + return error; +} + +/* + * Callback invoked when the device driver frees an mbuf used + * by netmap to transmit a packet. This usually happens when + * the NIC notifies the driver that transmission is completed. + */ +static void +generic_mbuf_destructor(struct mbuf *m) +{ + if (netmap_verbose) + D("Tx irq (%p) queue %d", m, MBUF_TXQ(m)); + netmap_generic_irq(MBUF_IFP(m), MBUF_TXQ(m), NULL); +#ifdef __FreeBSD__ + m->m_ext.ext_type = EXT_PACKET; + m->m_ext.ext_free = NULL; + if (*(m->m_ext.ref_cnt) == 0) + *(m->m_ext.ref_cnt) = 1; + uma_zfree(zone_pack, m); +#endif /* __FreeBSD__ */ + IFRATE(rate_ctx.new.txirq++); +} + +/* Record completed transmissions and update hwtail. + * + * The oldest tx buffer not yet completed is at nr_hwtail + 1, + * nr_hwcur is the first unsent buffer. 
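+ * A sketch with made-up numbers: if nkr_num_slots is 8, nr_hwtail is 6
+ * and nr_hwcur is 2, the scan starts at slot 7 and can visit slots
+ * 7, 0 and 1; if the driver has released all three mbufs (refcount
+ * back to 1), nr_hwtail is advanced to 1.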
+ */ +static u_int +generic_netmap_tx_clean(struct netmap_kring *kring) +{ + u_int const lim = kring->nkr_num_slots - 1; + u_int nm_i = nm_next(kring->nr_hwtail, lim); + u_int hwcur = kring->nr_hwcur; + u_int n = 0; + struct mbuf **tx_pool = kring->tx_pool; + + while (nm_i != hwcur) { /* buffers not completed */ + struct mbuf *m = tx_pool[nm_i]; + + if (unlikely(m == NULL)) { + /* this is done, try to replenish the entry */ + tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE); + if (unlikely(m == NULL)) { + D("mbuf allocation failed, XXX error"); + // XXX how do we proceed ? break ? + return -ENOMEM; + } + } else if (GET_MBUF_REFCNT(m) != 1) { + break; /* This mbuf is still busy: its refcnt is 2. */ + } + n++; + nm_i = nm_next(nm_i, lim); + } + kring->nr_hwtail = nm_prev(nm_i, lim); + ND("tx completed [%d] -> hwtail %d", n, kring->nr_hwtail); + + return n; +} + + +/* + * We have pending packets in the driver between nr_hwtail +1 and hwcur. + * Compute a position in the middle, to be used to generate + * a notification. + */ +static inline u_int +generic_tx_event_middle(struct netmap_kring *kring, u_int hwcur) +{ + u_int n = kring->nkr_num_slots; + u_int ntc = nm_next(kring->nr_hwtail, n-1); + u_int e; + + if (hwcur >= ntc) { + e = (hwcur + ntc) / 2; + } else { /* wrap around */ + e = (hwcur + n + ntc) / 2; + if (e >= n) { + e -= n; + } + } + + if (unlikely(e >= n)) { + D("This cannot happen"); + e = 0; + } + + return e; +} + +/* + * We have pending packets in the driver between nr_hwtail+1 and hwcur. + * Schedule a notification approximately in the middle of the two. + * There is a race but this is only called within txsync which does + * a double check. + */ +static void +generic_set_tx_event(struct netmap_kring *kring, u_int hwcur) +{ + struct mbuf *m; + u_int e; + + if (nm_next(kring->nr_hwtail, kring->nkr_num_slots -1) == hwcur) { + return; /* all buffers are free */ + } + e = generic_tx_event_middle(kring, hwcur); + + m = kring->tx_pool[e]; + if (m == NULL) { + /* This can happen if there is already an event on the netmap + slot 'e': There is nothing to do. */ + return; + } + ND("Event at %d mbuf %p refcnt %d", e, m, GET_MBUF_REFCNT(m)); + kring->tx_pool[e] = NULL; + SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor); + + // XXX wmb() ? + /* Decrement the refcount an free it if we have the last one. */ + m_freem(m); + smp_mb(); +} + + +/* + * generic_netmap_txsync() transforms netmap buffers into mbufs + * and passes them to the standard device driver + * (ndo_start_xmit() or ifp->if_transmit() ). + * On linux this is not done directly, but using dev_queue_xmit(), + * since it implements the TX flow control (and takes some locks). + */ +static int +generic_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct ifnet *ifp = na->ifp; + struct netmap_kring *kring = &na->tx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + u_int nm_i; /* index into the netmap ring */ // j + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = kring->rhead; + + IFRATE(rate_ctx.new.txsync++); + + // TODO: handle the case of mbuf allocation failure + + rmb(); + + /* + * First part: process new packets to send. 
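+	 * Slots between nr_hwcur and rhead were filled in by the
+	 * application; each one is copied into a preallocated mbuf from
+	 * tx_pool and handed to the driver through generic_xmit_frame().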
+ */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { /* we have new packets to send */ + while (nm_i != head) { + struct netmap_slot *slot = &ring->slot[nm_i]; + u_int len = slot->len; + void *addr = NMB(slot); + + /* device-specific */ + struct mbuf *m; + int tx_ret; + + NM_CHECK_ADDR_LEN(addr, len); + + /* Tale a mbuf from the tx pool and copy in the user packet. */ + m = kring->tx_pool[nm_i]; + if (unlikely(!m)) { + RD(5, "This should never happen"); + kring->tx_pool[nm_i] = m = netmap_get_mbuf(GENERIC_BUF_SIZE); + if (unlikely(m == NULL)) { + D("mbuf allocation failed"); + break; + } + } + /* XXX we should ask notifications when NS_REPORT is set, + * or roughly every half frame. We can optimize this + * by lazily requesting notifications only when a + * transmission fails. Probably the best way is to + * break on failures and set notifications when + * ring->cur == ring->tail || nm_i != cur + */ + tx_ret = generic_xmit_frame(ifp, m, addr, len, ring_nr); + if (unlikely(tx_ret)) { + RD(5, "start_xmit failed: err %d [nm_i %u, head %u, hwtail %u]", + tx_ret, nm_i, head, kring->nr_hwtail); + /* + * No room for this mbuf in the device driver. + * Request a notification FOR A PREVIOUS MBUF, + * then call generic_netmap_tx_clean(kring) to do the + * double check and see if we can free more buffers. + * If there is space continue, else break; + * NOTE: the double check is necessary if the problem + * occurs in the txsync call after selrecord(). + * Also, we need some way to tell the caller that not + * all buffers were queued onto the device (this was + * not a problem with native netmap driver where space + * is preallocated). The bridge has a similar problem + * and we solve it there by dropping the excess packets. + */ + generic_set_tx_event(kring, nm_i); + if (generic_netmap_tx_clean(kring)) { /* space now available */ + continue; + } else { + break; + } + } + slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED); + nm_i = nm_next(nm_i, lim); + IFRATE(rate_ctx.new.txpkt ++); + } + + /* Update hwcur to the next slot to transmit. */ + kring->nr_hwcur = nm_i; /* not head, we could break early */ + } + + /* + * Second, reclaim completed buffers + */ + if (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) { + /* No more available slots? Set a notification event + * on a netmap slot that will be cleaned in the future. + * No doublecheck is performed, since txsync() will be + * called twice by netmap_poll(). + */ + generic_set_tx_event(kring, nm_i); + } + ND("tx #%d, hwtail = %d", n, kring->nr_hwtail); + + generic_netmap_tx_clean(kring); + + nm_txsync_finalize(kring); + + return 0; +} + + +/* + * This handler is registered (through netmap_catch_rx()) + * within the attached network interface + * in the RX subsystem, so that every mbuf passed up by + * the driver can be stolen to the network stack. + * Stolen packets are put in a queue where the + * generic_netmap_rxsync() callback can extract them. + */ +void +generic_rx_handler(struct ifnet *ifp, struct mbuf *m) +{ + struct netmap_adapter *na = NA(ifp); + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na; + u_int work_done; + u_int rr = MBUF_RXQ(m); // receive ring number + + if (rr >= na->num_rx_rings) { + rr = rr % na->num_rx_rings; // XXX expensive... 
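+		/* (the modulo above could become a cheap mask if the
+		 * number of rx rings were forced to a power of two) */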
+ } + + /* limit the size of the queue */ + if (unlikely(mbq_len(&na->rx_rings[rr].rx_queue) > 1024)) { + m_freem(m); + } else { + mbq_safe_enqueue(&na->rx_rings[rr].rx_queue, m); + } + + if (netmap_generic_mit < 32768) { + /* no rx mitigation, pass notification up */ + netmap_generic_irq(na->ifp, rr, &work_done); + IFRATE(rate_ctx.new.rxirq++); + } else { + /* same as send combining, filter notification if there is a + * pending timer, otherwise pass it up and start a timer. + */ + if (likely(netmap_mitigation_active(&gna->mit[rr]))) { + /* Record that there is some pending work. */ + gna->mit[rr].mit_pending = 1; + } else { + netmap_generic_irq(na->ifp, rr, &work_done); + IFRATE(rate_ctx.new.rxirq++); + netmap_mitigation_start(&gna->mit[rr]); + } + } +} + +/* + * generic_netmap_rxsync() extracts mbufs from the queue filled by + * generic_netmap_rx_handler() and puts their content in the netmap + * receive ring. + * Access must be protected because the rx handler is asynchronous, + */ +static int +generic_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + u_int nm_i; /* index into the netmap ring */ //j, + u_int n; + u_int const lim = kring->nkr_num_slots - 1; + u_int const head = nm_rxsync_prologue(kring); + int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR; + + if (head > lim) + return netmap_ring_reinit(kring); + + /* + * First part: import newly received packets. + */ + if (netmap_no_pendintr || force_update) { + /* extract buffers from the rx queue, stop at most one + * slot before nr_hwcur (stop_i) + */ + uint16_t slot_flags = kring->nkr_slot_flags; + u_int stop_i = nm_prev(kring->nr_hwcur, lim); + + nm_i = kring->nr_hwtail; /* first empty slot in the receive ring */ + for (n = 0; nm_i != stop_i; n++) { + int len; + void *addr = NMB(&ring->slot[nm_i]); + struct mbuf *m; + + /* we only check the address here on generic rx rings */ + if (addr == netmap_buffer_base) { /* Bad buffer */ + return netmap_ring_reinit(kring); + } + /* + * Call the locked version of the function. + * XXX Ideally we could grab a batch of mbufs at once + * and save some locking overhead. + */ + m = mbq_safe_dequeue(&kring->rx_queue); + if (!m) /* no more data */ + break; + len = MBUF_LEN(m); + m_copydata(m, 0, len, addr); + ring->slot[nm_i].len = len; + ring->slot[nm_i].flags = slot_flags; + m_freem(m); + nm_i = nm_next(nm_i, lim); + } + if (n) { + kring->nr_hwtail = nm_i; + IFRATE(rate_ctx.new.rxpkt += n); + } + kring->nr_kflags &= ~NKR_PENDINTR; + } + + // XXX should we invert the order ? + /* + * Second part: skip past packets that userspace has released. + */ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + /* Userspace has released some packets. */ + for (n = 0; nm_i != head; n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + + slot->flags &= ~NS_BUF_CHANGED; + nm_i = nm_next(nm_i, lim); + } + kring->nr_hwcur = head; + } + /* tell userspace that there might be new packets. 
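+	 * (nm_rxsync_finalize() below publishes nr_hwtail into ring->tail
+	 * and snapshots ring->head/ring->cur for the next round.)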
*/ + nm_rxsync_finalize(kring); + IFRATE(rate_ctx.new.rxsync++); + + return 0; +} + +static void +generic_netmap_dtor(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + struct netmap_generic_adapter *gna = (struct netmap_generic_adapter*)na; + struct netmap_adapter *prev_na = gna->prev; + + if (prev_na != NULL) { + D("Released generic NA %p", gna); + if_rele(na->ifp); + netmap_adapter_put(prev_na); + } + if (ifp != NULL) { + WNA(ifp) = prev_na; + D("Restored native NA %p", prev_na); + na->ifp = NULL; + } +} + +/* + * generic_netmap_attach() makes it possible to use netmap on + * a device without native netmap support. + * This is less performant than native support but potentially + * faster than raw sockets or similar schemes. + * + * In this "emulated" mode, netmap rings do not necessarily + * have the same size as those in the NIC. We use a default + * value and possibly override it if the OS has ways to fetch the + * actual configuration. + */ +int +generic_netmap_attach(struct ifnet *ifp) +{ + struct netmap_adapter *na; + struct netmap_generic_adapter *gna; + int retval; + u_int num_tx_desc, num_rx_desc; + + num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */ + + generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); + ND("Netmap ring size: TX = %d, RX = %d", num_tx_desc, num_rx_desc); + + gna = malloc(sizeof(*gna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (gna == NULL) { + D("no memory on attach, give up"); + return ENOMEM; + } + na = (struct netmap_adapter *)gna; + na->ifp = ifp; + na->num_tx_desc = num_tx_desc; + na->num_rx_desc = num_rx_desc; + na->nm_register = &generic_netmap_register; + na->nm_txsync = &generic_netmap_txsync; + na->nm_rxsync = &generic_netmap_rxsync; + na->nm_dtor = &generic_netmap_dtor; + /* when using generic, IFCAP_NETMAP is set so we force + * NAF_SKIP_INTR to use the regular interrupt handler + */ + na->na_flags = NAF_SKIP_INTR | NAF_HOST_RINGS; + + ND("[GNA] num_tx_queues(%d), real_num_tx_queues(%d), len(%lu)", + ifp->num_tx_queues, ifp->real_num_tx_queues, + ifp->tx_queue_len); + ND("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)", + ifp->num_rx_queues, ifp->real_num_rx_queues); + + generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings); + + retval = netmap_attach_common(na); + if (retval) { + free(gna, M_DEVBUF); + } + + return retval; +} diff --git a/sys/dev/netmap/netmap_kern.h b/sys/dev/netmap/netmap_kern.h index 12bd88252..ddcb0e318 100644 --- a/sys/dev/netmap/netmap_kern.h +++ b/sys/dev/netmap/netmap_kern.h @@ -1,5 +1,6 @@ /* - * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -33,27 +34,65 @@ #ifndef _NET_NETMAP_KERN_H_ #define _NET_NETMAP_KERN_H_ +#define WITH_VALE // comment out to disable VALE support +#define WITH_PIPES + #if defined(__FreeBSD__) #define likely(x) __builtin_expect((long)!!(x), 1L) #define unlikely(x) __builtin_expect((long)!!(x), 0L) #define NM_LOCK_T struct mtx +#define NMG_LOCK_T struct mtx +#define NMG_LOCK_INIT() mtx_init(&netmap_global_lock, \ + "netmap global lock", NULL, MTX_DEF) +#define NMG_LOCK_DESTROY() mtx_destroy(&netmap_global_lock) +#define NMG_LOCK() mtx_lock(&netmap_global_lock) +#define NMG_UNLOCK() mtx_unlock(&netmap_global_lock) +#define NMG_LOCK_ASSERT() mtx_assert(&netmap_global_lock, MA_OWNED) + #define NM_SELINFO_T struct selinfo #define MBUF_LEN(m) ((m)->m_pkthdr.len) -#define NM_SEND_UP(ifp, m) ((ifp)->if_input)(ifp, m) +#define MBUF_IFP(m) ((m)->m_pkthdr.rcvif) +#define NM_SEND_UP(ifp, m) ((NA(ifp))->if_input)(ifp, m) + +#define NM_ATOMIC_T volatile int // XXX ? +/* atomic operations */ +#include +#define NM_ATOMIC_TEST_AND_SET(p) (!atomic_cmpset_acq_int((p), 0, 1)) +#define NM_ATOMIC_CLEAR(p) atomic_store_rel_int((p), 0) + -#define NM_ATOMIC_T volatile int +MALLOC_DECLARE(M_NETMAP); + +// XXX linux struct, not used in FreeBSD +struct net_device_ops { +}; +struct hrtimer { +}; #elif defined (linux) #define NM_LOCK_T safe_spinlock_t // see bsd_glue.h #define NM_SELINFO_T wait_queue_head_t #define MBUF_LEN(m) ((m)->len) -#define NM_SEND_UP(ifp, m) netif_rx(m) +#define MBUF_IFP(m) ((m)->dev) +#define NM_SEND_UP(ifp, m) \ + do { \ + m->priority = NM_MAGIC_PRIORITY; \ + netif_rx(m); \ + } while (0) #define NM_ATOMIC_T volatile long unsigned int +// XXX a mtx would suffice here too 20130404 gl +#define NMG_LOCK_T struct semaphore +#define NMG_LOCK_INIT() sema_init(&netmap_global_lock, 1) +#define NMG_LOCK_DESTROY() +#define NMG_LOCK() down(&netmap_global_lock) +#define NMG_UNLOCK() up(&netmap_global_lock) +#define NMG_LOCK_ASSERT() // XXX to be completed + #ifndef DEV_NETMAP #define DEV_NETMAP #endif /* DEV_NETMAP */ @@ -91,9 +130,9 @@ do { \ struct timeval __xxts; \ microtime(&__xxts); \ - printf("%03d.%06d %s [%d] " format "\n", \ + printf("%03d.%06d [%4d] %-25s " format "\n", \ (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \ - __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + __LINE__, __FUNCTION__, ##__VA_ARGS__); \ } while (0) /* rate limited, lps indicates how many per second */ @@ -115,20 +154,32 @@ struct netmap_priv_d; const char *nm_dump_buf(char *p, int len, int lim, char *dst); +#include "netmap_mbq.h" + +extern NMG_LOCK_T netmap_global_lock; + /* * private, kernel view of a ring. Keeps track of the status of * a ring across system calls. * * nr_hwcur index of the next buffer to refill. - * It corresponds to ring->cur - ring->reserved + * It corresponds to ring->head + * at the time the system call returns. * - * nr_hwavail the number of slots "owned" by userspace. - * nr_hwavail =:= ring->avail + ring->reserved + * nr_hwtail index of the first buffer owned by the kernel. + * On RX, hwcur->hwtail are receive buffers + * not yet released. 
hwcur is advanced following + * ring->head, hwtail is advanced on incoming packets, + * and a wakeup is generated when hwtail passes ring->cur + * On TX, hwcur->rcur have been filled by the sender + * but not sent yet to the NIC; rcur->hwtail are available + * for new transmissions, and hwtail->hwcur-1 are pending + * transmissions not yet acknowledged. * * The indexes in the NIC and netmap rings are offset by nkr_hwofs slots. * This is so that, on a reset, buffers owned by userspace are not * modified by the kernel. In particular: - * RX rings: the next empty buffer (hwcur + hwavail + hwofs) coincides with + * RX rings: the next empty buffer (hwtail + hwofs) coincides with * the next empty buffer as known by the hardware (next_to_check or so). * TX rings: hwcur + hwofs coincides with next_to_send * @@ -146,43 +197,97 @@ const char *nm_dump_buf(char *p, int len, int lim, char *dst); * from nr_hwlease, advances it, then does the * copy outside the lock. * In RX rings (used for VALE ports), - * nkr_hwcur + nkr_hwavail <= nkr_hwlease < nkr_hwcur+N-1 + * nkr_hwtail <= nkr_hwlease < nkr_hwcur+N-1 * In TX rings (used for NIC or host stack ports) - * nkr_hwcur <= nkr_hwlease < nkr_hwcur+ nkr_hwavail + * nkr_hwcur <= nkr_hwlease < nkr_hwtail * nkr_leases array of nkr_num_slots where writers can report * completion of their block. NR_NOSLOT (~0) indicates * that the writer has not finished yet - * nkr_lease_idx index of next free slot in nr_leases, to be assigned + * nkr_lease_idx index of next free slot in nr_leases, to be assigned * * The kring is manipulated by txsync/rxsync and generic netmap function. - * q_lock is used to arbitrate access to the kring from within the netmap - * code, and this and other protections guarantee that there is never - * more than 1 concurrent call to txsync or rxsync. So we are free - * to manipulate the kring from within txsync/rxsync without any extra - * locks. + * + * Concurrent rxsync or txsync on the same ring are prevented through + * by nm_kr_lock() which in turn uses nr_busy. This is all we need + * for NIC rings, and for TX rings attached to the host stack. + * + * RX rings attached to the host stack use an mbq (rx_queue) on both + * rxsync_from_host() and netmap_transmit(). The mbq is protected + * by its internal lock. + * + * RX rings attached to the VALE switch are accessed by both sender + * and receiver. They are protected through the q_lock on the RX ring. */ struct netmap_kring { - struct netmap_ring *ring; - uint32_t nr_hwcur; - uint32_t nr_hwavail; - uint32_t nr_kflags; /* private driver flags */ -#define NKR_PENDINTR 0x1 // Pending interrupt. - uint32_t nkr_num_slots; - int32_t nkr_hwofs; /* offset between NIC and netmap ring */ + struct netmap_ring *ring; + + uint32_t nr_hwcur; + uint32_t nr_hwtail; + + /* + * Copies of values in user rings, so we do not need to look + * at the ring (which could be modified). These are set in the + * *sync_prologue()/finalize() routines. + */ + uint32_t rhead; + uint32_t rcur; + uint32_t rtail; + + uint32_t nr_kflags; /* private driver flags */ +#define NKR_PENDINTR 0x1 // Pending interrupt. + uint32_t nkr_num_slots; + + /* + * On a NIC reset, the NIC ring indexes may be reset but the + * indexes in the netmap rings remain the same. nkr_hwofs + * keeps track of the offset between the two. + */ + int32_t nkr_hwofs; uint16_t nkr_slot_flags; /* initial value for flags */ + + /* last_reclaim is opaque marker to help reduce the frequency + * of operations such as reclaiming tx buffers. 
A possible use + * is set it to ticks and do the reclaim only once per tick. + */ + uint64_t last_reclaim; + + + NM_SELINFO_T si; /* poll/select wait queue */ + NM_LOCK_T q_lock; /* protects kring and ring. */ + NM_ATOMIC_T nr_busy; /* prevent concurrent syscalls */ + struct netmap_adapter *na; + + /* The folloiwing fields are for VALE switch support */ struct nm_bdg_fwd *nkr_ft; - uint32_t *nkr_leases; -#define NR_NOSLOT ((uint32_t)~0) - uint32_t nkr_hwlease; - uint32_t nkr_lease_idx; + uint32_t *nkr_leases; +#define NR_NOSLOT ((uint32_t)~0) /* used in nkr_*lease* */ + uint32_t nkr_hwlease; + uint32_t nkr_lease_idx; + + volatile int nkr_stopped; // XXX what for ? + + /* Support for adapters without native netmap support. + * On tx rings we preallocate an array of tx buffers + * (same size as the netmap ring), on rx rings we + * store incoming mbufs in a queue that is drained by + * a rxsync. + */ + struct mbuf **tx_pool; + // u_int nr_ntc; /* Emulation of a next-to-clean RX ring pointer. */ + struct mbq rx_queue; /* intercepted rx mbufs. */ + + uint32_t ring_id; /* debugging */ + char name[64]; /* diagnostic */ + + int (*nm_sync)(struct netmap_kring *kring, int flags); - NM_SELINFO_T si; /* poll/select wait queue */ - NM_LOCK_T q_lock; /* protects kring and ring. */ - NM_ATOMIC_T nr_busy; /* prevent concurrent syscalls */ +#ifdef WITH_PIPES + struct netmap_kring *pipe; + struct netmap_ring *save_ring; +#endif /* WITH_PIPES */ - volatile int nkr_stopped; } __attribute__((__aligned__(64))); @@ -193,6 +298,15 @@ nm_next(uint32_t i, uint32_t lim) return unlikely (i == lim) ? 0 : i + 1; } + +/* return the previous index, with wraparound */ +static inline uint32_t +nm_prev(uint32_t i, uint32_t lim) +{ + return unlikely (i == 0) ? lim : i - 1; +} + + /* * * Here is the layout for the Rx and Tx rings. @@ -203,36 +317,36 @@ nm_next(uint32_t i, uint32_t lim) | | | | |XXX free slot XXX| |XXX free slot XXX| +-----------------+ +-----------------+ - | |<-hwcur | |<-hwcur - | reserved h | | (ready | - +----------- w -+ | to be | - cur->| a | | sent) h | - | v | +---------- w | - | a | cur->| (being a | - | i | | prepared) v | - | avail l | | a | - +-----------------+ + a ------ i + - | | ... | v l |<-hwlease - | (being | ... | a | ... - | prepared) | ... | i | ... - +-----------------+ ... | l | ... - | |<-hwlease +-----------------+ +head->| owned by user |<-hwcur | not sent to nic |<-hwcur + | | | yet | + +-----------------+ | | + cur->| available to | | | + | user, not read | +-----------------+ + | yet | cur->| (being | + | | | prepared) | | | | | + +-----------------+ + ------ + +tail->| |<-hwtail | |<-hwlease + | (being | ... | | ... + | prepared) | ... | | ... + +-----------------+ ... | | ... + | |<-hwlease +-----------------+ + | | tail->| |<-hwtail | | | | | | | | | | | | +-----------------+ +-----------------+ - * The cur/avail (user view) and hwcur/hwavail (kernel view) + * The cur/tail (user view) and hwcur/hwtail (kernel view) * are used in the normal operation of the card. * * When a ring is the output of a switch port (Rx ring for * a VALE port, Tx ring for the host stack or NIC), slots * are reserved in blocks through 'hwlease' which points * to the next unused slot. - * On an Rx ring, hwlease is always after hwavail, - * and completions cause avail to advance. - * On a Tx ring, hwlease is always between cur and hwavail, + * On an Rx ring, hwlease is always after hwtail, + * and completions cause hwtail to advance. 
+ * On a Tx ring, hwlease is always between cur and hwtail, * and completions cause cur to advance. * * nm_kr_space() returns the maximum number of slots that @@ -244,23 +358,26 @@ nm_next(uint32_t i, uint32_t lim) - +enum txrx { NR_RX = 0, NR_TX = 1 }; /* - * This struct extends the 'struct adapter' (or - * equivalent) device descriptor. It contains all fields needed to - * support netmap operation. + * The "struct netmap_adapter" extends the "struct adapter" + * (or equivalent) device descriptor. + * It contains all base fields needed to support netmap operation. + * There are in fact different types of netmap adapters + * (native, generic, VALE switch...) so a netmap_adapter is + * just the first field in the derived type. */ struct netmap_adapter { /* * On linux we do not have a good way to tell if an interface - * is netmap-capable. So we use the following trick: + * is netmap-capable. So we always use the following trick: * NA(ifp) points here, and the first entry (which hopefully * always exists and is at least 32 bits) contains a magic * value which we can use to detect that the interface is good. */ uint32_t magic; - uint32_t na_flags; /* future place for IFCAP_NETMAP */ + uint32_t na_flags; /* enabled, and other flags */ #define NAF_SKIP_INTR 1 /* use the regular interrupt handler. * useful during initialization */ @@ -272,17 +389,17 @@ struct netmap_adapter { #define NAF_MEM_OWNER 8 /* the adapter is responsible for the * deallocation of the memory allocator */ - int refcount; /* number of user-space descriptors using this +#define NAF_NATIVE_ON 16 /* the adapter is native and the attached + * interface is in netmap mode + */ +#define NAF_NETMAP_ON 32 /* netmap is active (either native or + * emulated. Where possible (e.g. FreeBSD) + * IFCAP_NETMAP also mirrors this flag. + */ +#define NAF_HOST_RINGS 64 /* the adapter supports the host rings */ + int active_fds; /* number of user-space descriptors using this interface, which is equal to the number of struct netmap_if objs in the mapped region. */ - /* - * The selwakeup in the interrupt thread can use per-ring - * and/or global wait queues. We track how many clients - * of each type we have so we can optimize the drivers, - * and especially avoid huge contention on the locks. - */ - int na_single; /* threads attached to a single hw queue */ - int na_multi; /* threads attached to multiple hw queues */ u_int num_rx_rings; /* number of adapter receive rings */ u_int num_tx_rings; /* number of adapter transmit rings */ @@ -297,165 +414,324 @@ struct netmap_adapter { struct netmap_kring *tx_rings; /* array of TX rings. */ struct netmap_kring *rx_rings; /* array of RX rings. */ + void *tailroom; /* space below the rings array */ + /* (used for leases) */ + + NM_SELINFO_T tx_si, rx_si; /* global wait queues */ + /* count users of the global wait queues */ + int tx_si_users, rx_si_users; + /* copy of if_qflush and if_transmit pointers, to intercept * packets from the network stack when netmap is active. */ int (*if_transmit)(struct ifnet *, struct mbuf *); + /* copy of if_input for netmap_send_up() */ + void (*if_input)(struct ifnet *, struct mbuf *); + /* references to the ifnet and device routines, used by * the generic netmap functions. */ struct ifnet *ifp; /* adapter is ifp->if_softc */ - NM_LOCK_T core_lock; /* used if no device lock available */ + /*---- callbacks for this netmap adapter -----*/ + /* + * nm_dtor() is the cleanup routine called when destroying + * the adapter. 
+ * + * nm_register() is called on NIOCREGIF and close() to enter + * or exit netmap mode on the NIC + * + * nm_txsync() pushes packets to the underlying hw/switch + * + * nm_rxsync() collects packets from the underlying hw/switch + * + * nm_config() returns configuration information from the OS + * + * nm_krings_create() create and init the krings array + * (the array layout must conform to the description + * found above the definition of netmap_krings_create) + * + * nm_krings_delete() cleanup and delete the kring array + * + * nm_notify() is used to act after data have become available. + * For hw devices this is typically a selwakeup(), + * but for NIC/host ports attached to a switch (or vice-versa) + * we also need to invoke the 'txsync' code downstream. + */ - int (*nm_register)(struct ifnet *, int onoff); + /* private cleanup */ + void (*nm_dtor)(struct netmap_adapter *); - int (*nm_txsync)(struct ifnet *, u_int ring, int flags); - int (*nm_rxsync)(struct ifnet *, u_int ring, int flags); + int (*nm_register)(struct netmap_adapter *, int onoff); + + int (*nm_txsync)(struct netmap_adapter *, u_int ring, int flags); + int (*nm_rxsync)(struct netmap_adapter *, u_int ring, int flags); #define NAF_FORCE_READ 1 #define NAF_FORCE_RECLAIM 2 /* return configuration information */ - int (*nm_config)(struct ifnet *, u_int *txr, u_int *txd, - u_int *rxr, u_int *rxd); + int (*nm_config)(struct netmap_adapter *, + u_int *txr, u_int *txd, u_int *rxr, u_int *rxd); + int (*nm_krings_create)(struct netmap_adapter *); + void (*nm_krings_delete)(struct netmap_adapter *); + int (*nm_notify)(struct netmap_adapter *, + u_int ring, enum txrx, int flags); +#define NAF_DISABLE_NOTIFY 8 + + /* standard refcount to control the lifetime of the adapter + * (it should be equal to the lifetime of the corresponding ifp) + */ + int na_refcount; + + /* memory allocator (opaque) + * We also cache a pointer to the lut_entry for translating + * buffer addresses, and the total number of buffers. + */ + struct netmap_mem_d *nm_mem; + struct lut_entry *na_lut; + uint32_t na_lut_objtotal; /* max buffer index */ + + /* used internally. If non-null, the interface cannot be bound + * from userspace + */ + void *na_private; + +#ifdef WITH_PIPES + struct netmap_pipe_adapter **na_pipes; + int na_next_pipe; + int na_max_pipes; +#endif /* WITH_PIPES */ +}; + + +/* + * If the NIC is owned by the kernel + * (i.e., bridge), neither another bridge nor user can use it; + * if the NIC is owned by a user, only users can share it. + * Evaluation must be done under NMG_LOCK(). + */ +#define NETMAP_OWNED_BY_KERN(na) (na->na_private) +#define NETMAP_OWNED_BY_ANY(na) \ + (NETMAP_OWNED_BY_KERN(na) || (na->active_fds > 0)) + + +/* + * derived netmap adapters for various types of ports + */ +struct netmap_vp_adapter { /* VALE software port */ + struct netmap_adapter up; /* * Bridge support: * * bdg_port is the port number used in the bridge; - * na_bdg_refcount is a refcount used for bridge ports, - * when it goes to 0 we can detach+free this port - * (a bridge port is always attached if it exists; - * it is not always registered) * na_bdg points to the bridge this NA is attached to. */ int bdg_port; - int na_bdg_refcount; struct nm_bridge *na_bdg; - /* When we attach a physical interface to the bridge, we - * allow the controlling process to terminate, so we need - * a place to store the netmap_priv_d data structure. - * This is only done when physical interfaces are attached to a bridge. 
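+	/* if set, the VALE forwarding code may retry a delivery when the
+	 * destination ring is full (assumption based on its use in
+	 * netmap_vale.c) */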
+ int retry; + + /* Offset of ethernet header for each packet. */ + u_int virt_hdr_len; + /* Maximum Frame Size, used in bdg_mismatch_datapath() */ + u_int mfs; +}; + + +struct netmap_hw_adapter { /* physical device */ + struct netmap_adapter up; + + struct net_device_ops nm_ndo; // XXX linux only +}; + +/* Mitigation support. */ +struct nm_generic_mit { + struct hrtimer mit_timer; + int mit_pending; + struct netmap_adapter *mit_na; /* backpointer */ +}; + +struct netmap_generic_adapter { /* emulated device */ + struct netmap_hw_adapter up; + + /* Pointer to a previously used netmap adapter. */ + struct netmap_adapter *prev; + + /* generic netmap adapters support: + * a net_device_ops struct overrides ndo_select_queue(), + * save_if_input saves the if_input hook (FreeBSD), + * mit implements rx interrupt mitigation, */ - struct netmap_priv_d *na_kpriv; + struct net_device_ops generic_ndo; + void (*save_if_input)(struct ifnet *, struct mbuf *); - /* memory allocator */ - struct netmap_mem_d *nm_mem; + struct nm_generic_mit *mit; #ifdef linux - struct net_device_ops nm_ndo; -#endif /* linux */ + netdev_tx_t (*save_start_xmit)(struct mbuf *, struct ifnet *); +#endif }; +static __inline int +netmap_real_tx_rings(struct netmap_adapter *na) +{ + return na->num_tx_rings + !!(na->na_flags & NAF_HOST_RINGS); +} + +static __inline int +netmap_real_rx_rings(struct netmap_adapter *na) +{ + return na->num_rx_rings + !!(na->na_flags & NAF_HOST_RINGS); +} + +#ifdef WITH_VALE + /* - * Available space in the ring. + * Bridge wrapper for non VALE ports attached to a VALE switch. + * + * The real device must already have its own netmap adapter (hwna). + * The bridge wrapper and the hwna adapter share the same set of + * netmap rings and buffers, but they have two separate sets of + * krings descriptors, with tx/rx meanings swapped: + * + * netmap + * bwrap krings rings krings hwna + * +------+ +------+ +-----+ +------+ +------+ + * |tx_rings->| |\ /| |----| |<-tx_rings| + * | | +------+ \ / +-----+ +------+ | | + * | | X | | + * | | / \ | | + * | | +------+/ \+-----+ +------+ | | + * |rx_rings->| | | |----| |<-rx_rings| + * | | +------+ +-----+ +------+ | | + * +------+ +------+ + * + * - packets coming from the bridge go to the brwap rx rings, + * which are also the hwna tx rings. The bwrap notify callback + * will then complete the hwna tx (see netmap_bwrap_notify). + * + * - packets coming from the outside go to the hwna rx rings, + * which are also the bwrap tx rings. The (overwritten) hwna + * notify method will then complete the bridge tx + * (see netmap_bwrap_intr_notify). + * + * The bridge wrapper may optionally connect the hwna 'host' rings + * to the bridge. This is done by using a second port in the + * bridge and connecting it to the 'host' netmap_vp_adapter + * contained in the netmap_bwrap_adapter. The brwap host adapter + * cross-links the hwna host rings in the same way as shown above. + * + * - packets coming from the bridge and directed to the host stack + * are handled by the bwrap host notify callback + * (see netmap_bwrap_host_notify) + * + * - packets coming from the host stack are still handled by the + * overwritten hwna notify callback (netmap_bwrap_intr_notify), + * but are diverted to the host adapter depending on the ring number. 
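+ *
+ * (For example, a NIC can be attached to a switch from userspace with
+ * "vale-ctl -a vale0:em1" -- switch and interface names purely
+ * illustrative -- which is the operation that creates this bwrap
+ * around the NIC's native adapter.)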
+ * */ +struct netmap_bwrap_adapter { + struct netmap_vp_adapter up; + struct netmap_vp_adapter host; /* for host rings */ + struct netmap_adapter *hwna; /* the underlying device */ + + /* backup of the hwna notify callback */ + int (*save_notify)(struct netmap_adapter *, + u_int ring, enum txrx, int flags); + + /* + * When we attach a physical interface to the bridge, we + * allow the controlling process to terminate, so we need + * a place to store the netmap_priv_d data structure. + * This is only done when physical interfaces + * are attached to a bridge. + */ + struct netmap_priv_d *na_kpriv; +}; + + +#endif /* WITH_VALE */ + +#ifdef WITH_PIPES + +#define NM_MAXPIPES 64 /* max number of pipes per adapter */ + +struct netmap_pipe_adapter { + struct netmap_adapter up; + + u_int id; /* pipe identifier */ + int role; /* either NR_REG_PIPE_MASTER or NR_REG_PIPE_SLAVE */ + + struct netmap_adapter *parent; /* adapter that owns the memory */ + struct netmap_pipe_adapter *peer; /* the other end of the pipe */ + int peer_ref; /* 1 iff we are holding a ref to the peer */ + + u_int parent_slot; /* index in the parent pipe array */ +}; + +#endif /* WITH_PIPES */ + + +/* return slots reserved to rx clients; used in drivers */ static inline uint32_t -nm_kr_space(struct netmap_kring *k, int is_rx) +nm_kr_rxspace(struct netmap_kring *k) { - int space; - - if (is_rx) { - int busy = k->nkr_hwlease - k->nr_hwcur; - if (busy < 0) - busy += k->nkr_num_slots; - space = k->nkr_num_slots - 1 - busy; - } else { - space = k->nr_hwcur + k->nr_hwavail - k->nkr_hwlease; - if (space < 0) - space += k->nkr_num_slots; - } -#if 0 - // sanity check - if (k->nkr_hwlease >= k->nkr_num_slots || - k->nr_hwcur >= k->nkr_num_slots || - k->nr_hwavail >= k->nkr_num_slots || - busy < 0 || - busy >= k->nkr_num_slots) { - D("invalid kring, cur %d avail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, - k->nkr_lease_idx, k->nkr_num_slots); - } -#endif + int space = k->nr_hwtail - k->nr_hwcur; + if (space < 0) + space += k->nkr_num_slots; + ND("preserving %d rx slots %d -> %d", space, k->nr_hwcur, k->nr_hwtail); + return space; } -/* return update position */ -static inline uint32_t -nm_kr_rxpos(struct netmap_kring *k) +/* True if no space in the tx ring. only valid after txsync_prologue */ +static inline int +nm_kr_txempty(struct netmap_kring *kring) { - uint32_t pos = k->nr_hwcur + k->nr_hwavail; - if (pos >= k->nkr_num_slots) - pos -= k->nkr_num_slots; -#if 0 - if (pos >= k->nkr_num_slots || - k->nkr_hwlease >= k->nkr_num_slots || - k->nr_hwcur >= k->nkr_num_slots || - k->nr_hwavail >= k->nkr_num_slots || - k->nkr_lease_idx >= k->nkr_num_slots) { - D("invalid kring, cur %d avail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, - k->nkr_lease_idx, k->nkr_num_slots); - } -#endif - return pos; + return kring->rcur == kring->nr_hwtail; } -/* make a lease on the kring for N positions. return the - * lease index +/* + * protect against multiple threads using the same ring. + * also check that the ring has not been stopped. + * We only care for 0 or !=0 as a return code. 
*/ -static inline uint32_t -nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) +#define NM_KR_BUSY 1 +#define NM_KR_STOPPED 2 + + +static __inline void nm_kr_put(struct netmap_kring *kr) { - uint32_t lim = k->nkr_num_slots - 1; - uint32_t lease_idx = k->nkr_lease_idx; + NM_ATOMIC_CLEAR(&kr->nr_busy); +} - k->nkr_leases[lease_idx] = NR_NOSLOT; - k->nkr_lease_idx = nm_next(lease_idx, lim); - if (n > nm_kr_space(k, is_rx)) { - D("invalid request for %d slots", n); - panic("x"); +static __inline int nm_kr_tryget(struct netmap_kring *kr) +{ + /* check a first time without taking the lock + * to avoid starvation for nm_kr_get() + */ + if (unlikely(kr->nkr_stopped)) { + ND("ring %p stopped (%d)", kr, kr->nkr_stopped); + return NM_KR_STOPPED; } - /* XXX verify that there are n slots */ - k->nkr_hwlease += n; - if (k->nkr_hwlease > lim) - k->nkr_hwlease -= lim + 1; - - if (k->nkr_hwlease >= k->nkr_num_slots || - k->nr_hwcur >= k->nkr_num_slots || - k->nr_hwavail >= k->nkr_num_slots || - k->nkr_lease_idx >= k->nkr_num_slots) { - D("invalid kring %s, cur %d avail %d lease %d lease_idx %d lim %d", - k->na->ifp->if_xname, - k->nr_hwcur, k->nr_hwavail, k->nkr_hwlease, - k->nkr_lease_idx, k->nkr_num_slots); + if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))) + return NM_KR_BUSY; + /* check a second time with lock held */ + if (unlikely(kr->nkr_stopped)) { + ND("ring %p stopped (%d)", kr, kr->nkr_stopped); + nm_kr_put(kr); + return NM_KR_STOPPED; } - return lease_idx; + return 0; } /* - * XXX NETMAP_DELETING() is unused - * - * The combination of "enable" (ifp->if_capenable & IFCAP_NETMAP) - * and refcount gives the status of the interface, namely: - * - * enable refcount Status - * - * FALSE 0 normal operation - * FALSE != 0 -- (impossible) - * TRUE 1 netmap mode - * TRUE 0 being deleted. - */ - -#define NETMAP_DELETING(_na) ( ((_na)->refcount == 0) && \ - ( (_na)->ifp->if_capenable & IFCAP_NETMAP) ) - - -/* - * The following are support routines used by individual drivers to + * The following functions are used by individual drivers to * support netmap operation. * * netmap_attach() initializes a struct netmap_adapter, allocating the @@ -472,38 +748,252 @@ nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) * netmap_reset() is a helper routine to be called in the driver * when reinitializing a ring. */ -int netmap_attach(struct netmap_adapter *, u_int); +int netmap_attach(struct netmap_adapter *); +int netmap_attach_common(struct netmap_adapter *); +void netmap_detach_common(struct netmap_adapter *na); void netmap_detach(struct ifnet *); int netmap_transmit(struct ifnet *, struct mbuf *); -enum txrx { NR_RX = 0, NR_TX = 1 }; struct netmap_slot *netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n, u_int new_cur); int netmap_ring_reinit(struct netmap_kring *); +/* default functions to handle rx/tx interrupts */ +int netmap_rx_irq(struct ifnet *, u_int, u_int *); +#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) +void netmap_common_irq(struct ifnet *, u_int, u_int *work_done); + +void netmap_disable_all_rings(struct ifnet *); +void netmap_enable_all_rings(struct ifnet *); +void netmap_disable_ring(struct netmap_kring *kr); + + +/* set/clear native flags and if_transmit/netdev_ops */ +static inline void +nm_set_native_flags(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + + na->na_flags |= (NAF_NATIVE_ON | NAF_NETMAP_ON); +#ifdef IFCAP_NETMAP /* or FreeBSD ? 
*/ + ifp->if_capenable |= IFCAP_NETMAP; +#endif +#ifdef __FreeBSD__ + na->if_transmit = ifp->if_transmit; + ifp->if_transmit = netmap_transmit; +#else + na->if_transmit = (void *)ifp->netdev_ops; + ifp->netdev_ops = &((struct netmap_hw_adapter *)na)->nm_ndo; +#endif +} + + +static inline void +nm_clear_native_flags(struct netmap_adapter *na) +{ + struct ifnet *ifp = na->ifp; + +#ifdef __FreeBSD__ + ifp->if_transmit = na->if_transmit; +#else + ifp->netdev_ops = (void *)na->if_transmit; +#endif + na->na_flags &= ~(NAF_NATIVE_ON | NAF_NETMAP_ON); +#ifdef IFCAP_NETMAP /* or FreeBSD ? */ + ifp->if_capenable &= ~IFCAP_NETMAP; +#endif +} + + +/* + * validates parameters in the ring/kring, returns a value for head + * If any error, returns ring_size to force a reinit. + */ +uint32_t nm_txsync_prologue(struct netmap_kring *); + + +/* + * validates parameters in the ring/kring, returns a value for head, + * and the 'reserved' value in the argument. + * If any error, returns ring_size lim to force a reinit. + */ +uint32_t nm_rxsync_prologue(struct netmap_kring *); + + +/* + * update kring and ring at the end of txsync. + */ +static inline void +nm_txsync_finalize(struct netmap_kring *kring) +{ + /* update ring tail to what the kernel knows */ + kring->ring->tail = kring->rtail = kring->nr_hwtail; + + /* note, head/rhead/hwcur might be behind cur/rcur + * if no carrier + */ + ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d", + kring->name, kring->nr_hwcur, kring->nr_hwtail, + kring->rhead, kring->rcur, kring->rtail); +} + + +/* + * update kring and ring at the end of rxsync + */ +static inline void +nm_rxsync_finalize(struct netmap_kring *kring) +{ + /* tell userspace that there might be new packets */ + //struct netmap_ring *ring = kring->ring; + ND("head %d cur %d tail %d -> %d", ring->head, ring->cur, ring->tail, + kring->nr_hwtail); + kring->ring->tail = kring->rtail = kring->nr_hwtail; + /* make a copy of the state for next round */ + kring->rhead = kring->ring->head; + kring->rcur = kring->ring->cur; +} + + +/* check/fix address and len in tx rings */ +#if 1 /* debug version */ +#define NM_CHECK_ADDR_LEN(_a, _l) do { \ + if (_a == netmap_buffer_base || _l > NETMAP_BUF_SIZE) { \ + RD(5, "bad addr/len ring %d slot %d idx %d len %d", \ + ring_nr, nm_i, slot->buf_idx, len); \ + if (_l > NETMAP_BUF_SIZE) \ + _l = NETMAP_BUF_SIZE; \ + } } while (0) +#else /* no debug version */ +#define NM_CHECK_ADDR_LEN(_a, _l) do { \ + if (_l > NETMAP_BUF_SIZE) \ + _l = NETMAP_BUF_SIZE; \ + } while (0) +#endif + + +/*---------------------------------------------------------------*/ +/* + * Support routines to be used with the VALE switch + */ +int netmap_update_config(struct netmap_adapter *na); +int netmap_krings_create(struct netmap_adapter *na, u_int tailroom); +void netmap_krings_delete(struct netmap_adapter *na); +int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait); + + +struct netmap_if * +netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na, + uint16_t ringid, uint32_t flags, int *err); + + + u_int nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg); +int netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +int netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na); + +#ifdef WITH_VALE /* - * The following bridge-related interfaces are used by other kernel modules - * In the version that only supports unicast or broadcast, the lookup + * The following bridge-related functions are used by other + * 
kernel modules. + * + * VALE only supports unicast or broadcast. The lookup * function can return 0 .. NM_BDG_MAXPORTS-1 for regular ports, * NM_BDG_MAXPORTS for broadcast, NM_BDG_MAXPORTS+1 for unknown. * XXX in practice "unknown" might be handled same as broadcast. */ -typedef u_int (*bdg_lookup_fn_t)(char *buf, u_int len, uint8_t *ring_nr, - struct netmap_adapter *); -int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func); -u_int netmap_bdg_learning(char *, u_int, uint8_t *, struct netmap_adapter *); -#define NM_NAME "vale" /* prefix for the bridge port name */ -#define NM_BDG_MAXPORTS 254 /* up to 32 for bitmap, 254 ok otherwise */ +typedef u_int (*bdg_lookup_fn_t)(char *buf, u_int len, + uint8_t *ring_nr, struct netmap_vp_adapter *); +u_int netmap_bdg_learning(char *, u_int, uint8_t *, + struct netmap_vp_adapter *); + +#define NM_BDG_MAXPORTS 254 /* up to 254 */ #define NM_BDG_BROADCAST NM_BDG_MAXPORTS #define NM_BDG_NOPORT (NM_BDG_MAXPORTS+1) +#define NM_NAME "vale" /* prefix for bridge port name */ + + +/* these are redefined in case of no VALE support */ +int netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +void netmap_init_bridges(void); +int netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func); + +#else /* !WITH_VALE */ +#define netmap_get_bdg_na(_1, _2, _3) 0 +#define netmap_init_bridges(_1) +#define netmap_bdg_ctl(_1, _2) EINVAL +#endif /* !WITH_VALE */ + +#ifdef WITH_PIPES +/* max number of pipes per device */ +#define NM_MAXPIPES 64 /* XXX how many? */ +/* in case of no error, returns the actual number of pipes in nmr->nr_arg1 */ +int netmap_pipe_alloc(struct netmap_adapter *, struct nmreq *nmr); +void netmap_pipe_dealloc(struct netmap_adapter *); +int netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create); +#else /* !WITH_PIPES */ +#define NM_MAXPIPES 0 +#define netmap_pipe_alloc(_1, _2) EOPNOTSUPP +#define netmap_pipe_dealloc(_1) +#define netmap_get_pipe_na(_1, _2, _3) 0 +#endif + +/* Various prototypes */ +int netmap_poll(struct cdev *dev, int events, struct thread *td); +int netmap_init(void); +void netmap_fini(void); +int netmap_get_memory(struct netmap_priv_d* p); +void netmap_dtor(void *data); +int netmap_dtor_locked(struct netmap_priv_d *priv); + +int netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td); + +/* netmap_adapter creation/destruction */ +#define NM_IFPNAME(ifp) ((ifp) ? 
(ifp)->if_xname : "zombie") + +// #define NM_DEBUG_PUTGET 1 + +#ifdef NM_DEBUG_PUTGET + +#define NM_DBG(f) __##f + +void __netmap_adapter_get(struct netmap_adapter *na); + +#define netmap_adapter_get(na) \ + do { \ + struct netmap_adapter *__na = na; \ + D("getting %p:%s (%d)", __na, NM_IFPNAME(__na->ifp), __na->na_refcount); \ + __netmap_adapter_get(__na); \ + } while (0) + +int __netmap_adapter_put(struct netmap_adapter *na); + +#define netmap_adapter_put(na) \ + ({ \ + struct netmap_adapter *__na = na; \ + D("putting %p:%s (%d)", __na, NM_IFPNAME(__na->ifp), __na->na_refcount); \ + __netmap_adapter_put(__na); \ + }) + +#else /* !NM_DEBUG_PUTGET */ + +#define NM_DBG(f) f +void netmap_adapter_get(struct netmap_adapter *na); +int netmap_adapter_put(struct netmap_adapter *na); + +#endif /* !NM_DEBUG_PUTGET */ + + +/* + * module variables + */ extern u_int netmap_buf_size; #define NETMAP_BUF_SIZE netmap_buf_size // XXX remove -extern int netmap_mitigate; +extern int netmap_mitigate; // XXX not really used extern int netmap_no_pendintr; -extern u_int netmap_total_buffers; -extern char *netmap_buffer_base; +extern u_int netmap_total_buffers; // global allocator +extern char *netmap_buffer_base; // global allocator extern int netmap_verbose; // XXX debugging enum { /* verbose flags */ NM_VERB_ON = 1, /* generic verbose */ @@ -516,18 +1006,19 @@ enum { /* verbose flags */ NM_VERB_NIC_TXSYNC = 0x2000, }; +extern int netmap_txsync_retry; +extern int netmap_generic_mit; +extern int netmap_generic_ringsize; +extern int netmap_generic_rings; + /* * NA returns a pointer to the struct netmap adapter from the ifp, * WNA is used to write it. - * SWNA() is used for the "host stack" endpoint associated - * to an interface. It is allocated together with the main NA(), - * as an array of two objects. */ #ifndef WNA #define WNA(_ifp) (_ifp)->if_pspare[0] #endif #define NA(_ifp) ((struct netmap_adapter *)WNA(_ifp)) -#define SWNA(_ifp) (NA(_ifp) + 1) /* * Macros to determine if an interface is netmap capable or netmap enabled. @@ -561,7 +1052,8 @@ enum { /* verbose flags */ #endif /* linux */ #ifdef __FreeBSD__ -/* Callback invoked by the dma machinery after a successfull dmamap_load */ + +/* Callback invoked by the dma machinery after a successful dmamap_load */ static void netmap_dmamap_cb(__unused void *arg, __unused bus_dma_segment_t * segs, __unused int nseg, __unused int error) { @@ -588,6 +1080,7 @@ netmap_reload_map(bus_dma_tag_t tag, bus_dmamap_t map, void *buf) netmap_dmamap_cb, NULL, BUS_DMA_NOWAIT); } } + #else /* linux */ /* @@ -695,16 +1188,193 @@ PNMB(struct netmap_slot *slot, uint64_t *pp) return ret; } -/* default functions to handle rx/tx interrupts */ -int netmap_rx_irq(struct ifnet *, u_int, u_int *); -#define netmap_tx_irq(_n, _q) netmap_rx_irq(_n, _q, NULL) +/* Generic version of NMB, which uses device-specific memory. */ +static inline void * +BDG_NMB(struct netmap_adapter *na, struct netmap_slot *slot) +{ + struct lut_entry *lut = na->na_lut; + uint32_t i = slot->buf_idx; + return (unlikely(i >= na->na_lut_objtotal)) ? + lut[0].vaddr : lut[i].vaddr; +} -#ifdef __FreeBSD__ -MALLOC_DECLARE(M_NETMAP); -#endif /* __FreeBSD__ */ -void netmap_disable_all_rings(struct ifnet *); -void netmap_enable_all_rings(struct ifnet *); +void netmap_txsync_to_host(struct netmap_adapter *na); + + +/* + * Structure associated to each thread which registered an interface. + * + * The first 4 fields of this structure are written by NIOCREGIF and + * read by poll() and NIOC?XSYNC. 
+ * + * There is low contention among writers (a correct user program + * should have none) and among writers and readers, so we use a + * single global lock to protect the structure initialization; + * since initialization involves the allocation of memory, + * we reuse the memory allocator lock. + * + * Read access to the structure is lock free. Readers must check that + * np_nifp is not NULL before using the other fields. + * If np_nifp is NULL initialization has not been performed, + * so they should return an error to userspace. + * + * The ref_done field is used to regulate access to the refcount in the + * memory allocator. The refcount must be incremented at most once for + * each open("/dev/netmap"). The increment is performed by the first + * function that calls netmap_get_memory() (currently called by + * mmap(), NIOCGINFO and NIOCREGIF). + * If the refcount is incremented, it is then decremented when the + * private structure is destroyed. + */ +struct netmap_priv_d { + struct netmap_if * volatile np_nifp; /* netmap if descriptor. */ + + struct netmap_adapter *np_na; + uint32_t np_flags; /* from the ioctl */ + u_int np_txqfirst, np_txqlast; /* range of tx rings to scan */ + u_int np_rxqfirst, np_rxqlast; /* range of rx rings to scan */ + uint16_t np_txpoll; /* XXX and also np_rxpoll ? */ + + struct netmap_mem_d *np_mref; /* use with NMG_LOCK held */ + /* np_refcount is only used on FreeBSD */ + int np_refcount; /* use with NMG_LOCK held */ + + /* pointers to the selinfo to be used for selrecord. + * Either the local or the global one depending on the + * number of rings. + */ + NM_SELINFO_T *np_rxsi, *np_txsi; + struct thread *np_td; /* kqueue, just debugging */ +}; + + +/* + * generic netmap emulation for devices that do not have + * native netmap support. + */ +int generic_netmap_attach(struct ifnet *ifp); + +int netmap_catch_rx(struct netmap_adapter *na, int intercept); +void generic_rx_handler(struct ifnet *ifp, struct mbuf *m);; +void netmap_catch_tx(struct netmap_generic_adapter *na, int enable); +int generic_xmit_frame(struct ifnet *ifp, struct mbuf *m, void *addr, u_int len, u_int ring_nr); +int generic_find_num_desc(struct ifnet *ifp, u_int *tx, u_int *rx); +void generic_find_num_queues(struct ifnet *ifp, u_int *txq, u_int *rxq); + +/* + * netmap_mitigation API. This is used by the generic adapter + * to reduce the number of interrupt requests/selwakeup + * to clients on incoming packets. + */ +void netmap_mitigation_init(struct nm_generic_mit *mit, struct netmap_adapter *na); +void netmap_mitigation_start(struct nm_generic_mit *mit); +void netmap_mitigation_restart(struct nm_generic_mit *mit); +int netmap_mitigation_active(struct nm_generic_mit *mit); +void netmap_mitigation_cleanup(struct nm_generic_mit *mit); + + + +/* Shared declarations for the VALE switch. */ + +/* + * Each transmit queue accumulates a batch of packets into + * a structure before forwarding. Packets to the same + * destination are put in a list using ft_next as a link field. + * ft_frags and ft_next are valid only on the first fragment. + */ +struct nm_bdg_fwd { /* forwarding entry for a bridge */ + void *ft_buf; /* netmap or indirect buffer */ + uint8_t ft_frags; /* how many fragments (only on 1st frag) */ + uint8_t _ft_port; /* dst port (unused) */ + uint16_t ft_flags; /* flags, e.g. indirect */ + uint16_t ft_len; /* src fragment len */ + uint16_t ft_next; /* next packet to same destination */ +}; + +/* struct 'virtio_net_hdr' from linux. 
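+ * (When a virtio-net header is in use on a VALE port, each frame is
+ * prefixed by this struct; bdg_mismatch_datapath() in
+ * netmap_offloadings.c uses it to emulate checksum/GSO offload between
+ * ports with different header configurations.)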
*/ +struct nm_vnet_hdr { +#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */ +#define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ + uint8_t flags; +#define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ +#define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ +#define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */ +#define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */ +#define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */ + uint8_t gso_type; + uint16_t hdr_len; + uint16_t gso_size; + uint16_t csum_start; + uint16_t csum_offset; +}; + +#define WORST_CASE_GSO_HEADER (14+40+60) /* IPv6 + TCP */ + +/* Private definitions for IPv4, IPv6, UDP and TCP headers. */ + +struct nm_iphdr { + uint8_t version_ihl; + uint8_t tos; + uint16_t tot_len; + uint16_t id; + uint16_t frag_off; + uint8_t ttl; + uint8_t protocol; + uint16_t check; + uint32_t saddr; + uint32_t daddr; + /*The options start here. */ +}; + +struct nm_tcphdr { + uint16_t source; + uint16_t dest; + uint32_t seq; + uint32_t ack_seq; + uint8_t doff; /* Data offset + Reserved */ + uint8_t flags; + uint16_t window; + uint16_t check; + uint16_t urg_ptr; +}; + +struct nm_udphdr { + uint16_t source; + uint16_t dest; + uint16_t len; + uint16_t check; +}; + +struct nm_ipv6hdr { + uint8_t priority_version; + uint8_t flow_lbl[3]; + + uint16_t payload_len; + uint8_t nexthdr; + uint8_t hop_limit; + + uint8_t saddr[16]; + uint8_t daddr[16]; +}; + +/* Type used to store a checksum (in host byte order) that hasn't been + * folded yet. + */ +#define rawsum_t uint32_t + +rawsum_t nm_csum_raw(uint8_t *data, size_t len, rawsum_t cur_sum); +uint16_t nm_csum_ipv4(struct nm_iphdr *iph); +void nm_csum_tcpudp_ipv4(struct nm_iphdr *iph, void *data, + size_t datalen, uint16_t *check); +void nm_csum_tcpudp_ipv6(struct nm_ipv6hdr *ip6h, void *data, + size_t datalen, uint16_t *check); +uint16_t nm_csum_fold(rawsum_t cur_sum); + +void bdg_mismatch_datapath(struct netmap_vp_adapter *na, + struct netmap_vp_adapter *dst_na, + struct nm_bdg_fwd *ft_p, struct netmap_ring *ring, + u_int *j, u_int lim, u_int *howmany); #endif /* _NET_NETMAP_KERN_H_ */ diff --git a/sys/dev/netmap/netmap_mbq.c b/sys/dev/netmap/netmap_mbq.c new file mode 100644 index 000000000..2606b13d4 --- /dev/null +++ b/sys/dev/netmap/netmap_mbq.c @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + */ + + +#ifdef linux +#include "bsd_glue.h" +#else /* __FreeBSD__ */ +#include +#include +#include +#include +#include +#endif /* __FreeBSD__ */ + +#include "netmap_mbq.h" + + +static inline void __mbq_init(struct mbq *q) +{ + q->head = q->tail = NULL; + q->count = 0; +} + + +void mbq_safe_init(struct mbq *q) +{ + mtx_init(&q->lock, "mbq", NULL, MTX_SPIN); + __mbq_init(q); +} + + +void mbq_init(struct mbq *q) +{ + __mbq_init(q); +} + + +static inline void __mbq_enqueue(struct mbq *q, struct mbuf *m) +{ + m->m_nextpkt = NULL; + if (q->tail) { + q->tail->m_nextpkt = m; + q->tail = m; + } else { + q->head = q->tail = m; + } + q->count++; +} + + +void mbq_safe_enqueue(struct mbq *q, struct mbuf *m) +{ + mtx_lock(&q->lock); + __mbq_enqueue(q, m); + mtx_unlock(&q->lock); +} + + +void mbq_enqueue(struct mbq *q, struct mbuf *m) +{ + __mbq_enqueue(q, m); +} + + +static inline struct mbuf *__mbq_dequeue(struct mbq *q) +{ + struct mbuf *ret = NULL; + + if (q->head) { + ret = q->head; + q->head = ret->m_nextpkt; + if (q->head == NULL) { + q->tail = NULL; + } + q->count--; + ret->m_nextpkt = NULL; + } + + return ret; +} + + +struct mbuf *mbq_safe_dequeue(struct mbq *q) +{ + struct mbuf *ret; + + mtx_lock(&q->lock); + ret = __mbq_dequeue(q); + mtx_unlock(&q->lock); + + return ret; +} + + +struct mbuf *mbq_dequeue(struct mbq *q) +{ + return __mbq_dequeue(q); +} + + +/* XXX seems pointless to have a generic purge */ +static void __mbq_purge(struct mbq *q, int safe) +{ + struct mbuf *m; + + for (;;) { + m = safe ? mbq_safe_dequeue(q) : mbq_dequeue(q); + if (m) { + m_freem(m); + } else { + break; + } + } +} + + +void mbq_purge(struct mbq *q) +{ + __mbq_purge(q, 0); +} + + +void mbq_safe_purge(struct mbq *q) +{ + __mbq_purge(q, 1); +} + + +void mbq_safe_destroy(struct mbq *q) +{ + mtx_destroy(&q->lock); +} + + +void mbq_destroy(struct mbq *q) +{ +} diff --git a/sys/dev/netmap/netmap_mbq.h b/sys/dev/netmap/netmap_mbq.h new file mode 100644 index 000000000..d273d8a8f --- /dev/null +++ b/sys/dev/netmap/netmap_mbq.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2013-2014 Vincenzo Maffione. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * $FreeBSD$ + */ + + +#ifndef __NETMAP_MBQ_H__ +#define __NETMAP_MBQ_H__ + +/* + * These function implement an mbuf tailq with an optional lock. + * The base functions act ONLY ON THE QUEUE, whereas the "safe" + * variants (mbq_safe_*) also handle the lock. + */ + +/* XXX probably rely on a previous definition of SPINLOCK_T */ +#ifdef linux +#define SPINLOCK_T safe_spinlock_t +#else +#define SPINLOCK_T struct mtx +#endif + +/* A FIFO queue of mbufs with an optional lock. */ +struct mbq { + struct mbuf *head; + struct mbuf *tail; + int count; + SPINLOCK_T lock; +}; + +/* XXX "destroy" does not match "init" as a name. + * We should also clarify whether init can be used while + * holding a lock, and whether mbq_safe_destroy() is a NOP. + */ +void mbq_init(struct mbq *q); +void mbq_destroy(struct mbq *q); +void mbq_enqueue(struct mbq *q, struct mbuf *m); +struct mbuf *mbq_dequeue(struct mbq *q); +void mbq_purge(struct mbq *q); + +/* XXX missing mbq_lock() and mbq_unlock */ + +void mbq_safe_init(struct mbq *q); +void mbq_safe_destroy(struct mbq *q); +void mbq_safe_enqueue(struct mbq *q, struct mbuf *m); +struct mbuf *mbq_safe_dequeue(struct mbq *q); +void mbq_safe_purge(struct mbq *q); + +static inline unsigned int mbq_len(struct mbq *q) +{ + return q->count; +} + +#endif /* __NETMAP_MBQ_H_ */ diff --git a/sys/dev/netmap/netmap_mem2.c b/sys/dev/netmap/netmap_mem2.c index a78904216..549184509 100644 --- a/sys/dev/netmap/netmap_mem2.c +++ b/sys/dev/netmap/netmap_mem2.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2013 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. + * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -8,7 +8,7 @@ * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. + * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -82,6 +82,21 @@ struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = { }, }; +struct netmap_obj_params netmap_min_priv_params[NETMAP_POOLS_NR] = { + [NETMAP_IF_POOL] = { + .size = 1024, + .num = 1, + }, + [NETMAP_RING_POOL] = { + .size = 5*PAGE_SIZE, + .num = 4, + }, + [NETMAP_BUF_POOL] = { + .size = 2048, + .num = 4098, + }, +}; + /* * nm_mem is the memory allocator used for all physical interfaces @@ -118,9 +133,16 @@ struct netmap_mem_d nm_mem = { /* Our memory allocator. 
*/ .config = netmap_mem_global_config, .finalize = netmap_mem_global_finalize, .deref = netmap_mem_global_deref, + + .nm_id = 1, + + .prev = &nm_mem, + .next = &nm_mem, }; +struct netmap_mem_d *netmap_last_mem_d = &nm_mem; + // XXX logically belongs to nm_mem struct lut_entry *netmap_buffer_lut; /* exported */ @@ -135,7 +157,7 @@ const struct netmap_mem_d nm_blueprint = { .objminsize = sizeof(struct netmap_if), .objmaxsize = 4096, .nummin = 1, - .nummax = 10, + .nummax = 100, }, [NETMAP_RING_POOL] = { .name = "%s_ring", @@ -167,18 +189,72 @@ const struct netmap_mem_d nm_blueprint = { #define DECLARE_SYSCTLS(id, name) \ SYSCTL_INT(_dev_netmap, OID_AUTO, name##_size, \ CTLFLAG_RW, &netmap_params[id].size, 0, "Requested size of netmap " STRINGIFY(name) "s"); \ - SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_size, \ - CTLFLAG_RD, &nm_mem.pools[id]._objsize, 0, "Current size of netmap " STRINGIFY(name) "s"); \ - SYSCTL_INT(_dev_netmap, OID_AUTO, name##_num, \ - CTLFLAG_RW, &netmap_params[id].num, 0, "Requested number of netmap " STRINGIFY(name) "s"); \ - SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_num, \ - CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s") + SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_size, \ + CTLFLAG_RD, &nm_mem.pools[id]._objsize, 0, "Current size of netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, name##_num, \ + CTLFLAG_RW, &netmap_params[id].num, 0, "Requested number of netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, name##_curr_num, \ + CTLFLAG_RD, &nm_mem.pools[id].objtotal, 0, "Current number of netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_size, \ + CTLFLAG_RW, &netmap_min_priv_params[id].size, 0, \ + "Default size of private netmap " STRINGIFY(name) "s"); \ + SYSCTL_INT(_dev_netmap, OID_AUTO, priv_##name##_num, \ + CTLFLAG_RW, &netmap_min_priv_params[id].num, 0, \ + "Default number of private netmap " STRINGIFY(name) "s") SYSCTL_DECL(_dev_netmap); DECLARE_SYSCTLS(NETMAP_IF_POOL, if); DECLARE_SYSCTLS(NETMAP_RING_POOL, ring); DECLARE_SYSCTLS(NETMAP_BUF_POOL, buf); +static int +nm_mem_assign_id(struct netmap_mem_d *nmd) +{ + nm_memid_t id; + struct netmap_mem_d *scan = netmap_last_mem_d; + int error = ENOMEM; + + NMA_LOCK(&nm_mem); + + do { + /* we rely on unsigned wrap around */ + id = scan->nm_id + 1; + if (id == 0) /* reserve 0 as error value */ + id = 1; + scan = scan->next; + if (id != scan->nm_id) { + nmd->nm_id = id; + nmd->prev = scan->prev; + nmd->next = scan; + scan->prev->next = nmd; + scan->prev = nmd; + netmap_last_mem_d = nmd; + error = 0; + break; + } + } while (scan != netmap_last_mem_d); + + NMA_UNLOCK(&nm_mem); + return error; +} + +static void +nm_mem_release_id(struct netmap_mem_d *nmd) +{ + NMA_LOCK(&nm_mem); + + nmd->prev->next = nmd->next; + nmd->next->prev = nmd->prev; + + if (netmap_last_mem_d == nmd) + netmap_last_mem_d = nmd->prev; + + nmd->prev = nmd->next = NULL; + + NMA_UNLOCK(&nm_mem); +} + + /* * First, find the allocator that contains the requested offset, * then locate the cluster through a lookup table. 
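The offset-to-physical translation just described can be sketched in standalone form as follows. The structure and field names (pool_sketch, clust_paddr) are invented for illustration and are not the kernel's netmap_obj_pool or lut_entry:

    #include <stdint.h>
    #include <stddef.h>

    struct pool_sketch {
        size_t    memtotal;      /* total bytes served by this pool */
        size_t    clustsize;     /* bytes per physically contiguous cluster */
        uint64_t *clust_paddr;   /* physical address of each cluster */
    };

    /* Walk the pools in order; the first one larger than the remaining
     * offset contains the object, then index its cluster.
     */
    static uint64_t
    ofs_to_phys_sketch(struct pool_sketch *pools, int npools, size_t offset)
    {
        int i;

        for (i = 0; i < npools; i++) {
            struct pool_sketch *p = &pools[i];

            if (offset >= p->memtotal) {
                offset -= p->memtotal;   /* not in this pool, skip it */
                continue;
            }
            return p->clust_paddr[offset / p->clustsize] +
                   offset % p->clustsize;
        }
        return 0;   /* offset past the end of the mapped region */
    }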
@@ -216,7 +292,8 @@ netmap_mem_ofstophys(struct netmap_mem_d* nmd, vm_ooffset_t offset) } int -netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags) +netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags, + nm_memid_t *id) { int error = 0; NMA_LOCK(nmd); @@ -234,6 +311,7 @@ netmap_mem_get_info(struct netmap_mem_d* nmd, u_int* size, u_int *memflags) } } *memflags = nmd->flags; + *id = nmd->nm_id; out: NMA_UNLOCK(nmd); return error; @@ -310,7 +388,7 @@ netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_ } if (p->objfree == 0) { - D("%s allocator: run out of memory", p->name); + D("no more %s objects", p->name); return NULL; } if (start) @@ -343,21 +421,34 @@ netmap_obj_malloc(struct netmap_obj_pool *p, u_int len, uint32_t *start, uint32_ /* - * free by index, not by address. This is slow, but is only used - * for a small number of objects (rings, nifp) + * free by index, not by address. + * XXX should we also cleanup the content ? */ -static void +static int netmap_obj_free(struct netmap_obj_pool *p, uint32_t j) { + uint32_t *ptr, mask; + if (j >= p->objtotal) { D("invalid index %u, max %u", j, p->objtotal); - return; + return 1; + } + ptr = &p->bitmap[j / 32]; + mask = (1 << (j % 32)); + if (*ptr & mask) { + D("ouch, double free on buffer %d", j); + return 1; + } else { + *ptr |= mask; + p->objfree++; + return 0; } - p->bitmap[j / 32] |= (1 << (j % 32)); - p->objfree++; - return; } +/* + * free by address. This is slow but is only used for a few + * objects (rings, nifp) + */ static void netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) { @@ -388,35 +479,83 @@ netmap_obj_free_va(struct netmap_obj_pool *p, void *vaddr) netmap_obj_malloc(&(n)->pools[NETMAP_BUF_POOL], NETMAP_BDG_BUF_SIZE(n), _pos, _index) +#if 0 // XXX unused /* Return the index associated to the given packet buffer */ #define netmap_buf_index(n, v) \ (netmap_obj_offset(&(n)->pools[NETMAP_BUF_POOL], (v)) / NETMAP_BDG_BUF_SIZE(n)) +#endif + +/* + * allocate extra buffers in a linked list. + * returns the actual number. 
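The kernel-side allocation follows below; on the user side the resulting chain starts at nifp->ni_bufs_head and the first 32-bit word of each buffer holds the index of the next one, with 0 terminating the list. A minimal sketch, assuming a port already opened and mmapped with extra buffers requested at NIOCREGIF time (the helper name count_extra_buffers is invented):

    #include <stdint.h>
    #include <net/netmap.h>
    #include <net/netmap_user.h>

    /* Count the extra buffers handed to this file descriptor. */
    static unsigned
    count_extra_buffers(struct netmap_if *nifp, struct netmap_ring *ring)
    {
        uint32_t idx = nifp->ni_bufs_head;   /* 0 means "no extra buffers" */
        unsigned n = 0;

        while (idx != 0) {
            char *buf = NETMAP_BUF(ring, idx);

            n++;
            idx = *(uint32_t *)(void *)buf;  /* link to the next buffer */
        }
        return n;
    }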
+ */ +uint32_t +netmap_extra_alloc(struct netmap_adapter *na, uint32_t *head, uint32_t n) +{ + struct netmap_mem_d *nmd = na->nm_mem; + uint32_t i, pos = 0; /* opaque, scan position in the bitmap */ + + NMA_LOCK(nmd); + + *head = 0; /* default, 'null' index ie empty list */ + for (i = 0 ; i < n; i++) { + uint32_t cur = *head; /* save current head */ + uint32_t *p = netmap_buf_malloc(nmd, &pos, head); + if (p == NULL) { + D("no more buffers after %d of %d", i, n); + *head = cur; /* restore */ + break; + } + RD(5, "allocate buffer %d -> %d", *head, cur); + *p = cur; /* link to previous head */ + } + + NMA_UNLOCK(nmd); + + return i; +} + +static void +netmap_extra_free(struct netmap_adapter *na, uint32_t head) +{ + struct lut_entry *lut = na->na_lut; + struct netmap_mem_d *nmd = na->nm_mem; + struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; + uint32_t i, cur, *buf; + + D("freeing the extra list"); + for (i = 0; head >=2 && head < p->objtotal; i++) { + cur = head; + buf = lut[head].vaddr; + head = *buf; + *buf = 0; + if (netmap_obj_free(p, cur)) + break; + } + if (head != 0) + D("breaking with head %d", head); + D("freed %d buffers", i); +} /* Return nonzero on error */ static int -netmap_new_bufs(struct netmap_mem_d *nmd, struct netmap_if *nifp, - struct netmap_slot *slot, u_int n) +netmap_new_bufs(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n) { struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; u_int i = 0; /* slot counter */ uint32_t pos = 0; /* slot in p->bitmap */ uint32_t index = 0; /* buffer index */ - (void)nifp; /* UNUSED */ for (i = 0; i < n; i++) { void *vaddr = netmap_buf_malloc(nmd, &pos, &index); if (vaddr == NULL) { - D("unable to locate empty packet buffer"); + D("no more buffers after %d of %d", i, n); goto cleanup; } slot[i].buf_idx = index; slot[i].len = p->_objsize; - /* XXX setting flags=NS_BUF_CHANGED forces a pointer reload - * in the NIC ring. This is a hack that hides missing - * initializations in the drivers, and should go away. 
- */ - // slot[i].flags = NS_BUF_CHANGED; + slot[i].flags = 0; } ND("allocated %d buffers, %d available, first at %d", n, p->objfree, pos); @@ -431,13 +570,25 @@ cleanup: return (ENOMEM); } +static void +netmap_mem_set_ring(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n, uint32_t index) +{ + struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; + u_int i; + + for (i = 0; i < n; i++) { + slot[i].buf_idx = index; + slot[i].len = p->_objsize; + slot[i].flags = 0; + } +} + static void -netmap_free_buf(struct netmap_mem_d *nmd, struct netmap_if *nifp, uint32_t i) +netmap_free_buf(struct netmap_mem_d *nmd, uint32_t i) { struct netmap_obj_pool *p = &nmd->pools[NETMAP_BUF_POOL]; - (void)nifp; if (i < 2 || i >= p->objtotal) { D("Cannot free buf#%d: should be in [2, %d[", i, p->objtotal); return; @@ -445,6 +596,18 @@ netmap_free_buf(struct netmap_mem_d *nmd, struct netmap_if *nifp, uint32_t i) netmap_obj_free(p, i); } + +static void +netmap_free_bufs(struct netmap_mem_d *nmd, struct netmap_slot *slot, u_int n) +{ + u_int i; + + for (i = 0; i < n; i++) { + if (slot[i].buf_idx > 2) + netmap_free_buf(nmd, slot[i].buf_idx); + } +} + static void netmap_reset_obj_allocator(struct netmap_obj_pool *p) { @@ -513,7 +676,7 @@ netmap_config_obj_allocator(struct netmap_obj_pool *p, u_int objtotal, u_int obj p->r_objsize = objsize; #define MAX_CLUSTSIZE (1<<17) -#define LINE_ROUND 64 +#define LINE_ROUND NM_CACHE_ALIGN // 64 if (objsize >= MAX_CLUSTSIZE) { /* we could do it but there is no point */ D("unsupported allocation for %d bytes", objsize); @@ -684,7 +847,9 @@ static void netmap_mem_reset_all(struct netmap_mem_d *nmd) { int i; - D("resetting %p", nmd); + + if (netmap_verbose) + D("resetting %p", nmd); for (i = 0; i < NETMAP_POOLS_NR; i++) { netmap_reset_obj_allocator(&nmd->pools[i]); } @@ -710,12 +875,14 @@ netmap_mem_finalize_all(struct netmap_mem_d *nmd) nmd->pools[NETMAP_BUF_POOL].bitmap[0] = ~3; nmd->flags |= NETMAP_MEM_FINALIZED; - D("Have %d KB for interfaces, %d KB for rings and %d MB for buffers", - nmd->pools[NETMAP_IF_POOL].memtotal >> 10, - nmd->pools[NETMAP_RING_POOL].memtotal >> 10, - nmd->pools[NETMAP_BUF_POOL].memtotal >> 20); + if (netmap_verbose) + D("interfaces %d KB, rings %d KB, buffers %d MB", + nmd->pools[NETMAP_IF_POOL].memtotal >> 10, + nmd->pools[NETMAP_RING_POOL].memtotal >> 10, + nmd->pools[NETMAP_BUF_POOL].memtotal >> 20); - D("Free buffers: %d", nmd->pools[NETMAP_BUF_POOL].objfree); + if (netmap_verbose) + D("Free buffers: %d", nmd->pools[NETMAP_BUF_POOL].objfree); return 0; @@ -731,10 +898,13 @@ netmap_mem_private_delete(struct netmap_mem_d *nmd) { if (nmd == NULL) return; - D("deleting %p", nmd); + if (netmap_verbose) + D("deleting %p", nmd); if (nmd->refcount > 0) D("bug: deleting mem allocator with refcount=%d!", nmd->refcount); - D("done deleting %p", nmd); + nm_mem_release_id(nmd); + if (netmap_verbose) + D("done deleting %p", nmd); NMA_LOCK_DESTROY(nmd); free(nmd, M_DEVBUF); } @@ -760,7 +930,8 @@ netmap_mem_private_finalize(struct netmap_mem_d *nmd) } -static void netmap_mem_private_deref(struct netmap_mem_d *nmd) +static void +netmap_mem_private_deref(struct netmap_mem_d *nmd) { NMA_LOCK(nmd); if (--nmd->refcount <= 0) @@ -768,35 +939,70 @@ static void netmap_mem_private_deref(struct netmap_mem_d *nmd) NMA_UNLOCK(nmd); } + +/* + * allocator for private memory + */ struct netmap_mem_d * -netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int rxd) +netmap_mem_private_new(const char *name, u_int txr, u_int txd, + u_int rxr, u_int 
rxd, u_int extra_bufs, u_int npipes, int *perr) { struct netmap_mem_d *d = NULL; struct netmap_obj_params p[NETMAP_POOLS_NR]; - int i; - u_int maxd; + int i, err; + u_int v, maxd; d = malloc(sizeof(struct netmap_mem_d), M_DEVBUF, M_NOWAIT | M_ZERO); - if (d == NULL) - return NULL; + if (d == NULL) { + err = ENOMEM; + goto error; + } *d = nm_blueprint; - /* XXX the rest of the code assumes the stack rings are alwasy present */ + err = nm_mem_assign_id(d); + if (err) + goto error; + + /* account for the fake host rings */ txr++; rxr++; - p[NETMAP_IF_POOL].size = sizeof(struct netmap_if) + - sizeof(ssize_t) * (txr + rxr); - p[NETMAP_IF_POOL].num = 2; + + /* copy the min values */ + for (i = 0; i < NETMAP_POOLS_NR; i++) { + p[i] = netmap_min_priv_params[i]; + } + + /* possibly increase them to fit user request */ + v = sizeof(struct netmap_if) + sizeof(ssize_t) * (txr + rxr); + if (p[NETMAP_IF_POOL].size < v) + p[NETMAP_IF_POOL].size = v; + v = 2 + 4 * npipes; + if (p[NETMAP_IF_POOL].num < v) + p[NETMAP_IF_POOL].num = v; maxd = (txd > rxd) ? txd : rxd; - p[NETMAP_RING_POOL].size = sizeof(struct netmap_ring) + - sizeof(struct netmap_slot) * maxd; - p[NETMAP_RING_POOL].num = txr + rxr; - p[NETMAP_BUF_POOL].size = 2048; /* XXX find a way to let the user choose this */ - p[NETMAP_BUF_POOL].num = rxr * (rxd + 2) + txr * (txd + 2); + v = sizeof(struct netmap_ring) + sizeof(struct netmap_slot) * maxd; + if (p[NETMAP_RING_POOL].size < v) + p[NETMAP_RING_POOL].size = v; + /* each pipe endpoint needs two tx rings (1 normal + 1 host, fake) + * and two rx rings (again, 1 normal and 1 fake host) + */ + v = txr + rxr + 8 * npipes; + if (p[NETMAP_RING_POOL].num < v) + p[NETMAP_RING_POOL].num = v; + /* for each pipe we only need the buffers for the 4 "real" rings. + * On the other end, the pipe ring dimension may be different from + * the parent port ring dimension. 
As a compromise, we allocate twice the + * space actually needed if the pipe rings were the same size as the parent rings + */ + v = (4 * npipes + rxr) * rxd + (4 * npipes + txr) * txd + 2 + extra_bufs; + /* the +2 is for the tx and rx fake buffers (indices 0 and 1) */ + if (p[NETMAP_BUF_POOL].num < v) + p[NETMAP_BUF_POOL].num = v; - D("req if %d*%d ring %d*%d buf %d*%d", + if (netmap_verbose) + D("req if %d*%d ring %d*%d buf %d*%d", p[NETMAP_IF_POOL].num, p[NETMAP_IF_POOL].size, p[NETMAP_RING_POOL].num, @@ -808,8 +1014,9 @@ netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int snprintf(d->pools[i].name, NETMAP_POOL_MAX_NAMSZ, nm_blueprint.pools[i].name, name); - if (netmap_config_obj_allocator(&d->pools[i], - p[i].num, p[i].size)) + err = netmap_config_obj_allocator(&d->pools[i], + p[i].num, p[i].size); + if (err) goto error; } @@ -820,6 +1027,8 @@ netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int return d; error: netmap_mem_private_delete(d); + if (perr) + *perr = err; return NULL; } @@ -845,7 +1054,7 @@ netmap_mem_global_config(struct netmap_mem_d *nmd) netmap_reset_obj_allocator(&nmd->pools[i]); } nmd->flags &= ~NETMAP_MEM_FINALIZED; - } + } for (i = 0; i < NETMAP_POOLS_NR; i++) { nmd->lasterr = netmap_config_obj_allocator(&nmd->pools[i], @@ -923,191 +1132,192 @@ netmap_mem_fini(void) static void netmap_free_rings(struct netmap_adapter *na) { - u_int i; + struct netmap_kring *kring; + struct netmap_ring *ring; if (!na->tx_rings) return; - for (i = 0; i < na->num_tx_rings + 1; i++) { - if (na->tx_rings[i].ring) { - netmap_ring_free(na->nm_mem, na->tx_rings[i].ring); - na->tx_rings[i].ring = NULL; - } + for (kring = na->tx_rings; kring != na->rx_rings; kring++) { + ring = kring->ring; + if (ring == NULL) + continue; + netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); + netmap_ring_free(na->nm_mem, ring); + kring->ring = NULL; } - for (i = 0; i < na->num_rx_rings + 1; i++) { - if (na->rx_rings[i].ring) { - netmap_ring_free(na->nm_mem, na->rx_rings[i].ring); - na->rx_rings[i].ring = NULL; - } + for (/* cont'd from above */; kring != na->tailroom; kring++) { + ring = kring->ring; + if (ring == NULL) + continue; + netmap_free_bufs(na->nm_mem, ring->slot, kring->nkr_num_slots); + netmap_ring_free(na->nm_mem, ring); + kring->ring = NULL; } - free(na->tx_rings, M_DEVBUF); - na->tx_rings = na->rx_rings = NULL; } - - -/* call with NMA_LOCK held */ -/* - * Allocate the per-fd structure netmap_if. - * If this is the first instance, also allocate the krings, rings etc. +/* call with NMA_LOCK held * * - * We assume that the configuration stored in na - * (number of tx/rx rings and descs) does not change while - * the interface is in netmap mode. + * Allocate netmap rings and buffers for this card + * The rings are contiguous, but have variable size. + * The kring array must follow the layout described + * in netmap_krings_create(). 
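A worked instance of the private-allocator sizing in netmap_mem_private_new above, under purely illustrative parameters (txr = rxr = 1, txd = rxd = 1024, npipes = 2, extra_bufs = 0; after the host-ring adjustment txr = rxr = 2):

    IF pool:   num  = max(1, 2 + 4*npipes)                        = 10
    RING pool: num  = max(4, txr + rxr + 8*npipes)                = 20
    BUF pool:  size = max(2048, 2048)                             = 2048
               num  = max(4098, (4*npipes + rxr)*rxd
                                + (4*npipes + txr)*txd + 2)       = 20482

The IF and RING object sizes depend on sizeof(struct netmap_if), sizeof(struct netmap_ring) and sizeof(struct netmap_slot), so they are left symbolic here.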
*/ -extern int nma_is_vp(struct netmap_adapter *na); -struct netmap_if * -netmap_mem_if_new(const char *ifname, struct netmap_adapter *na) +int +netmap_mem_rings_create(struct netmap_adapter *na) { - struct netmap_if *nifp; struct netmap_ring *ring; - ssize_t base; /* handy for relative offsets between rings and nifp */ - u_int i, len, ndesc, ntx, nrx; + u_int len, ndesc; struct netmap_kring *kring; - uint32_t *tx_leases = NULL, *rx_leases = NULL; - - /* - * verify whether virtual port need the stack ring - */ - ntx = na->num_tx_rings + 1; /* shorthand, include stack ring */ - nrx = na->num_rx_rings + 1; /* shorthand, include stack ring */ - /* - * the descriptor is followed inline by an array of offsets - * to the tx and rx rings in the shared memory region. - * For virtual rx rings we also allocate an array of - * pointers to assign to nkr_leases. - */ + u_int i; NMA_LOCK(na->nm_mem); - len = sizeof(struct netmap_if) + (nrx + ntx) * sizeof(ssize_t); - nifp = netmap_if_malloc(na->nm_mem, len); - if (nifp == NULL) { - NMA_UNLOCK(na->nm_mem); - return NULL; - } - - /* initialize base fields -- override const */ - *(u_int *)(uintptr_t)&nifp->ni_tx_rings = na->num_tx_rings; - *(u_int *)(uintptr_t)&nifp->ni_rx_rings = na->num_rx_rings; - strncpy(nifp->ni_name, ifname, (size_t)IFNAMSIZ); - - if (na->refcount) { /* already setup, we are done */ - goto final; - } - - len = (ntx + nrx) * sizeof(struct netmap_kring); - /* - * Leases are attached to TX rings on NIC/host ports, - * and to RX rings on VALE ports. - */ - if (nma_is_vp(na)) { - len += sizeof(uint32_t) * na->num_rx_desc * na->num_rx_rings; - } else { - len += sizeof(uint32_t) * na->num_tx_desc * ntx; - } - - na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO); - if (na->tx_rings == NULL) { - D("Cannot allocate krings for %s", ifname); - goto cleanup; - } - na->rx_rings = na->tx_rings + ntx; - - if (nma_is_vp(na)) { - rx_leases = (uint32_t *)(na->rx_rings + nrx); - } else { - tx_leases = (uint32_t *)(na->rx_rings + nrx); - } - - /* - * First instance, allocate netmap rings and buffers for this card - * The rings are contiguous, but have variable size. - */ - for (i = 0; i < ntx; i++) { /* Transmit rings */ - kring = &na->tx_rings[i]; - ndesc = na->num_tx_desc; - bzero(kring, sizeof(*kring)); + /* transmit rings */ + for (i =0, kring = na->tx_rings; kring != na->rx_rings; kring++, i++) { + if (kring->ring) { + ND("%s %ld already created", kring->name, kring - na->tx_rings); + continue; /* already created by somebody else */ + } + ndesc = kring->nkr_num_slots; len = sizeof(struct netmap_ring) + ndesc * sizeof(struct netmap_slot); ring = netmap_ring_malloc(na->nm_mem, len); if (ring == NULL) { - D("Cannot allocate tx_ring[%d] for %s", i, ifname); + D("Cannot allocate tx_ring"); goto cleanup; } - ND("txring[%d] at %p ofs %d", i, ring); - kring->na = na; + ND("txring at %p", ring); kring->ring = ring; - if (tx_leases) { - kring->nkr_leases = tx_leases; - tx_leases += ndesc; - } - *(uint32_t *)(uintptr_t)&ring->num_slots = kring->nkr_num_slots = ndesc; - *(ssize_t *)(uintptr_t)&ring->buf_ofs = + *(uint32_t *)(uintptr_t)&ring->num_slots = ndesc; + *(int64_t *)(uintptr_t)&ring->buf_ofs = (na->nm_mem->pools[NETMAP_IF_POOL].memtotal + na->nm_mem->pools[NETMAP_RING_POOL].memtotal) - netmap_ring_offset(na->nm_mem, ring); - /* - * IMPORTANT: - * Always keep one slot empty, so we can detect new - * transmissions comparing cur and nr_hwcur (they are - * the same only if there are no new transmissions). 
- */ - ring->avail = kring->nr_hwavail = ndesc - 1; - ring->cur = kring->nr_hwcur = 0; + /* copy values from kring */ + ring->head = kring->rhead; + ring->cur = kring->rcur; + ring->tail = kring->rtail; *(uint16_t *)(uintptr_t)&ring->nr_buf_size = NETMAP_BDG_BUF_SIZE(na->nm_mem); - ND("initializing slots for txring[%d]", i); - if (netmap_new_bufs(na->nm_mem, nifp, ring->slot, ndesc)) { - D("Cannot allocate buffers for tx_ring[%d] for %s", i, ifname); - goto cleanup; + ND("%s h %d c %d t %d", kring->name, + ring->head, ring->cur, ring->tail); + ND("initializing slots for txring"); + if (i != na->num_tx_rings || (na->na_flags & NAF_HOST_RINGS)) { + /* this is a real ring */ + if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { + D("Cannot allocate buffers for tx_ring"); + goto cleanup; + } + } else { + /* this is a fake tx ring, set all indices to 0 */ + netmap_mem_set_ring(na->nm_mem, ring->slot, ndesc, 0); } } - for (i = 0; i < nrx; i++) { /* Receive rings */ - kring = &na->rx_rings[i]; - ndesc = na->num_rx_desc; - bzero(kring, sizeof(*kring)); + /* receive rings */ + for ( i = 0 /* kring cont'd from above */ ; kring != na->tailroom; kring++, i++) { + if (kring->ring) { + ND("%s %ld already created", kring->name, kring - na->rx_rings); + continue; /* already created by somebody else */ + } + ndesc = kring->nkr_num_slots; len = sizeof(struct netmap_ring) + ndesc * sizeof(struct netmap_slot); ring = netmap_ring_malloc(na->nm_mem, len); if (ring == NULL) { - D("Cannot allocate rx_ring[%d] for %s", i, ifname); + D("Cannot allocate rx_ring"); goto cleanup; } - ND("rxring[%d] at %p ofs %d", i, ring); - - kring->na = na; + ND("rxring at %p", ring); kring->ring = ring; - if (rx_leases && i < na->num_rx_rings) { - kring->nkr_leases = rx_leases; - rx_leases += ndesc; - } - *(uint32_t *)(uintptr_t)&ring->num_slots = kring->nkr_num_slots = ndesc; - *(ssize_t *)(uintptr_t)&ring->buf_ofs = + *(uint32_t *)(uintptr_t)&ring->num_slots = ndesc; + *(int64_t *)(uintptr_t)&ring->buf_ofs = (na->nm_mem->pools[NETMAP_IF_POOL].memtotal + na->nm_mem->pools[NETMAP_RING_POOL].memtotal) - netmap_ring_offset(na->nm_mem, ring); - ring->cur = kring->nr_hwcur = 0; - ring->avail = kring->nr_hwavail = 0; /* empty */ + /* copy values from kring */ + ring->head = kring->rhead; + ring->cur = kring->rcur; + ring->tail = kring->rtail; *(int *)(uintptr_t)&ring->nr_buf_size = NETMAP_BDG_BUF_SIZE(na->nm_mem); - ND("initializing slots for rxring[%d]", i); - if (netmap_new_bufs(na->nm_mem, nifp, ring->slot, ndesc)) { - D("Cannot allocate buffers for rx_ring[%d] for %s", i, ifname); - goto cleanup; + ND("%s h %d c %d t %d", kring->name, + ring->head, ring->cur, ring->tail); + ND("initializing slots for rxring %p", ring); + if (i != na->num_rx_rings || (na->na_flags & NAF_HOST_RINGS)) { + /* this is a real ring */ + if (netmap_new_bufs(na->nm_mem, ring->slot, ndesc)) { + D("Cannot allocate buffers for rx_ring"); + goto cleanup; + } + } else { + /* this is a fake rx ring, set all indices to 1 */ + netmap_mem_set_ring(na->nm_mem, ring->slot, ndesc, 1); } } -#ifdef linux - // XXX initialize the selrecord structs. 
- for (i = 0; i < ntx; i++) - init_waitqueue_head(&na->tx_rings[i].si); - for (i = 0; i < nrx; i++) - init_waitqueue_head(&na->rx_rings[i].si); - init_waitqueue_head(&na->tx_si); - init_waitqueue_head(&na->rx_si); -#endif -final: + + NMA_UNLOCK(na->nm_mem); + + return 0; + +cleanup: + netmap_free_rings(na); + + NMA_UNLOCK(na->nm_mem); + + return ENOMEM; +} + +void +netmap_mem_rings_delete(struct netmap_adapter *na) +{ + /* last instance, release bufs and rings */ + NMA_LOCK(na->nm_mem); + + netmap_free_rings(na); + + NMA_UNLOCK(na->nm_mem); +} + + +/* call with NMA_LOCK held */ +/* + * Allocate the per-fd structure netmap_if. + * + * We assume that the configuration stored in na + * (number of tx/rx rings and descs) does not change while + * the interface is in netmap mode. + */ +struct netmap_if * +netmap_mem_if_new(const char *ifname, struct netmap_adapter *na) +{ + struct netmap_if *nifp; + ssize_t base; /* handy for relative offsets between rings and nifp */ + u_int i, len, ntx, nrx; + + /* account for the (eventually fake) host rings */ + ntx = na->num_tx_rings + 1; + nrx = na->num_rx_rings + 1; + /* + * the descriptor is followed inline by an array of offsets + * to the tx and rx rings in the shared memory region. + */ + + NMA_LOCK(na->nm_mem); + + len = sizeof(struct netmap_if) + (nrx + ntx) * sizeof(ssize_t); + nifp = netmap_if_malloc(na->nm_mem, len); + if (nifp == NULL) { + NMA_UNLOCK(na->nm_mem); + return NULL; + } + + /* initialize base fields -- override const */ + *(u_int *)(uintptr_t)&nifp->ni_tx_rings = na->num_tx_rings; + *(u_int *)(uintptr_t)&nifp->ni_rx_rings = na->num_rx_rings; + strncpy(nifp->ni_name, ifname, (size_t)IFNAMSIZ); + /* * fill the slots for the rx and tx rings. They contain the offset * between the ring and nifp, so the information is usable in @@ -1126,13 +1336,6 @@ final: NMA_UNLOCK(na->nm_mem); return (nifp); -cleanup: - netmap_free_rings(na); - netmap_if_free(na->nm_mem, nifp); - - NMA_UNLOCK(na->nm_mem); - - return NULL; } void @@ -1142,26 +1345,8 @@ netmap_mem_if_delete(struct netmap_adapter *na, struct netmap_if *nifp) /* nothing to do */ return; NMA_LOCK(na->nm_mem); - - if (na->refcount <= 0) { - /* last instance, release bufs and rings */ - u_int i, j, lim; - struct netmap_ring *ring; - - for (i = 0; i < na->num_tx_rings + 1; i++) { - ring = na->tx_rings[i].ring; - lim = na->tx_rings[i].nkr_num_slots; - for (j = 0; j < lim; j++) - netmap_free_buf(na->nm_mem, nifp, ring->slot[j].buf_idx); - } - for (i = 0; i < na->num_rx_rings + 1; i++) { - ring = na->rx_rings[i].ring; - lim = na->rx_rings[i].nkr_num_slots; - for (j = 0; j < lim; j++) - netmap_free_buf(na->nm_mem, nifp, ring->slot[j].buf_idx); - } - netmap_free_rings(na); - } + if (nifp->ni_bufs_head) + netmap_extra_free(na, nifp->ni_bufs_head); netmap_if_free(na->nm_mem, nifp); NMA_UNLOCK(na->nm_mem); @@ -1179,12 +1364,14 @@ netmap_mem_global_deref(struct netmap_mem_d *nmd) NMA_UNLOCK(nmd); } -int netmap_mem_finalize(struct netmap_mem_d *nmd) +int +netmap_mem_finalize(struct netmap_mem_d *nmd) { return nmd->finalize(nmd); } -void netmap_mem_deref(struct netmap_mem_d *nmd) +void +netmap_mem_deref(struct netmap_mem_d *nmd) { return nmd->deref(nmd); } diff --git a/sys/dev/netmap/netmap_mem2.h b/sys/dev/netmap/netmap_mem2.h index 83f31d011..e83616a51 100644 --- a/sys/dev/netmap/netmap_mem2.h +++ b/sys/dev/netmap/netmap_mem2.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2013 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. 
+ * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -160,6 +160,7 @@ typedef int (*netmap_mem_config_t)(struct netmap_mem_d*); typedef int (*netmap_mem_finalize_t)(struct netmap_mem_d*); typedef void (*netmap_mem_deref_t)(struct netmap_mem_d*); +typedef uint16_t nm_memid_t; /* We implement two kinds of netmap_mem_d structures: * @@ -189,9 +190,14 @@ struct netmap_mem_d { /* the three allocators */ struct netmap_obj_pool pools[NETMAP_POOLS_NR]; - netmap_mem_config_t config; + netmap_mem_config_t config; netmap_mem_finalize_t finalize; netmap_mem_deref_t deref; + + nm_memid_t nm_id; /* allocator identifier */ + + /* list of all existing allocators, sorted by nm_id */ + struct netmap_mem_d *prev, *next; }; extern struct netmap_mem_d nm_mem; @@ -200,17 +206,22 @@ vm_paddr_t netmap_mem_ofstophys(struct netmap_mem_d *, vm_ooffset_t); int netmap_mem_finalize(struct netmap_mem_d *); int netmap_mem_init(void); void netmap_mem_fini(void); -struct netmap_if * netmap_mem_if_new(const char *, struct netmap_adapter *); -void netmap_mem_if_delete(struct netmap_adapter *na, struct netmap_if *nifp); +struct netmap_if * + netmap_mem_if_new(const char *, struct netmap_adapter *); +void netmap_mem_if_delete(struct netmap_adapter *, struct netmap_if *); +int netmap_mem_rings_create(struct netmap_adapter *); +void netmap_mem_rings_delete(struct netmap_adapter *); void netmap_mem_deref(struct netmap_mem_d *); -int netmap_mem_get_info(struct netmap_mem_d *nm_mem, u_int *size, u_int *memflags); -ssize_t netmap_mem_if_offset(struct netmap_mem_d *nm_mem, const void *vaddr); -struct netmap_mem_d* - netmap_mem_private_new(const char *name, u_int txr, u_int txd, u_int rxr, u_int rxd); -void netmap_mem_private_delete(struct netmap_mem_d *nm_mem); +int netmap_mem_get_info(struct netmap_mem_d *, u_int *size, u_int *memflags, uint16_t *id); +ssize_t netmap_mem_if_offset(struct netmap_mem_d *, const void *vaddr); +struct netmap_mem_d* netmap_mem_private_new(const char *name, + u_int txr, u_int txd, u_int rxr, u_int rxd, u_int extra_bufs, u_int npipes, + int* error); +void netmap_mem_private_delete(struct netmap_mem_d *); #define NETMAP_BDG_BUF_SIZE(n) ((n)->pools[NETMAP_BUF_POOL]._objsize) +uint32_t netmap_extra_alloc(struct netmap_adapter *, uint32_t *, uint32_t n); #endif diff --git a/sys/dev/netmap/netmap_offloadings.c b/sys/dev/netmap/netmap_offloadings.c new file mode 100644 index 000000000..a776a2424 --- /dev/null +++ b/sys/dev/netmap/netmap_offloadings.c @@ -0,0 +1,401 @@ +/* + * Copyright (C) 2014 Vincenzo Maffione. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* $FreeBSD$ */ + +#if defined(__FreeBSD__) +#include /* prerequisite */ + +#include +#include +#include /* defines used in kernel.h */ +#include /* types used in module initialization */ +#include +#include /* struct socket */ +#include /* sockaddrs */ +#include +#include +#include /* bus_dmamap_* */ +#include + +#elif defined(linux) + +#include "bsd_glue.h" + +#elif defined(__APPLE__) + +#warning OSX support is only partial +#include "osx_glue.h" + +#else + +#error Unsupported platform + +#endif /* unsupported */ + +#include +#include + + + +/* This routine is called by bdg_mismatch_datapath() when it finishes + * accumulating bytes for a segment, in order to fix some fields in the + * segment headers (which still contain the same content as the header + * of the original GSO packet). 'buf' points to the beginning (e.g. + * the ethernet header) of the segment, and 'len' is its length. + */ +static void gso_fix_segment(uint8_t *buf, size_t len, u_int idx, + u_int segmented_bytes, u_int last_segment, + u_int tcp, u_int iphlen) +{ + struct nm_iphdr *iph = (struct nm_iphdr *)(buf + 14); + struct nm_ipv6hdr *ip6h = (struct nm_ipv6hdr *)(buf + 14); + uint16_t *check = NULL; + uint8_t *check_data = NULL; + + if (iphlen == 20) { + /* Set the IPv4 "Total Length" field. */ + iph->tot_len = htobe16(len-14); + ND("ip total length %u", be16toh(ip->tot_len)); + + /* Set the IPv4 "Identification" field. */ + iph->id = htobe16(be16toh(iph->id) + idx); + ND("ip identification %u", be16toh(iph->id)); + + /* Compute and insert the IPv4 header checksum. */ + iph->check = 0; + iph->check = nm_csum_ipv4(iph); + ND("IP csum %x", be16toh(iph->check)); + } else {/* if (iphlen == 40) */ + /* Set the IPv6 "Payload Len" field. */ + ip6h->payload_len = htobe16(len-14-iphlen); + } + + if (tcp) { + struct nm_tcphdr *tcph = (struct nm_tcphdr *)(buf + 14 + iphlen); + + /* Set the TCP sequence number. */ + tcph->seq = htobe32(be32toh(tcph->seq) + segmented_bytes); + ND("tcp seq %u", be32toh(tcph->seq)); + + /* Zero the PSH and FIN TCP flags if this is not the last + segment. */ + if (!last_segment) + tcph->flags &= ~(0x8 | 0x1); + ND("last_segment %u", last_segment); + + check = &tcph->check; + check_data = (uint8_t *)tcph; + } else { /* UDP */ + struct nm_udphdr *udph = (struct nm_udphdr *)(buf + 14 + iphlen); + + /* Set the UDP 'Length' field. */ + udph->len = htobe16(len-14-iphlen); + + check = &udph->check; + check_data = (uint8_t *)udph; + } + + /* Compute and insert TCP/UDP checksum. */ + *check = 0; + if (iphlen == 20) + nm_csum_tcpudp_ipv4(iph, check_data, len-14-iphlen, check); + else + nm_csum_tcpudp_ipv6(ip6h, check_data, len-14-iphlen, check); + + ND("TCP/UDP csum %x", be16toh(*check)); +} + + +/* The VALE mismatch datapath implementation. 
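Before the implementation below, a worked example of the segmentation driven by gso_fix_segment() may help (all numbers are illustrative, not taken from the code): a TCP/IPv4 GSO packet with 4000 payload bytes and a standard 54-byte Ethernet+IP+TCP header (gso_hdr_len = 14 + 20 + 4*5) sent toward a destination with mfs = 1514 is cut into segments carrying at most 1514 - 54 = 1460 payload bytes:

    segment 0: 1460 payload bytes, TCP seq += 0,    IPv4 id += 0, PSH/FIN cleared
    segment 1: 1460 payload bytes, TCP seq += 1460, IPv4 id += 1, PSH/FIN cleared
    segment 2: 1080 payload bytes, TCP seq += 2920, IPv4 id += 2, PSH/FIN kept

The IP total length, the IP header checksum and the TCP checksum are recomputed for every segment.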
*/ +void bdg_mismatch_datapath(struct netmap_vp_adapter *na, + struct netmap_vp_adapter *dst_na, + struct nm_bdg_fwd *ft_p, struct netmap_ring *ring, + u_int *j, u_int lim, u_int *howmany) +{ + struct netmap_slot *slot = NULL; + struct nm_vnet_hdr *vh = NULL; + /* Number of source slots to process. */ + u_int frags = ft_p->ft_frags; + struct nm_bdg_fwd *ft_end = ft_p + frags; + + /* Source and destination pointers. */ + uint8_t *dst, *src; + size_t src_len, dst_len; + + u_int j_start = *j; + u_int dst_slots = 0; + + /* If the source port uses the offloadings, while destination doesn't, + * we grab the source virtio-net header and do the offloadings here. + */ + if (na->virt_hdr_len && !dst_na->virt_hdr_len) { + vh = (struct nm_vnet_hdr *)ft_p->ft_buf; + } + + /* Init source and dest pointers. */ + src = ft_p->ft_buf; + src_len = ft_p->ft_len; + slot = &ring->slot[*j]; + dst = BDG_NMB(&dst_na->up, slot); + dst_len = src_len; + + /* We are processing the first input slot and there is a mismatch + * between source and destination virt_hdr_len (SHL and DHL). + * When the a client is using virtio-net headers, the header length + * can be: + * - 10: the header corresponds to the struct nm_vnet_hdr + * - 12: the first 10 bytes correspond to the struct + * virtio_net_hdr, and the last 2 bytes store the + * "mergeable buffers" info, which is an optional + * hint that can be zeroed for compability + * + * The destination header is therefore built according to the + * following table: + * + * SHL | DHL | destination header + * ----------------------------- + * 0 | 10 | zero + * 0 | 12 | zero + * 10 | 0 | doesn't exist + * 10 | 12 | first 10 bytes are copied from source header, last 2 are zero + * 12 | 0 | doesn't exist + * 12 | 10 | copied from the first 10 bytes of source header + */ + bzero(dst, dst_na->virt_hdr_len); + if (na->virt_hdr_len && dst_na->virt_hdr_len) + memcpy(dst, src, sizeof(struct nm_vnet_hdr)); + /* Skip the virtio-net headers. */ + src += na->virt_hdr_len; + src_len -= na->virt_hdr_len; + dst += dst_na->virt_hdr_len; + dst_len = dst_na->virt_hdr_len + src_len; + + /* Here it could be dst_len == 0 (which implies src_len == 0), + * so we avoid passing a zero length fragment. + */ + if (dst_len == 0) { + ft_p++; + src = ft_p->ft_buf; + src_len = ft_p->ft_len; + dst_len = src_len; + } + + if (vh && vh->gso_type != VIRTIO_NET_HDR_GSO_NONE) { + u_int gso_bytes = 0; + /* Length of the GSO packet header. */ + u_int gso_hdr_len = 0; + /* Pointer to the GSO packet header. Assume it is in a single fragment. */ + uint8_t *gso_hdr = NULL; + /* Index of the current segment. */ + u_int gso_idx = 0; + /* Payload data bytes segmented so far (e.g. TCP data bytes). */ + u_int segmented_bytes = 0; + /* Length of the IP header (20 if IPv4, 40 if IPv6). */ + u_int iphlen = 0; + /* Is this a TCP or an UDP GSO packet? */ + u_int tcp = ((vh->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) + == VIRTIO_NET_HDR_GSO_UDP) ? 0 : 1; + + /* Segment the GSO packet contained into the input slots (frags). */ + while (ft_p != ft_end) { + size_t copy; + + /* Grab the GSO header if we don't have it. */ + if (!gso_hdr) { + uint16_t ethertype; + + gso_hdr = src; + + /* Look at the 'Ethertype' field to see if this packet + * is IPv4 or IPv6. + */ + ethertype = be16toh(*((uint16_t *)(gso_hdr + 12))); + if (ethertype == 0x0800) + iphlen = 20; + else /* if (ethertype == 0x86DD) */ + iphlen = 40; + ND(3, "type=%04x", ethertype); + + /* Compute gso_hdr_len. For TCP we need to read the + * content of the 'Data Offset' field. 
+ */ + if (tcp) { + struct nm_tcphdr *tcph = + (struct nm_tcphdr *)&gso_hdr[14+iphlen]; + + gso_hdr_len = 14 + iphlen + 4*(tcph->doff >> 4); + } else + gso_hdr_len = 14 + iphlen + 8; /* UDP */ + + ND(3, "gso_hdr_len %u gso_mtu %d", gso_hdr_len, + dst_na->mfs); + + /* Advance source pointers. */ + src += gso_hdr_len; + src_len -= gso_hdr_len; + if (src_len == 0) { + ft_p++; + if (ft_p == ft_end) + break; + src = ft_p->ft_buf; + src_len = ft_p->ft_len; + continue; + } + } + + /* Fill in the header of the current segment. */ + if (gso_bytes == 0) { + memcpy(dst, gso_hdr, gso_hdr_len); + gso_bytes = gso_hdr_len; + } + + /* Fill in data and update source and dest pointers. */ + copy = src_len; + if (gso_bytes + copy > dst_na->mfs) + copy = dst_na->mfs - gso_bytes; + memcpy(dst + gso_bytes, src, copy); + gso_bytes += copy; + src += copy; + src_len -= copy; + + /* A segment is complete or we have processed all the + the GSO payload bytes. */ + if (gso_bytes >= dst_na->mfs || + (src_len == 0 && ft_p + 1 == ft_end)) { + /* After raw segmentation, we must fix some header + * fields and compute checksums, in a protocol dependent + * way. */ + gso_fix_segment(dst, gso_bytes, gso_idx, + segmented_bytes, + src_len == 0 && ft_p + 1 == ft_end, + tcp, iphlen); + + ND("frame %u completed with %d bytes", gso_idx, (int)gso_bytes); + slot->len = gso_bytes; + slot->flags = 0; + segmented_bytes += gso_bytes - gso_hdr_len; + + dst_slots++; + + /* Next destination slot. */ + *j = nm_next(*j, lim); + slot = &ring->slot[*j]; + dst = BDG_NMB(&dst_na->up, slot); + + gso_bytes = 0; + gso_idx++; + } + + /* Next input slot. */ + if (src_len == 0) { + ft_p++; + if (ft_p == ft_end) + break; + src = ft_p->ft_buf; + src_len = ft_p->ft_len; + } + } + ND(3, "%d bytes segmented", segmented_bytes); + + } else { + /* Address of a checksum field into a destination slot. */ + uint16_t *check = NULL; + /* Accumulator for an unfolded checksum. */ + rawsum_t csum = 0; + + /* Process a non-GSO packet. */ + + /* Init 'check' if necessary. */ + if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { + if (unlikely(vh->csum_offset + vh->csum_start > src_len)) + D("invalid checksum request"); + else + check = (uint16_t *)(dst + vh->csum_start + + vh->csum_offset); + } + + while (ft_p != ft_end) { + /* Init/update the packet checksum if needed. */ + if (vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { + if (!dst_slots) + csum = nm_csum_raw(src + vh->csum_start, + src_len - vh->csum_start, 0); + else + csum = nm_csum_raw(src, src_len, csum); + } + + /* Round to a multiple of 64 */ + src_len = (src_len + 63) & ~63; + + if (ft_p->ft_flags & NS_INDIRECT) { + if (copyin(src, dst, src_len)) { + /* Invalid user pointer, pretend len is 0. */ + dst_len = 0; + } + } else { + memcpy(dst, src, (int)src_len); + } + slot->len = dst_len; + + dst_slots++; + + /* Next destination slot. */ + *j = nm_next(*j, lim); + slot = &ring->slot[*j]; + dst = BDG_NMB(&dst_na->up, slot); + + /* Next source slot. */ + ft_p++; + src = ft_p->ft_buf; + dst_len = src_len = ft_p->ft_len; + + } + + /* Finalize (fold) the checksum if needed. */ + if (check && vh && (vh->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) { + *check = nm_csum_fold(csum); + } + ND(3, "using %u dst_slots", dst_slots); + + /* A second pass on the desitations slots to set the slot flags, + * using the right number of destination slots. 
+ */ + while (j_start != *j) { + slot = &ring->slot[j_start]; + slot->flags = (dst_slots << 8)| NS_MOREFRAG; + j_start = nm_next(j_start, lim); + } + /* Clear NS_MOREFRAG flag on last entry. */ + slot->flags = (dst_slots << 8); + } + + /* Update howmany. */ + if (unlikely(dst_slots > *howmany)) { + dst_slots = *howmany; + D("Slot allocation error: Should never happen"); + } + *howmany -= dst_slots; +} diff --git a/sys/dev/netmap/netmap_pipe.c b/sys/dev/netmap/netmap_pipe.c new file mode 100644 index 000000000..f8f29fa17 --- /dev/null +++ b/sys/dev/netmap/netmap_pipe.c @@ -0,0 +1,711 @@ +/* + * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* $FreeBSD$ */ + +#if defined(__FreeBSD__) +#include /* prerequisite */ + +#include +#include +#include /* defines used in kernel.h */ +#include /* types used in module initialization */ +#include +#include +#include +#include +#include +#include +#include /* sockaddrs */ +#include +#include +#include /* bus_dmamap_* */ +#include + + +#elif defined(linux) + +#include "bsd_glue.h" + +#elif defined(__APPLE__) + +#warning OSX support is only partial +#include "osx_glue.h" + +#else + +#error Unsupported platform + +#endif /* unsupported */ + +/* + * common headers + */ + +#include +#include +#include + +#ifdef WITH_PIPES + +#define NM_PIPE_MAXSLOTS 4096 + +int netmap_default_pipes = 0; /* default number of pipes for each nic */ +SYSCTL_DECL(_dev_netmap); +SYSCTL_INT(_dev_netmap, OID_AUTO, default_pipes, CTLFLAG_RW, &netmap_default_pipes, 0 , ""); + +/* allocate the pipe array in the parent adapter */ +int +netmap_pipe_alloc(struct netmap_adapter *na, struct nmreq *nmr) +{ + size_t len; + int mode = nmr->nr_flags & NR_REG_MASK; + u_int npipes; + + if (mode == NR_REG_PIPE_MASTER || mode == NR_REG_PIPE_SLAVE) { + /* this is for our parent, not for us */ + return 0; + } + + /* TODO: we can resize the array if the new + * request can accomodate the already existing pipes + */ + if (na->na_pipes) { + nmr->nr_arg1 = na->na_max_pipes; + return 0; + } + + npipes = nmr->nr_arg1; + if (npipes == 0) + npipes = netmap_default_pipes; + nm_bound_var(&npipes, 0, 0, NM_MAXPIPES, NULL); + + if (npipes == 0) { + /* really zero, nothing to alloc */ + goto out; + } + + len = sizeof(struct netmap_pipe_adapter *) * npipes; + na->na_pipes = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO); + if (na->na_pipes == NULL) + return ENOMEM; + + na->na_max_pipes = npipes; + na->na_next_pipe = 0; + +out: + nmr->nr_arg1 = npipes; + + return 0; +} + +/* deallocate the parent array in the parent adapter */ +void +netmap_pipe_dealloc(struct netmap_adapter *na) +{ + if (na->na_pipes) { + ND("freeing pipes for %s", NM_IFPNAME(na->ifp)); + free(na->na_pipes, M_DEVBUF); + na->na_pipes = NULL; + na->na_max_pipes = 0; + na->na_next_pipe = 0; + } +} + +/* find a pipe endpoint with the given id among the parent's pipes */ +static struct netmap_pipe_adapter * +netmap_pipe_find(struct netmap_adapter *parent, u_int pipe_id) +{ + int i; + struct netmap_pipe_adapter *na; + + for (i = 0; i < parent->na_next_pipe; i++) { + na = parent->na_pipes[i]; + if (na->id == pipe_id) { + return na; + } + } + return NULL; +} + +/* add a new pipe endpoint to the parent array */ +static int +netmap_pipe_add(struct netmap_adapter *parent, struct netmap_pipe_adapter *na) +{ + if (parent->na_next_pipe >= parent->na_max_pipes) { + D("%s: no space left for pipes", NM_IFPNAME(parent->ifp)); + return ENOMEM; + } + + parent->na_pipes[parent->na_next_pipe] = na; + na->parent_slot = parent->na_next_pipe; + parent->na_next_pipe++; + return 0; +} + +/* remove the given pipe endpoint from the parent array */ +static void +netmap_pipe_remove(struct netmap_adapter *parent, struct netmap_pipe_adapter *na) +{ + u_int n; + n = --parent->na_next_pipe; + if (n != na->parent_slot) { + parent->na_pipes[na->parent_slot] = + parent->na_pipes[n]; + } + parent->na_pipes[n] = NULL; +} + +static int +netmap_pipe_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *txkring = na->tx_rings + ring_nr, + *rxkring = txkring->pipe; + u_int limit; /* slots to transfer */ + u_int j, k, lim_tx = txkring->nkr_num_slots - 1, + lim_rx = rxkring->nkr_num_slots - 1; 
+ int m, busy; + + ND("%p: %s %x -> %s", txkring, txkring->name, flags, rxkring->name); + ND(2, "before: hwcur %d hwtail %d cur %d head %d tail %d", txkring->nr_hwcur, txkring->nr_hwtail, + txkring->rcur, txkring->rhead, txkring->rtail); + + j = rxkring->nr_hwtail; /* RX */ + k = txkring->nr_hwcur; /* TX */ + m = txkring->rhead - txkring->nr_hwcur; /* new slots */ + if (m < 0) + m += txkring->nkr_num_slots; + limit = m; + m = rxkring->nkr_num_slots - 1; /* max avail space on destination */ + busy = j - rxkring->nr_hwcur; /* busy slots */ + if (busy < 0) + busy += txkring->nkr_num_slots; + m -= busy; /* subtract busy slots */ + ND(2, "m %d limit %d", m, limit); + if (m < limit) + limit = m; + + if (limit == 0) { + /* either the rxring is full, or nothing to send */ + nm_txsync_finalize(txkring); /* actually useless */ + return 0; + } + + while (limit-- > 0) { + struct netmap_slot *rs = &rxkring->save_ring->slot[j]; + struct netmap_slot *ts = &txkring->ring->slot[k]; + struct netmap_slot tmp; + + /* swap the slots */ + tmp = *rs; + *rs = *ts; + *ts = tmp; + + /* no need to report the buffer change */ + + j = nm_next(j, lim_rx); + k = nm_next(k, lim_tx); + } + + wmb(); /* make sure the slots are updated before publishing them */ + rxkring->nr_hwtail = j; + txkring->nr_hwcur = k; + txkring->nr_hwtail = nm_prev(k, lim_tx); + + nm_txsync_finalize(txkring); + ND(2, "after: hwcur %d hwtail %d cur %d head %d tail %d j %d", txkring->nr_hwcur, txkring->nr_hwtail, + txkring->rcur, txkring->rhead, txkring->rtail, j); + + wmb(); /* make sure rxkring->nr_hwtail is updated before notifying */ + rxkring->na->nm_notify(rxkring->na, rxkring->ring_id, NR_RX, 0); + + return 0; +} + +static int +netmap_pipe_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *rxkring = na->rx_rings + ring_nr, + *txkring = rxkring->pipe; + uint32_t oldhwcur = rxkring->nr_hwcur; + + ND("%s %x <- %s", rxkring->name, flags, txkring->name); + rxkring->nr_hwcur = rxkring->rhead; /* recover user-relased slots */ + ND(5, "hwcur %d hwtail %d cur %d head %d tail %d", rxkring->nr_hwcur, rxkring->nr_hwtail, + rxkring->rcur, rxkring->rhead, rxkring->rtail); + rmb(); /* paired with the first wmb() in txsync */ + nm_rxsync_finalize(rxkring); + + if (oldhwcur != rxkring->nr_hwcur) { + /* we have released some slots, notify the other end */ + wmb(); /* make sure nr_hwcur is updated before notifying */ + txkring->na->nm_notify(txkring->na, txkring->ring_id, NR_TX, 0); + } + return 0; +} + +/* Pipe endpoints are created and destroyed together, so that endopoints do not + * have to check for the existence of their peer at each ?xsync. + * + * To play well with the existing netmap infrastructure (refcounts etc.), we + * adopt the following strategy: + * + * 1) The first endpoint that is created also creates the other endpoint and + * grabs a reference to it. + * + * state A) user1 --> endpoint1 --> endpoint2 + * + * 2) If, starting from state A, endpoint2 is then registered, endpoint1 gives + * its reference to the user: + * + * state B) user1 --> endpoint1 endpoint2 <--- user2 + * + * 3) Assume that, starting from state B endpoint2 is closed. In the unregister + * callback endpoint2 notes that endpoint1 is still active and adds a reference + * from endpoint1 to itself. When user2 then releases her own reference, + * endpoint2 is not destroyed and we are back to state A. A symmetrical state + * would be reached if endpoint1 were released instead. 
+ * + * 4) If, starting from state A, endpoint1 is closed, the destructor notes that + * it owns a reference to endpoint2 and releases it. + * + * Something similar goes on for the creation and destruction of the krings. + */ + + +/* netmap_pipe_krings_delete. + * + * There are two cases: + * + * 1) state is + * + * usr1 --> e1 --> e2 + * + * and we are e1. We have to create both sets + * of krings. + * + * 2) state is + * + * usr1 --> e1 --> e2 + * + * and we are e2. e1 is certainly registered and our + * krings already exist, but they may be hidden. + */ +static int +netmap_pipe_krings_create(struct netmap_adapter *na) +{ + struct netmap_pipe_adapter *pna = + (struct netmap_pipe_adapter *)na; + struct netmap_adapter *ona = &pna->peer->up; + int error = 0; + if (pna->peer_ref) { + int i; + + /* case 1) above */ + D("%p: case 1, create everything", na); + error = netmap_krings_create(na, 0); + if (error) + goto err; + + /* we also create all the rings, since we need to + * update the save_ring pointers. + * netmap_mem_rings_create (called by our caller) + * will not create the rings again + */ + + error = netmap_mem_rings_create(na); + if (error) + goto del_krings1; + + /* update our hidden ring pointers */ + for (i = 0; i < na->num_tx_rings + 1; i++) + na->tx_rings[i].save_ring = na->tx_rings[i].ring; + for (i = 0; i < na->num_rx_rings + 1; i++) + na->rx_rings[i].save_ring = na->rx_rings[i].ring; + + /* now, create krings and rings of the other end */ + error = netmap_krings_create(ona, 0); + if (error) + goto del_rings1; + + error = netmap_mem_rings_create(ona); + if (error) + goto del_krings2; + + for (i = 0; i < ona->num_tx_rings + 1; i++) + ona->tx_rings[i].save_ring = ona->tx_rings[i].ring; + for (i = 0; i < ona->num_rx_rings + 1; i++) + ona->rx_rings[i].save_ring = ona->rx_rings[i].ring; + + /* cross link the krings */ + for (i = 0; i < na->num_tx_rings; i++) { + na->tx_rings[i].pipe = pna->peer->up.rx_rings + i; + na->rx_rings[i].pipe = pna->peer->up.tx_rings + i; + pna->peer->up.tx_rings[i].pipe = na->rx_rings + i; + pna->peer->up.rx_rings[i].pipe = na->tx_rings + i; + } + } else { + int i; + /* case 2) above */ + /* recover the hidden rings */ + ND("%p: case 2, hidden rings", na); + for (i = 0; i < na->num_tx_rings + 1; i++) + na->tx_rings[i].ring = na->tx_rings[i].save_ring; + for (i = 0; i < na->num_rx_rings + 1; i++) + na->rx_rings[i].ring = na->rx_rings[i].save_ring; + } + return 0; + +del_krings2: + netmap_krings_delete(ona); +del_rings1: + netmap_mem_rings_delete(na); +del_krings1: + netmap_krings_delete(na); +err: + return error; +} + +/* netmap_pipe_reg. + * + * There are two cases on registration (onoff==1) + * + * 1.a) state is + * + * usr1 --> e1 --> e2 + * + * and we are e1. Nothing special to do. + * + * 1.b) state is + * + * usr1 --> e1 --> e2 <-- usr2 + * + * and we are e2. Drop the ref e1 is holding. + * + * There are two additional cases on unregister (onoff==0) + * + * 2.a) state is + * + * usr1 --> e1 --> e2 + * + * and we are e1. Nothing special to do, e2 will + * be cleaned up by the destructor of e1. + * + * 2.b) state is + * + * usr1 --> e1 e2 <-- usr2 + * + * and we are either e1 or e2. Add a ref from the + * other end and hide our rings. 
+ */ +static int +netmap_pipe_reg(struct netmap_adapter *na, int onoff) +{ + struct netmap_pipe_adapter *pna = + (struct netmap_pipe_adapter *)na; + struct ifnet *ifp = na->ifp; + ND("%p: onoff %d", na, onoff); + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + } else { + ifp->if_capenable &= ~IFCAP_NETMAP; + } + if (pna->peer_ref) { + ND("%p: case 1.a or 2.a, nothing to do", na); + return 0; + } + if (onoff) { + ND("%p: case 1.b, drop peer", na); + pna->peer->peer_ref = 0; + netmap_adapter_put(na); + } else { + int i; + ND("%p: case 2.b, grab peer", na); + netmap_adapter_get(na); + pna->peer->peer_ref = 1; + /* hide our rings from netmap_mem_rings_delete */ + for (i = 0; i < na->num_tx_rings + 1; i++) { + na->tx_rings[i].ring = NULL; + } + for (i = 0; i < na->num_rx_rings + 1; i++) { + na->rx_rings[i].ring = NULL; + } + } + return 0; +} + +/* netmap_pipe_krings_delete. + * + * There are two cases: + * + * 1) state is + * + * usr1 --> e1 --> e2 + * + * and we are e1 (e2 is not registered, so krings_delete cannot be + * called on it); + * + * 2) state is + * + * usr1 --> e1 e2 <-- usr2 + * + * and we are either e1 or e2. + * + * In the former case we have to also delete the krings of e2; + * in the latter case we do nothing (note that our krings + * have already been hidden in the unregister callback). + */ +static void +netmap_pipe_krings_delete(struct netmap_adapter *na) +{ + struct netmap_pipe_adapter *pna = + (struct netmap_pipe_adapter *)na; + struct netmap_adapter *ona; /* na of the other end */ + int i; + + if (!pna->peer_ref) { + ND("%p: case 2, kept alive by peer", na); + return; + } + /* case 1) above */ + ND("%p: case 1, deleting everyhing", na); + netmap_krings_delete(na); /* also zeroes tx_rings etc. */ + /* restore the ring to be deleted on the peer */ + ona = &pna->peer->up; + if (ona->tx_rings == NULL) { + /* already deleted, we must be on an + * cleanup-after-error path */ + return; + } + for (i = 0; i < ona->num_tx_rings + 1; i++) + ona->tx_rings[i].ring = ona->tx_rings[i].save_ring; + for (i = 0; i < ona->num_rx_rings + 1; i++) + ona->rx_rings[i].ring = ona->rx_rings[i].save_ring; + netmap_mem_rings_delete(ona); + netmap_krings_delete(ona); +} + + +static void +netmap_pipe_dtor(struct netmap_adapter *na) +{ + struct netmap_pipe_adapter *pna = + (struct netmap_pipe_adapter *)na; + ND("%p", na); + if (pna->peer_ref) { + ND("%p: clean up peer", na); + pna->peer_ref = 0; + netmap_adapter_put(&pna->peer->up); + } + if (pna->role == NR_REG_PIPE_MASTER) + netmap_pipe_remove(pna->parent, pna); + netmap_adapter_put(pna->parent); + free(na->ifp, M_DEVBUF); + na->ifp = NULL; + pna->parent = NULL; +} + +int +netmap_get_pipe_na(struct nmreq *nmr, struct netmap_adapter **na, int create) +{ + struct nmreq pnmr; + struct netmap_adapter *pna; /* parent adapter */ + struct netmap_pipe_adapter *mna, *sna, *req; + struct ifnet *ifp, *ifp2; + u_int pipe_id; + int role = nmr->nr_flags & NR_REG_MASK; + int error; + + ND("flags %x", nmr->nr_flags); + + if (role != NR_REG_PIPE_MASTER && role != NR_REG_PIPE_SLAVE) { + ND("not a pipe"); + return 0; + } + role = nmr->nr_flags & NR_REG_MASK; + + /* first, try to find the parent adapter */ + bzero(&pnmr, sizeof(pnmr)); + memcpy(&pnmr.nr_name, nmr->nr_name, IFNAMSIZ); + /* pass to parent the requested number of pipes */ + pnmr.nr_arg1 = nmr->nr_arg1; + error = netmap_get_na(&pnmr, &pna, create); + if (error) { + ND("parent lookup failed: %d", error); + return error; + } + ND("found parent: %s", NM_IFPNAME(pna->ifp)); + + if 
(NETMAP_OWNED_BY_KERN(pna)) { + ND("parent busy"); + error = EBUSY; + goto put_out; + } + + /* next, lookup the pipe id in the parent list */ + req = NULL; + pipe_id = nmr->nr_ringid & NETMAP_RING_MASK; + mna = netmap_pipe_find(pna, pipe_id); + if (mna) { + if (mna->role == role) { + ND("found %d directly at %d", pipe_id, mna->parent_slot); + req = mna; + } else { + ND("found %d indirectly at %d", pipe_id, mna->parent_slot); + req = mna->peer; + } + /* the pipe we have found already holds a ref to the parent, + * so we need to drop the one we got from netmap_get_na() + */ + netmap_adapter_put(pna); + goto found; + } + ND("pipe %d not found, create %d", pipe_id, create); + if (!create) { + error = ENODEV; + goto put_out; + } + /* we create both master and slave. + * The endpoint we were asked for holds a reference to + * the other one. + */ + ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!ifp) { + error = ENOMEM; + goto put_out; + } + strcpy(ifp->if_xname, NM_IFPNAME(pna->ifp)); + + mna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (mna == NULL) { + error = ENOMEM; + goto free_ifp; + } + mna->up.ifp = ifp; + + mna->id = pipe_id; + mna->role = NR_REG_PIPE_MASTER; + mna->parent = pna; + + mna->up.nm_txsync = netmap_pipe_txsync; + mna->up.nm_rxsync = netmap_pipe_rxsync; + mna->up.nm_register = netmap_pipe_reg; + mna->up.nm_dtor = netmap_pipe_dtor; + mna->up.nm_krings_create = netmap_pipe_krings_create; + mna->up.nm_krings_delete = netmap_pipe_krings_delete; + mna->up.nm_mem = pna->nm_mem; + mna->up.na_lut = pna->na_lut; + mna->up.na_lut_objtotal = pna->na_lut_objtotal; + + mna->up.num_tx_rings = 1; + mna->up.num_rx_rings = 1; + mna->up.num_tx_desc = nmr->nr_tx_slots; + nm_bound_var(&mna->up.num_tx_desc, pna->num_tx_desc, + 1, NM_PIPE_MAXSLOTS, NULL); + mna->up.num_rx_desc = nmr->nr_rx_slots; + nm_bound_var(&mna->up.num_rx_desc, pna->num_rx_desc, + 1, NM_PIPE_MAXSLOTS, NULL); + error = netmap_attach_common(&mna->up); + if (error) + goto free_ifp; + /* register the master with the parent */ + error = netmap_pipe_add(pna, mna); + if (error) + goto free_mna; + + /* create the slave */ + ifp2 = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!ifp) { + error = ENOMEM; + goto free_mna; + } + strcpy(ifp2->if_xname, NM_IFPNAME(pna->ifp)); + + sna = malloc(sizeof(*mna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (sna == NULL) { + error = ENOMEM; + goto free_ifp2; + } + /* most fields are the same, copy from master and then fix */ + *sna = *mna; + sna->up.ifp = ifp2; + sna->role = NR_REG_PIPE_SLAVE; + error = netmap_attach_common(&sna->up); + if (error) + goto free_sna; + + /* join the two endpoints */ + mna->peer = sna; + sna->peer = mna; + + /* we already have a reference to the parent, but we + * need another one for the other endpoint we created + */ + netmap_adapter_get(pna); + + if (role == NR_REG_PIPE_MASTER) { + req = mna; + mna->peer_ref = 1; + netmap_adapter_get(&sna->up); + } else { + req = sna; + sna->peer_ref = 1; + netmap_adapter_get(&mna->up); + } + ND("created master %p and slave %p", mna, sna); +found: + + ND("pipe %d %s at %p", pipe_id, + (req->role == NR_REG_PIPE_MASTER ? "master" : "slave"), req); + *na = &req->up; + netmap_adapter_get(*na); + + /* write the configuration back */ + nmr->nr_tx_rings = req->up.num_tx_rings; + nmr->nr_rx_rings = req->up.num_rx_rings; + nmr->nr_tx_slots = req->up.num_tx_desc; + nmr->nr_rx_slots = req->up.num_rx_desc; + + /* keep the reference to the parent. 
+ * It will be released by the req destructor + */ + + return 0; + +free_sna: + free(sna, M_DEVBUF); +free_ifp2: + free(ifp2, M_DEVBUF); +free_mna: + free(mna, M_DEVBUF); +free_ifp: + free(ifp, M_DEVBUF); +put_out: + netmap_adapter_put(pna); + return error; +} + + +#endif /* WITH_PIPES */ diff --git a/sys/dev/netmap/netmap_vale.c b/sys/dev/netmap/netmap_vale.c new file mode 100644 index 000000000..34e39126e --- /dev/null +++ b/sys/dev/netmap/netmap_vale.c @@ -0,0 +1,2103 @@ +/* + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +/* + * This module implements the VALE switch for netmap + +--- VALE SWITCH --- + +NMG_LOCK() serializes all modifications to switches and ports. +A switch cannot be deleted until all ports are gone. + +For each switch, an SX lock (RWlock on linux) protects +deletion of ports. When configuring or deleting a new port, the +lock is acquired in exclusive mode (after holding NMG_LOCK). +When forwarding, the lock is acquired in shared mode (without NMG_LOCK). +The lock is held throughout the entire forwarding cycle, +during which the thread may incur in a page fault. +Hence it is important that sleepable shared locks are used. + +On the rx ring, the per-port lock is grabbed initially to reserve +a number of slot in the ring, then the lock is released, +packets are copied from source to destination, and then +the lock is acquired again and the receive ring is updated. +(A similar thing is done on the tx ring for NIC and host stack +ports attached to the switch) + + */ + +/* + * OS-specific code that is used only within this file. 
+ * Other OS-specific code that must be accessed by drivers + * is present in netmap_kern.h + */ + +#if defined(__FreeBSD__) +#include /* prerequisite */ +__FBSDID("$FreeBSD$"); + +#include +#include +#include /* defines used in kernel.h */ +#include /* types used in module initialization */ +#include /* cdevsw struct, UID, GID */ +#include +#include /* struct socket */ +#include +#include +#include +#include /* sockaddrs */ +#include +#include +#include +#include +#include /* BIOCIMMEDIATE */ +#include /* bus_dmamap_* */ +#include +#include + + +#define BDG_RWLOCK_T struct rwlock // struct rwlock + +#define BDG_RWINIT(b) \ + rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS) +#define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock) +#define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock) +#define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock) +#define BDG_RTRYLOCK(b) rw_try_rlock(&(b)->bdg_lock) +#define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock) +#define BDG_RWDESTROY(b) rw_destroy(&(b)->bdg_lock) + + +#elif defined(linux) + +#include "bsd_glue.h" + +#elif defined(__APPLE__) + +#warning OSX support is only partial +#include "osx_glue.h" + +#else + +#error Unsupported platform + +#endif /* unsupported */ + +/* + * common headers + */ + +#include +#include +#include + +#ifdef WITH_VALE + +/* + * system parameters (most of them in netmap_kern.h) + * NM_NAME prefix for switch port names, default "vale" + * NM_BDG_MAXPORTS number of ports + * NM_BRIDGES max number of switches in the system. + * XXX should become a sysctl or tunable + * + * Switch ports are named valeX:Y where X is the switch name and Y + * is the port. If Y matches a physical interface name, the port is + * connected to a physical device. + * + * Unlike physical interfaces, switch ports use their own memory region + * for rings and buffers. + * The virtual interfaces use per-queue lock instead of core lock. + * In the tx loop, we aggregate traffic in batches to make all operations + * faster. The batch size is bridge_batch. + */ +#define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */ +#define NM_BDG_MAXSLOTS 4096 /* XXX same as above */ +#define NM_BRIDGE_RINGSIZE 1024 /* in the device */ +#define NM_BDG_HASH 1024 /* forwarding table entries */ +#define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */ +#define NM_MULTISEG 64 /* max size of a chain of bufs */ +/* actual size of the tables */ +#define NM_BDG_BATCH_MAX (NM_BDG_BATCH + NM_MULTISEG) +/* NM_FT_NULL terminates a list of slots in the ft */ +#define NM_FT_NULL NM_BDG_BATCH_MAX +#define NM_BRIDGES 8 /* number of bridges */ + + +/* + * bridge_batch is set via sysctl to the max batch size to be + * used in the bridge. The actual value may be larger as the + * last packet in the block may overflow the size. + */ +int bridge_batch = NM_BDG_BATCH; /* bridge batch size */ +SYSCTL_DECL(_dev_netmap); +SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , ""); + + +static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp); +static int bdg_netmap_reg(struct netmap_adapter *na, int onoff); +static int netmap_bwrap_attach(struct ifnet *, struct ifnet *); +static int netmap_bwrap_register(struct netmap_adapter *, int onoff); +int kern_netmap_regif(struct nmreq *nmr); + +/* + * For each output interface, nm_bdg_q is used to construct a list. + * bq_len is the number of output buffers (we can have coalescing + * during the copy). 
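+ * As an illustration, an output queue q built this way is drained by
+ * walking the ft[] array through the ft_next links, roughly as
+ * nm_bdg_flush() does in its second pass:
+ *
+ *	for (i = q->bq_head; i != NM_FT_NULL; i = ft[i].ft_next)
+ *		copy_to_dst_ring(ft[i].ft_buf, ft[i].ft_len);
+ *
+ * where copy_to_dst_ring() is a placeholder for the pkt_copy()/copyin()
+ * step performed on the destination ring; NM_FT_NULL terminates the list.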
+ */ +struct nm_bdg_q { + uint16_t bq_head; + uint16_t bq_tail; + uint32_t bq_len; /* number of buffers */ +}; + +/* XXX revise this */ +struct nm_hash_ent { + uint64_t mac; /* the top 2 bytes are the epoch */ + uint64_t ports; +}; + +/* + * nm_bridge is a descriptor for a VALE switch. + * Interfaces for a bridge are all in bdg_ports[]. + * The array has fixed size, an empty entry does not terminate + * the search, but lookups only occur on attach/detach so we + * don't mind if they are slow. + * + * The bridge is non blocking on the transmit ports: excess + * packets are dropped if there is no room on the output port. + * + * bdg_lock protects accesses to the bdg_ports array. + * This is a rw lock (or equivalent). + */ +struct nm_bridge { + /* XXX what is the proper alignment/layout ? */ + BDG_RWLOCK_T bdg_lock; /* protects bdg_ports */ + int bdg_namelen; + uint32_t bdg_active_ports; /* 0 means free */ + char bdg_basename[IFNAMSIZ]; + + /* Indexes of active ports (up to active_ports) + * and all other remaining ports. + */ + uint8_t bdg_port_index[NM_BDG_MAXPORTS]; + + struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS]; + + + /* + * The function to decide the destination port. + * It returns either of an index of the destination port, + * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to + * forward this packet. ring_nr is the source ring index, and the + * function may overwrite this value to forward this packet to a + * different ring index. + * This function must be set by netmap_bdgctl(). + */ + bdg_lookup_fn_t nm_bdg_lookup; + + /* the forwarding table, MAC+ports. + * XXX should be changed to an argument to be passed to + * the lookup function, and allocated on attach + */ + struct nm_hash_ent ht[NM_BDG_HASH]; +}; + + +/* + * XXX in principle nm_bridges could be created dynamically + * Right now we have a static array and deletions are protected + * by an exclusive lock. + */ +struct nm_bridge nm_bridges[NM_BRIDGES]; + + +/* + * this is a slightly optimized copy routine which rounds + * to multiple of 64 bytes and is often faster than dealing + * with other odd sizes. We assume there is enough room + * in the source and destination buffers. + * + * XXX only for multiples of 64 bytes, non overlapped. + */ +static inline void +pkt_copy(void *_src, void *_dst, int l) +{ + uint64_t *src = _src; + uint64_t *dst = _dst; + if (unlikely(l >= 1024)) { + memcpy(dst, src, l); + return; + } + for (; likely(l > 0); l-=64) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + } +} + + +/* + * locate a bridge among the existing ones. + * MUST BE CALLED WITH NMG_LOCK() + * + * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME. + * We assume that this is called with a name of at least NM_NAME chars. + */ +static struct nm_bridge * +nm_find_bridge(const char *name, int create) +{ + int i, l, namelen; + struct nm_bridge *b = NULL; + + NMG_LOCK_ASSERT(); + + namelen = strlen(NM_NAME); /* base length */ + l = name ? strlen(name) : 0; /* actual length */ + if (l < namelen) { + D("invalid bridge name %s", name ? 
name : NULL); + return NULL; + } + for (i = namelen + 1; i < l; i++) { + if (name[i] == ':') { + namelen = i; + break; + } + } + if (namelen >= IFNAMSIZ) + namelen = IFNAMSIZ; + ND("--- prefix is '%.*s' ---", namelen, name); + + /* lookup the name, remember empty slot if there is one */ + for (i = 0; i < NM_BRIDGES; i++) { + struct nm_bridge *x = nm_bridges + i; + + if (x->bdg_active_ports == 0) { + if (create && b == NULL) + b = x; /* record empty slot */ + } else if (x->bdg_namelen != namelen) { + continue; + } else if (strncmp(name, x->bdg_basename, namelen) == 0) { + ND("found '%.*s' at %d", namelen, name, i); + b = x; + break; + } + } + if (i == NM_BRIDGES && b) { /* name not found, can create entry */ + /* initialize the bridge */ + strncpy(b->bdg_basename, name, namelen); + ND("create new bridge %s with ports %d", b->bdg_basename, + b->bdg_active_ports); + b->bdg_namelen = namelen; + b->bdg_active_ports = 0; + for (i = 0; i < NM_BDG_MAXPORTS; i++) + b->bdg_port_index[i] = i; + /* set the default function */ + b->nm_bdg_lookup = netmap_bdg_learning; + /* reset the MAC address table */ + bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH); + } + return b; +} + + +/* + * Free the forwarding tables for rings attached to switch ports. + */ +static void +nm_free_bdgfwd(struct netmap_adapter *na) +{ + int nrings, i; + struct netmap_kring *kring; + + NMG_LOCK_ASSERT(); + nrings = na->num_tx_rings; + kring = na->tx_rings; + for (i = 0; i < nrings; i++) { + if (kring[i].nkr_ft) { + free(kring[i].nkr_ft, M_DEVBUF); + kring[i].nkr_ft = NULL; /* protect from freeing twice */ + } + } +} + + +/* + * Allocate the forwarding tables for the rings attached to the bridge ports. + */ +static int +nm_alloc_bdgfwd(struct netmap_adapter *na) +{ + int nrings, l, i, num_dstq; + struct netmap_kring *kring; + + NMG_LOCK_ASSERT(); + /* all port:rings + broadcast */ + num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1; + l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX; + l += sizeof(struct nm_bdg_q) * num_dstq; + l += sizeof(uint16_t) * NM_BDG_BATCH_MAX; + + nrings = netmap_real_tx_rings(na); + kring = na->tx_rings; + for (i = 0; i < nrings; i++) { + struct nm_bdg_fwd *ft; + struct nm_bdg_q *dstq; + int j; + + ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO); + if (!ft) { + nm_free_bdgfwd(na); + return ENOMEM; + } + dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); + for (j = 0; j < num_dstq; j++) { + dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL; + dstq[j].bq_len = 0; + } + kring[i].nkr_ft = ft; + } + return 0; +} + + +static void +netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw) +{ + int s_hw = hw, s_sw = sw; + int i, lim =b->bdg_active_ports; + uint8_t tmp[NM_BDG_MAXPORTS]; + + /* + New algorithm: + make a copy of bdg_port_index; + lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port + in the array of bdg_port_index, replacing them with + entries from the bottom of the array; + decrement bdg_active_ports; + acquire BDG_WLOCK() and copy back the array. + */ + + if (netmap_verbose) + D("detach %d and %d (lim %d)", hw, sw, lim); + /* make a copy of the list of active ports, update it, + * and then copy back within BDG_WLOCK(). 
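+ * For example: with bdg_port_index = [A B C D] and bdg_active_ports = 4,
+ * detaching B swaps it with the last active entry, giving [A D C B] and
+ * bdg_active_ports = 3; B's index now sits in the inactive tail of the
+ * array and can be reused by a later attach.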
+ */ + memcpy(tmp, b->bdg_port_index, sizeof(tmp)); + for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) { + if (hw >= 0 && tmp[i] == hw) { + ND("detach hw %d at %d", hw, i); + lim--; /* point to last active port */ + tmp[i] = tmp[lim]; /* swap with i */ + tmp[lim] = hw; /* now this is inactive */ + hw = -1; + } else if (sw >= 0 && tmp[i] == sw) { + ND("detach sw %d at %d", sw, i); + lim--; + tmp[i] = tmp[lim]; + tmp[lim] = sw; + sw = -1; + } else { + i++; + } + } + if (hw >= 0 || sw >= 0) { + D("XXX delete failed hw %d sw %d, should panic...", hw, sw); + } + + BDG_WLOCK(b); + b->bdg_ports[s_hw] = NULL; + if (s_sw >= 0) { + b->bdg_ports[s_sw] = NULL; + } + memcpy(b->bdg_port_index, tmp, sizeof(tmp)); + b->bdg_active_ports = lim; + BDG_WUNLOCK(b); + + ND("now %d active ports", lim); + if (lim == 0) { + ND("marking bridge %s as free", b->bdg_basename); + b->nm_bdg_lookup = NULL; + } +} + + +static void +netmap_adapter_vp_dtor(struct netmap_adapter *na) +{ + struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; + struct nm_bridge *b = vpna->na_bdg; + struct ifnet *ifp = na->ifp; + + ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount); + + if (b) { + netmap_bdg_detach_common(b, vpna->bdg_port, -1); + } + + bzero(ifp, sizeof(*ifp)); + free(ifp, M_DEVBUF); + na->ifp = NULL; +} + + +/* Try to get a reference to a netmap adapter attached to a VALE switch. + * If the adapter is found (or is created), this function returns 0, a + * non NULL pointer is returned into *na, and the caller holds a + * reference to the adapter. + * If an adapter is not found, then no reference is grabbed and the + * function returns an error code, or 0 if there is just a VALE prefix + * mismatch. Therefore the caller holds a reference when + * (*na != NULL && return == 0). + */ +int +netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create) +{ + const char *name = nmr->nr_name; + struct ifnet *ifp; + int error = 0; + struct netmap_adapter *ret; + struct netmap_vp_adapter *vpna; + struct nm_bridge *b; + int i, j, cand = -1, cand2 = -1; + int needed; + + *na = NULL; /* default return value */ + + /* first try to see if this is a bridge port. */ + NMG_LOCK_ASSERT(); + if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) { + return 0; /* no error, but no VALE prefix */ + } + + b = nm_find_bridge(name, create); + if (b == NULL) { + D("no bridges available for '%s'", name); + return (create ? ENOMEM : ENXIO); + } + + /* Now we are sure that name starts with the bridge's name, + * lookup the port in the bridge. We need to scan the entire + * list. It is not important to hold a WLOCK on the bridge + * during the search because NMG_LOCK already guarantees + * that there are no other possible writers. + */ + + /* lookup in the local list of ports */ + for (j = 0; j < b->bdg_active_ports; j++) { + i = b->bdg_port_index[j]; + vpna = b->bdg_ports[i]; + // KASSERT(na != NULL); + ifp = vpna->up.ifp; + /* XXX make sure the name only contains one : */ + if (!strcmp(NM_IFPNAME(ifp), name)) { + netmap_adapter_get(&vpna->up); + ND("found existing if %s refs %d", name, + vpna->na_bdg_refcount); + *na = (struct netmap_adapter *)vpna; + return 0; + } + } + /* not found, should we create it? 
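+ * (For illustration: a name such as "vale0:p1" takes the virtual port
+ * branch below, while "vale0:em0" matches an existing NIC through
+ * ifunit_ref() and is attached via a bwrap adapter instead.)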
*/ + if (!create) + return ENXIO; + /* yes we should, see if we have space to attach entries */ + needed = 2; /* in some cases we only need 1 */ + if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) { + D("bridge full %d, cannot create new port", b->bdg_active_ports); + return ENOMEM; + } + /* record the next two ports available, but do not allocate yet */ + cand = b->bdg_port_index[b->bdg_active_ports]; + cand2 = b->bdg_port_index[b->bdg_active_ports + 1]; + ND("+++ bridge %s port %s used %d avail %d %d", + b->bdg_basename, name, b->bdg_active_ports, cand, cand2); + + /* + * try see if there is a matching NIC with this name + * (after the bridge's name) + */ + ifp = ifunit_ref(name + b->bdg_namelen + 1); + if (!ifp) { /* this is a virtual port */ + if (nmr->nr_cmd) { + /* nr_cmd must be 0 for a virtual port */ + return EINVAL; + } + + /* create a struct ifnet for the new port. + * need M_NOWAIT as we are under nma_lock + */ + ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!ifp) + return ENOMEM; + + strcpy(ifp->if_xname, name); + /* bdg_netmap_attach creates a struct netmap_adapter */ + error = bdg_netmap_attach(nmr, ifp); + if (error) { + D("error %d", error); + free(ifp, M_DEVBUF); + return error; + } + ret = NA(ifp); + cand2 = -1; /* only need one port */ + } else { /* this is a NIC */ + struct ifnet *fake_ifp; + + error = netmap_get_hw_na(ifp, &ret); + if (error || ret == NULL) + goto out; + + /* make sure the NIC is not already in use */ + if (NETMAP_OWNED_BY_ANY(ret)) { + D("NIC %s busy, cannot attach to bridge", + NM_IFPNAME(ifp)); + error = EBUSY; + goto out; + } + /* create a fake interface */ + fake_ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!fake_ifp) { + error = ENOMEM; + goto out; + } + strcpy(fake_ifp->if_xname, name); + error = netmap_bwrap_attach(fake_ifp, ifp); + if (error) { + free(fake_ifp, M_DEVBUF); + goto out; + } + ret = NA(fake_ifp); + if (nmr->nr_arg1 != NETMAP_BDG_HOST) + cand2 = -1; /* only need one port */ + if_rele(ifp); + } + vpna = (struct netmap_vp_adapter *)ret; + + BDG_WLOCK(b); + vpna->bdg_port = cand; + ND("NIC %p to bridge port %d", vpna, cand); + /* bind the port to the bridge (virtual ports are not active) */ + b->bdg_ports[cand] = vpna; + vpna->na_bdg = b; + b->bdg_active_ports++; + if (cand2 >= 0) { + struct netmap_vp_adapter *hostna = vpna + 1; + /* also bind the host stack to the bridge */ + b->bdg_ports[cand2] = hostna; + hostna->bdg_port = cand2; + hostna->na_bdg = b; + b->bdg_active_ports++; + ND("host %p to bridge port %d", hostna, cand2); + } + ND("if %s refs %d", name, vpna->up.na_refcount); + BDG_WUNLOCK(b); + *na = ret; + netmap_adapter_get(ret); + return 0; + +out: + if_rele(ifp); + + return error; +} + + +/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */ +static int +nm_bdg_attach(struct nmreq *nmr) +{ + struct netmap_adapter *na; + struct netmap_if *nifp; + struct netmap_priv_d *npriv; + struct netmap_bwrap_adapter *bna; + int error; + + npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO); + if (npriv == NULL) + return ENOMEM; + + NMG_LOCK(); + + error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */); + if (error) /* no device, or another bridge or user owns the device */ + goto unlock_exit; + + if (na == NULL) { /* VALE prefix missing */ + error = EINVAL; + goto unlock_exit; + } + + if (na->active_fds > 0) { /* already registered */ + error = EBUSY; + goto unref_exit; + } + + nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error); + if (!nifp) { + goto 
unref_exit; + } + + bna = (struct netmap_bwrap_adapter*)na; + bna->na_kpriv = npriv; + NMG_UNLOCK(); + ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp)); + return 0; + +unref_exit: + netmap_adapter_put(na); +unlock_exit: + NMG_UNLOCK(); + bzero(npriv, sizeof(*npriv)); + free(npriv, M_DEVBUF); + return error; +} + + +static int +nm_bdg_detach(struct nmreq *nmr) +{ + struct netmap_adapter *na; + int error; + struct netmap_bwrap_adapter *bna; + int last_instance; + + NMG_LOCK(); + error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */); + if (error) { /* no device, or another bridge or user owns the device */ + goto unlock_exit; + } + + if (na == NULL) { /* VALE prefix missing */ + error = EINVAL; + goto unlock_exit; + } + + bna = (struct netmap_bwrap_adapter *)na; + + if (na->active_fds == 0) { /* not registered */ + error = EINVAL; + goto unref_exit; + } + + last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */ + if (!last_instance) { + D("--- error, trying to detach an entry with active mmaps"); + error = EINVAL; + } else { + struct netmap_priv_d *npriv = bna->na_kpriv; + + bna->na_kpriv = NULL; + D("deleting priv"); + + bzero(npriv, sizeof(*npriv)); + free(npriv, M_DEVBUF); + } + +unref_exit: + netmap_adapter_put(na); +unlock_exit: + NMG_UNLOCK(); + return error; + +} + + +/* exported to kernel callers, e.g. OVS ? + * Entry point. + * Called without NMG_LOCK. + */ +int +netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func) +{ + struct nm_bridge *b; + struct netmap_adapter *na; + struct netmap_vp_adapter *vpna; + struct ifnet *iter; + char *name = nmr->nr_name; + int cmd = nmr->nr_cmd, namelen = strlen(name); + int error = 0, i, j; + + switch (cmd) { + case NETMAP_BDG_ATTACH: + error = nm_bdg_attach(nmr); + break; + + case NETMAP_BDG_DETACH: + error = nm_bdg_detach(nmr); + break; + + case NETMAP_BDG_LIST: + /* this is used to enumerate bridges and ports */ + if (namelen) { /* look up indexes of bridge and port */ + if (strncmp(name, NM_NAME, strlen(NM_NAME))) { + error = EINVAL; + break; + } + NMG_LOCK(); + b = nm_find_bridge(name, 0 /* don't create */); + if (!b) { + error = ENOENT; + NMG_UNLOCK(); + break; + } + + error = ENOENT; + for (j = 0; j < b->bdg_active_ports; j++) { + i = b->bdg_port_index[j]; + vpna = b->bdg_ports[i]; + if (vpna == NULL) { + D("---AAAAAAAAARGH-------"); + continue; + } + iter = vpna->up.ifp; + /* the former and the latter identify a + * virtual port and a NIC, respectively + */ + if (!strcmp(iter->if_xname, name)) { + /* bridge index */ + nmr->nr_arg1 = b - nm_bridges; + nmr->nr_arg2 = i; /* port index */ + error = 0; + break; + } + } + NMG_UNLOCK(); + } else { + /* return the first non-empty entry starting from + * bridge nr_arg1 and port nr_arg2. + * + * Users can detect the end of the same bridge by + * seeing the new and old value of nr_arg1, and can + * detect the end of all the bridge by error != 0 + */ + i = nmr->nr_arg1; + j = nmr->nr_arg2; + + NMG_LOCK(); + for (error = ENOENT; i < NM_BRIDGES; i++) { + b = nm_bridges + i; + if (j >= b->bdg_active_ports) { + j = 0; /* following bridges scan from 0 */ + continue; + } + nmr->nr_arg1 = i; + nmr->nr_arg2 = j; + j = b->bdg_port_index[j]; + vpna = b->bdg_ports[j]; + iter = vpna->up.ifp; + strncpy(name, iter->if_xname, (size_t)IFNAMSIZ); + error = 0; + break; + } + NMG_UNLOCK(); + } + break; + + case NETMAP_BDG_LOOKUP_REG: + /* register a lookup function to the given bridge. + * nmr->nr_name may be just bridge's name (including ':' + * if it is not just NM_NAME). 
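+ * Sketch of a kernel caller (illustration only; my_lookup is a
+ * hypothetical function, it only has to match bdg_lookup_fn_t;
+ * this one floods every packet):
+ *
+ *	static u_int
+ *	my_lookup(char *buf, u_int len, uint8_t *dst_ring,
+ *	    struct netmap_vp_adapter *na)
+ *	{
+ *		return NM_BDG_BROADCAST;
+ *	}
+ *
+ *	struct nmreq nmr;
+ *	bzero(&nmr, sizeof(nmr));
+ *	strcpy(nmr.nr_name, "vale0:");
+ *	nmr.nr_cmd = NETMAP_BDG_LOOKUP_REG;
+ *	netmap_bdg_ctl(&nmr, my_lookup);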
+ */ + if (!func) { + error = EINVAL; + break; + } + NMG_LOCK(); + b = nm_find_bridge(name, 0 /* don't create */); + if (!b) { + error = EINVAL; + } else { + b->nm_bdg_lookup = func; + } + NMG_UNLOCK(); + break; + + case NETMAP_BDG_VNET_HDR: + /* Valid lengths for the virtio-net header are 0 (no header), + 10 and 12. */ + if (nmr->nr_arg1 != 0 && + nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) && + nmr->nr_arg1 != 12) { + error = EINVAL; + break; + } + NMG_LOCK(); + error = netmap_get_bdg_na(nmr, &na, 0); + if (na && !error) { + vpna = (struct netmap_vp_adapter *)na; + vpna->virt_hdr_len = nmr->nr_arg1; + if (vpna->virt_hdr_len) + vpna->mfs = NETMAP_BDG_BUF_SIZE(na->nm_mem); + D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna); + netmap_adapter_put(na); + } + NMG_UNLOCK(); + break; + + default: + D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd); + error = EINVAL; + break; + } + return error; +} + +static int +netmap_vp_krings_create(struct netmap_adapter *na) +{ + u_int tailroom; + int error, i; + uint32_t *leases; + u_int nrx = netmap_real_rx_rings(na); + + /* + * Leases are attached to RX rings on vale ports + */ + tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx; + + error = netmap_krings_create(na, tailroom); + if (error) + return error; + + leases = na->tailroom; + + for (i = 0; i < nrx; i++) { /* Receive rings */ + na->rx_rings[i].nkr_leases = leases; + leases += na->num_rx_desc; + } + + error = nm_alloc_bdgfwd(na); + if (error) { + netmap_krings_delete(na); + return error; + } + + return 0; +} + + +static void +netmap_vp_krings_delete(struct netmap_adapter *na) +{ + nm_free_bdgfwd(na); + netmap_krings_delete(na); +} + + +static int +nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, + struct netmap_vp_adapter *na, u_int ring_nr); + + +/* + * Grab packets from a kring, move them into the ft structure + * associated to the tx (input) port. Max one instance per port, + * filtered on input (ioctl, poll or XXX). + * Returns the next position in the ring. + */ +static int +nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr, + struct netmap_kring *kring, u_int end) +{ + struct netmap_ring *ring = kring->ring; + struct nm_bdg_fwd *ft; + u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1; + u_int ft_i = 0; /* start from 0 */ + u_int frags = 1; /* how many frags ? */ + struct nm_bridge *b = na->na_bdg; + + /* To protect against modifications to the bridge we acquire a + * shared lock, waiting if we can sleep (if the source port is + * attached to a user process) or with a trylock otherwise (NICs). + */ + ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j); + if (na->up.na_flags & NAF_BDG_MAYSLEEP) + BDG_RLOCK(b); + else if (!BDG_RTRYLOCK(b)) + return 0; + ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j); + ft = kring->nkr_ft; + + for (; likely(j != end); j = nm_next(j, lim)) { + struct netmap_slot *slot = &ring->slot[j]; + char *buf; + + ft[ft_i].ft_len = slot->len; + ft[ft_i].ft_flags = slot->flags; + + ND("flags is 0x%x", slot->flags); + /* this slot goes into a list so initialize the link field */ + ft[ft_i].ft_next = NM_FT_NULL; + buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ? 
+ (void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot); + __builtin_prefetch(buf); + ++ft_i; + if (slot->flags & NS_MOREFRAG) { + frags++; + continue; + } + if (unlikely(netmap_verbose && frags > 1)) + RD(5, "%d frags at %d", frags, ft_i - frags); + ft[ft_i - frags].ft_frags = frags; + frags = 1; + if (unlikely((int)ft_i >= bridge_batch)) + ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); + } + if (frags > 1) { + D("truncate incomplete fragment at %d (%d frags)", ft_i, frags); + // ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG + ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG; + ft[ft_i - frags].ft_frags = frags - 1; + } + if (ft_i) + ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr); + BDG_RUNLOCK(b); + return j; +} + + +/* ----- FreeBSD if_bridge hash function ------- */ + +/* + * The following hash function is adapted from "Hash Functions" by Bob Jenkins + * ("Algorithm Alley", Dr. Dobbs Journal, September 1997). + * + * http://www.burtleburtle.net/bob/hash/spooky.html + */ +#define mix(a, b, c) \ +do { \ + a -= b; a -= c; a ^= (c >> 13); \ + b -= c; b -= a; b ^= (a << 8); \ + c -= a; c -= b; c ^= (b >> 13); \ + a -= b; a -= c; a ^= (c >> 12); \ + b -= c; b -= a; b ^= (a << 16); \ + c -= a; c -= b; c ^= (b >> 5); \ + a -= b; a -= c; a ^= (c >> 3); \ + b -= c; b -= a; b ^= (a << 10); \ + c -= a; c -= b; c ^= (b >> 15); \ +} while (/*CONSTCOND*/0) + + +static __inline uint32_t +nm_bridge_rthash(const uint8_t *addr) +{ + uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hask key + + b += addr[5] << 8; + b += addr[4]; + a += addr[3] << 24; + a += addr[2] << 16; + a += addr[1] << 8; + a += addr[0]; + + mix(a, b, c); +#define BRIDGE_RTHASH_MASK (NM_BDG_HASH-1) + return (c & BRIDGE_RTHASH_MASK); +} + +#undef mix + + +static int +bdg_netmap_reg(struct netmap_adapter *na, int onoff) +{ + struct netmap_vp_adapter *vpna = + (struct netmap_vp_adapter*)na; + struct ifnet *ifp = na->ifp; + + /* the interface is already attached to the bridge, + * so we only need to toggle IFCAP_NETMAP. + */ + BDG_WLOCK(vpna->na_bdg); + if (onoff) { + ifp->if_capenable |= IFCAP_NETMAP; + } else { + ifp->if_capenable &= ~IFCAP_NETMAP; + } + BDG_WUNLOCK(vpna->na_bdg); + return 0; +} + + +/* + * Lookup function for a learning bridge. + * Update the hash table with the source address, + * and then returns the destination port index, and the + * ring in *dst_ring (at the moment, always use ring 0) + */ +u_int +netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring, + struct netmap_vp_adapter *na) +{ + struct nm_hash_ent *ht = na->na_bdg->ht; + uint32_t sh, dh; + u_int dst, mysrc = na->bdg_port; + uint64_t smac, dmac; + + if (buf_len < 14) { + D("invalid buf length %d", buf_len); + return NM_BDG_NOPORT; + } + dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff; + smac = le64toh(*(uint64_t *)(buf + 4)); + smac >>= 16; + + /* + * The hash is somewhat expensive, there might be some + * worthwhile optimizations here. + */ + if ((buf[6] & 1) == 0) { /* valid src */ + uint8_t *s = buf+6; + sh = nm_bridge_rthash(s); // XXX hash of source + /* update source port forwarding entry */ + ht[sh].mac = smac; /* XXX expire ? */ + ht[sh].ports = mysrc; + if (netmap_verbose) + D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d", + s[0], s[1], s[2], s[3], s[4], s[5], mysrc); + } + dst = NM_BDG_BROADCAST; + if ((buf[0] & 1) == 0) { /* unicast */ + dh = nm_bridge_rthash(buf); // XXX hash of dst + if (ht[dh].mac == dmac) { /* found dst */ + dst = ht[dh].ports; + } + /* XXX otherwise return NM_BDG_UNKNOWN ? 
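+ * (As the code stands, an unknown unicast destination simply leaves
+ * dst at NM_BDG_BROADCAST, set above, so the frame is flooded to all
+ * ports rather than dropped.)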
*/ + } + *dst_ring = 0; + return dst; +} + + +/* + * Available space in the ring. Only used in VALE code + * and only with is_rx = 1 + */ +static inline uint32_t +nm_kr_space(struct netmap_kring *k, int is_rx) +{ + int space; + + if (is_rx) { + int busy = k->nkr_hwlease - k->nr_hwcur; + if (busy < 0) + busy += k->nkr_num_slots; + space = k->nkr_num_slots - 1 - busy; + } else { + /* XXX never used in this branch */ + space = k->nr_hwtail - k->nkr_hwlease; + if (space < 0) + space += k->nkr_num_slots; + } +#if 0 + // sanity check + if (k->nkr_hwlease >= k->nkr_num_slots || + k->nr_hwcur >= k->nkr_num_slots || + k->nr_tail >= k->nkr_num_slots || + busy < 0 || + busy >= k->nkr_num_slots) { + D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d", k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, + k->nkr_lease_idx, k->nkr_num_slots); + } +#endif + return space; +} + + + + +/* make a lease on the kring for N positions. return the + * lease index + * XXX only used in VALE code and with is_rx = 1 + */ +static inline uint32_t +nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx) +{ + uint32_t lim = k->nkr_num_slots - 1; + uint32_t lease_idx = k->nkr_lease_idx; + + k->nkr_leases[lease_idx] = NR_NOSLOT; + k->nkr_lease_idx = nm_next(lease_idx, lim); + + if (n > nm_kr_space(k, is_rx)) { + D("invalid request for %d slots", n); + panic("x"); + } + /* XXX verify that there are n slots */ + k->nkr_hwlease += n; + if (k->nkr_hwlease > lim) + k->nkr_hwlease -= lim + 1; + + if (k->nkr_hwlease >= k->nkr_num_slots || + k->nr_hwcur >= k->nkr_num_slots || + k->nr_hwtail >= k->nkr_num_slots || + k->nkr_lease_idx >= k->nkr_num_slots) { + D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d", + k->na->ifp->if_xname, + k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, + k->nkr_lease_idx, k->nkr_num_slots); + } + return lease_idx; +} + +/* + * This flush routine supports only unicast and broadcast but a large + * number of ports, and lets us replace the learn and dispatch functions. + */ +int +nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na, + u_int ring_nr) +{ + struct nm_bdg_q *dst_ents, *brddst; + uint16_t num_dsts = 0, *dsts; + struct nm_bridge *b = na->na_bdg; + u_int i, j, me = na->bdg_port; + + /* + * The work area (pointed by ft) is followed by an array of + * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS + * queues per port plus one for the broadcast traffic. + * Then we have an array of destination indexes. + */ + dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX); + dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1); + + /* first pass: find a destination for each packet in the batch */ + for (i = 0; likely(i < n); i += ft[i].ft_frags) { + uint8_t dst_ring = ring_nr; /* default, same ring as origin */ + uint16_t dst_port, d_i; + struct nm_bdg_q *d; + uint8_t *buf = ft[i].ft_buf; + u_int len = ft[i].ft_len; + + ND("slot %d frags %d", i, ft[i].ft_frags); + /* Drop the packet if the virtio-net header is not into the first + fragment nor at the very beginning of the second. 
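+ * (The two accepted layouts, for illustration: either the first
+ * fragment is [vnet_hdr | payload ...], handled by the else branch
+ * below, or the first fragment holds exactly the vnet_hdr and the
+ * payload starts with the second fragment, handled by the
+ * len == virt_hdr_len branch.)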
*/ + if (unlikely(na->virt_hdr_len > len)) + continue; + if (len == na->virt_hdr_len) { + buf = ft[i+1].ft_buf; + len = ft[i+1].ft_len; + } else { + buf += na->virt_hdr_len; + len -= na->virt_hdr_len; + } + dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na); + if (netmap_verbose > 255) + RD(5, "slot %d port %d -> %d", i, me, dst_port); + if (dst_port == NM_BDG_NOPORT) + continue; /* this packet is identified to be dropped */ + else if (unlikely(dst_port > NM_BDG_MAXPORTS)) + continue; + else if (dst_port == NM_BDG_BROADCAST) + dst_ring = 0; /* broadcasts always go to ring 0 */ + else if (unlikely(dst_port == me || + !b->bdg_ports[dst_port])) + continue; + + /* get a position in the scratch pad */ + d_i = dst_port * NM_BDG_MAXRINGS + dst_ring; + d = dst_ents + d_i; + + /* append the first fragment to the list */ + if (d->bq_head == NM_FT_NULL) { /* new destination */ + d->bq_head = d->bq_tail = i; + /* remember this position to be scanned later */ + if (dst_port != NM_BDG_BROADCAST) + dsts[num_dsts++] = d_i; + } else { + ft[d->bq_tail].ft_next = i; + d->bq_tail = i; + } + d->bq_len += ft[i].ft_frags; + } + + /* + * Broadcast traffic goes to ring 0 on all destinations. + * So we need to add these rings to the list of ports to scan. + * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is + * expensive. We should keep a compact list of active destinations + * so we could shorten this loop. + */ + brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS; + if (brddst->bq_head != NM_FT_NULL) { + for (j = 0; likely(j < b->bdg_active_ports); j++) { + uint16_t d_i; + i = b->bdg_port_index[j]; + if (unlikely(i == me)) + continue; + d_i = i * NM_BDG_MAXRINGS; + if (dst_ents[d_i].bq_head == NM_FT_NULL) + dsts[num_dsts++] = d_i; + } + } + + ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts); + /* second pass: scan destinations (XXX will be modular somehow) */ + for (i = 0; i < num_dsts; i++) { + struct ifnet *dst_ifp; + struct netmap_vp_adapter *dst_na; + struct netmap_kring *kring; + struct netmap_ring *ring; + u_int dst_nr, lim, j, d_i, next, brd_next; + u_int needed, howmany; + int retry = netmap_txsync_retry; + struct nm_bdg_q *d; + uint32_t my_start = 0, lease_idx = 0; + int nrings; + int virt_hdr_mismatch = 0; + + d_i = dsts[i]; + ND("second pass %d port %d", i, d_i); + d = dst_ents + d_i; + // XXX fix the division + dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS]; + /* protect from the lookup function returning an inactive + * destination port + */ + if (unlikely(dst_na == NULL)) + goto cleanup; + if (dst_na->up.na_flags & NAF_SW_ONLY) + goto cleanup; + dst_ifp = dst_na->up.ifp; + /* + * The interface may be in !netmap mode in two cases: + * - when na is attached but not activated yet; + * - when na is being deactivated but is still attached. + */ + if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) { + ND("not in netmap mode!"); + goto cleanup; + } + + /* there is at least one either unicast or broadcast packet */ + brd_next = brddst->bq_head; + next = d->bq_head; + /* we need to reserve this many slots. If fewer are + * available, some packets will be dropped. + * Packets may have multiple fragments, so we may not use + * there is a chance that we may not use all of the slots + * we have claimed, so we will need to handle the leftover + * ones when we regain the lock. + */ + needed = d->bq_len + brddst->bq_len; + + if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) { + /* There is a virtio-net header/offloadings mismatch between + * source and destination. 
The slower mismatch datapath will + * be used to cope with all the mismatches. + */ + virt_hdr_mismatch = 1; + if (dst_na->mfs < na->mfs) { + /* We may need to do segmentation offloadings, and so + * we may need a number of destination slots greater + * than the number of input slots ('needed'). + * We look for the smallest integer 'x' which satisfies: + * needed * na->mfs + x * H <= x * na->mfs + * where 'H' is the length of the longest header that may + * be replicated in the segmentation process (e.g. for + * TCPv4 we must account for ethernet header, IP header + * and TCPv4 header). + */ + needed = (needed * na->mfs) / + (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1; + ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed); + } + } + + ND(5, "pass 2 dst %d is %x %s", + i, d_i, is_vp ? "virtual" : "nic/host"); + dst_nr = d_i & (NM_BDG_MAXRINGS-1); + nrings = dst_na->up.num_rx_rings; + if (dst_nr >= nrings) + dst_nr = dst_nr % nrings; + kring = &dst_na->up.rx_rings[dst_nr]; + ring = kring->ring; + lim = kring->nkr_num_slots - 1; + +retry: + + if (dst_na->retry && retry) { + /* try to get some free slot from the previous run */ + dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); + } + /* reserve the buffers in the queue and an entry + * to report completion, and drop lock. + * XXX this might become a helper function. + */ + mtx_lock(&kring->q_lock); + if (kring->nkr_stopped) { + mtx_unlock(&kring->q_lock); + goto cleanup; + } + my_start = j = kring->nkr_hwlease; + howmany = nm_kr_space(kring, 1); + if (needed < howmany) + howmany = needed; + lease_idx = nm_kr_lease(kring, howmany, 1); + mtx_unlock(&kring->q_lock); + + /* only retry if we need more than available slots */ + if (retry && needed <= howmany) + retry = 0; + + /* copy to the destination queue */ + while (howmany > 0) { + struct netmap_slot *slot; + struct nm_bdg_fwd *ft_p, *ft_end; + u_int cnt; + + /* find the queue from which we pick next packet. + * NM_FT_NULL is always higher than valid indexes + * so we never dereference it if the other list + * has packets (and if both are empty we never + * get here). + */ + if (next < brd_next) { + ft_p = ft + next; + next = ft_p->ft_next; + } else { /* insert broadcast */ + ft_p = ft + brd_next; + brd_next = ft_p->ft_next; + } + cnt = ft_p->ft_frags; // cnt > 0 + if (unlikely(cnt > howmany)) + break; /* no more space */ + if (netmap_verbose && cnt > 1) + RD(5, "rx %d frags to %d", cnt, j); + ft_end = ft_p + cnt; + if (unlikely(virt_hdr_mismatch)) { + bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany); + } else { + howmany -= cnt; + do { + char *dst, *src = ft_p->ft_buf; + size_t copy_len = ft_p->ft_len, dst_len = copy_len; + + slot = &ring->slot[j]; + dst = BDG_NMB(&dst_na->up, slot); + + ND("send [%d] %d(%d) bytes at %s:%d", + i, (int)copy_len, (int)dst_len, + NM_IFPNAME(dst_ifp), j); + /* round to a multiple of 64 */ + copy_len = (copy_len + 63) & ~63; + + if (ft_p->ft_flags & NS_INDIRECT) { + if (copyin(src, dst, copy_len)) { + // invalid user pointer, pretend len is 0 + dst_len = 0; + } + } else { + //memcpy(dst, src, copy_len); + pkt_copy(src, dst, (int)copy_len); + } + slot->len = dst_len; + slot->flags = (cnt << 8)| NS_MOREFRAG; + j = nm_next(j, lim); + needed--; + ft_p++; + } while (ft_p != ft_end); + slot->flags = (cnt << 8); /* clear flag on last entry */ + } + /* are we done ? 
*/ + if (next == NM_FT_NULL && brd_next == NM_FT_NULL) + break; + } + { + /* current position */ + uint32_t *p = kring->nkr_leases; /* shorthand */ + uint32_t update_pos; + int still_locked = 1; + + mtx_lock(&kring->q_lock); + if (unlikely(howmany > 0)) { + /* not used all bufs. If i am the last one + * i can recover the slots, otherwise must + * fill them with 0 to mark empty packets. + */ + ND("leftover %d bufs", howmany); + if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) { + /* yes i am the last one */ + ND("roll back nkr_hwlease to %d", j); + kring->nkr_hwlease = j; + } else { + while (howmany-- > 0) { + ring->slot[j].len = 0; + ring->slot[j].flags = 0; + j = nm_next(j, lim); + } + } + } + p[lease_idx] = j; /* report I am done */ + + update_pos = kring->nr_hwtail; + + if (my_start == update_pos) { + /* all slots before my_start have been reported, + * so scan subsequent leases to see if other ranges + * have been completed, and to a selwakeup or txsync. + */ + while (lease_idx != kring->nkr_lease_idx && + p[lease_idx] != NR_NOSLOT) { + j = p[lease_idx]; + p[lease_idx] = NR_NOSLOT; + lease_idx = nm_next(lease_idx, lim); + } + /* j is the new 'write' position. j != my_start + * means there are new buffers to report + */ + if (likely(j != my_start)) { + kring->nr_hwtail = j; + still_locked = 0; + mtx_unlock(&kring->q_lock); + dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0); + if (dst_na->retry && retry--) + goto retry; + } + } + if (still_locked) + mtx_unlock(&kring->q_lock); + } +cleanup: + d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */ + d->bq_len = 0; + } + brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */ + brddst->bq_len = 0; + return 0; +} + + +static int +netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *kring = &na->up.tx_rings[ring_nr]; + u_int done; + u_int const lim = kring->nkr_num_slots - 1; + u_int const cur = kring->rcur; + + if (bridge_batch <= 0) { /* testing only */ + done = cur; // used all + goto done; + } + if (bridge_batch > NM_BDG_BATCH) + bridge_batch = NM_BDG_BATCH; + + done = nm_bdg_preflush(na, ring_nr, kring, cur); +done: + if (done != cur) + D("early break at %d/ %d, tail %d", done, cur, kring->nr_hwtail); + /* + * packets between 'done' and 'cur' are left unsent. + */ + kring->nr_hwcur = done; + kring->nr_hwtail = nm_prev(done, lim); + nm_txsync_finalize(kring); + if (netmap_verbose) + D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags); + return 0; +} + + +/* + * main dispatch routine for the bridge. + * We already know that only one thread is running this. + * we must run nm_bdg_preflush without lock. + */ +static int +bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na; + return netmap_vp_txsync(vpna, ring_nr, flags); +} + +static int +netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + struct netmap_ring *ring = kring->ring; + u_int nm_i, lim = kring->nkr_num_slots - 1; + u_int head = nm_rxsync_prologue(kring); + int n; + + if (head > lim) { + D("ouch dangerous reset!!!"); + n = netmap_ring_reinit(kring); + goto done; + } + + /* First part, import newly received packets. */ + /* actually nothing to do here, they are already in the kring */ + + /* Second part, skip past packets that userspace has released. 
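+ * For example: if the previous nr_hwcur was 5 and the user advanced
+ * head to 9, slots 5..8 are returned to the switch for reuse and
+ * nr_hwcur becomes 9; the loop below only sanity-checks the buffer
+ * indexes and clears NS_BUF_CHANGED along the way.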
*/ + nm_i = kring->nr_hwcur; + if (nm_i != head) { + /* consistency check, but nothing really important here */ + for (n = 0; likely(nm_i != head); n++) { + struct netmap_slot *slot = &ring->slot[nm_i]; + void *addr = BDG_NMB(na, slot); + + if (addr == netmap_buffer_base) { /* bad buf */ + D("bad buffer index %d, ignore ?", + slot->buf_idx); + } + slot->flags &= ~NS_BUF_CHANGED; + nm_i = nm_next(nm_i, lim); + } + kring->nr_hwcur = head; + } + + /* tell userspace that there are new packets */ + nm_rxsync_finalize(kring); + n = 0; +done: + return n; +} + +/* + * user process reading from a VALE switch. + * Already protected against concurrent calls from userspace, + * but we must acquire the queue's lock to protect against + * writers on the same queue. + */ +static int +bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags) +{ + struct netmap_kring *kring = &na->rx_rings[ring_nr]; + int n; + + mtx_lock(&kring->q_lock); + n = netmap_vp_rxsync(na, ring_nr, flags); + mtx_unlock(&kring->q_lock); + return n; +} + + +static int +bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp) +{ + struct netmap_vp_adapter *vpna; + struct netmap_adapter *na; + int error; + u_int npipes = 0; + + vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (vpna == NULL) + return ENOMEM; + + na = &vpna->up; + + na->ifp = ifp; + + /* bound checking */ + na->num_tx_rings = nmr->nr_tx_rings; + nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); + nmr->nr_tx_rings = na->num_tx_rings; // write back + na->num_rx_rings = nmr->nr_rx_rings; + nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL); + nmr->nr_rx_rings = na->num_rx_rings; // write back + nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE, + 1, NM_BDG_MAXSLOTS, NULL); + na->num_tx_desc = nmr->nr_tx_slots; + nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE, + 1, NM_BDG_MAXSLOTS, NULL); + /* validate number of pipes. We want at least 1, + * but probably can do with some more. + * So let's use 2 as default (when 0 is supplied) + */ + npipes = nmr->nr_arg1; + nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL); + nmr->nr_arg1 = npipes; /* write back */ + /* validate extra bufs */ + nm_bound_var(&nmr->nr_arg3, 0, 0, + 128*NM_BDG_MAXSLOTS, NULL); + na->num_rx_desc = nmr->nr_rx_slots; + vpna->virt_hdr_len = 0; + vpna->mfs = 1514; + /*if (vpna->mfs > netmap_buf_size) TODO netmap_buf_size is zero?? 
+ vpna->mfs = netmap_buf_size; */ + if (netmap_verbose) + D("max frame size %u", vpna->mfs); + + na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER; + na->nm_txsync = bdg_netmap_txsync; + na->nm_rxsync = bdg_netmap_rxsync; + na->nm_register = bdg_netmap_reg; + na->nm_dtor = netmap_adapter_vp_dtor; + na->nm_krings_create = netmap_vp_krings_create; + na->nm_krings_delete = netmap_vp_krings_delete; + na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp), + na->num_tx_rings, na->num_tx_desc, + na->num_rx_rings, na->num_rx_desc, + nmr->nr_arg3, npipes, &error); + if (na->nm_mem == NULL) + goto err; + /* other nmd fields are set in the common routine */ + error = netmap_attach_common(na); + if (error) + goto err; + return 0; + +err: + if (na->nm_mem != NULL) + netmap_mem_private_delete(na->nm_mem); + free(vpna, M_DEVBUF); + return error; +} + + +static void +netmap_bwrap_dtor(struct netmap_adapter *na) +{ + struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na; + struct netmap_adapter *hwna = bna->hwna; + struct nm_bridge *b = bna->up.na_bdg, + *bh = bna->host.na_bdg; + struct ifnet *ifp = na->ifp; + + ND("na %p", na); + + if (b) { + netmap_bdg_detach_common(b, bna->up.bdg_port, + (bh ? bna->host.bdg_port : -1)); + } + + hwna->na_private = NULL; + netmap_adapter_put(hwna); + + bzero(ifp, sizeof(*ifp)); + free(ifp, M_DEVBUF); + na->ifp = NULL; + +} + + +/* + * Intr callback for NICs connected to a bridge. + * Simply ignore tx interrupts (maybe we could try to recover space ?) + * and pass received packets from nic to the bridge. + * + * XXX TODO check locking: this is called from the interrupt + * handler so we should make sure that the interface is not + * disconnected while passing down an interrupt. + * + * Note, no user process can access this NIC or the host stack. + * The only part of the ring that is significant are the slots, + * and head/cur/tail are set from the kring as needed + * (part as a receive ring, part as a transmit ring). + * + * callback that overwrites the hwna notify callback. + * Packets come from the outside or from the host stack and are put on an hwna rx ring. + * The bridge wrapper then sends the packets through the bridge. + */ +static int +netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags) +{ + struct ifnet *ifp = na->ifp; + struct netmap_bwrap_adapter *bna = na->na_private; + struct netmap_vp_adapter *hostna = &bna->host; + struct netmap_kring *kring, *bkring; + struct netmap_ring *ring; + int is_host_ring = ring_nr == na->num_rx_rings; + struct netmap_vp_adapter *vpna = &bna->up; + int error = 0; + + if (netmap_verbose) + D("%s %s%d 0x%x", NM_IFPNAME(ifp), + (tx == NR_TX ? "TX" : "RX"), ring_nr, flags); + + if (flags & NAF_DISABLE_NOTIFY) { + kring = tx == NR_TX ? na->tx_rings : na->rx_rings; + bkring = tx == NR_TX ? 
vpna->up.rx_rings : vpna->up.tx_rings; + if (kring[ring_nr].nkr_stopped) + netmap_disable_ring(&bkring[ring_nr]); + else + bkring[ring_nr].nkr_stopped = 0; + return 0; + } + + if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP)) + return 0; + + /* we only care about receive interrupts */ + if (tx == NR_TX) + return 0; + + kring = &na->rx_rings[ring_nr]; + ring = kring->ring; + + /* make sure the ring is not disabled */ + if (nm_kr_tryget(kring)) + return 0; + + if (is_host_ring && hostna->na_bdg == NULL) { + error = bna->save_notify(na, ring_nr, tx, flags); + goto put_out; + } + + /* Here we expect ring->head = ring->cur = ring->tail + * because everything has been released from the previous round. + * However the ring is shared and we might have info from + * the wrong side (the tx ring). Hence we overwrite with + * the info from the rx kring. + */ + if (netmap_verbose) + D("%s head %d cur %d tail %d (kring %d %d %d)", NM_IFPNAME(ifp), + ring->head, ring->cur, ring->tail, + kring->rhead, kring->rcur, kring->rtail); + + ring->head = kring->rhead; + ring->cur = kring->rcur; + ring->tail = kring->rtail; + + if (is_host_ring) { + vpna = hostna; + ring_nr = 0; + } + /* simulate a user wakeup on the rx ring */ + /* fetch packets that have arrived. + * XXX maybe do this in a loop ? + */ + error = kring->nm_sync(kring, 0); + if (error) + goto put_out; + if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) { + D("how strange, interrupt with no packets on %s", + NM_IFPNAME(ifp)); + goto put_out; + } + + /* new packets are ring->cur to ring->tail, and the bkring + * had hwcur == ring->cur. So advance ring->cur to ring->tail + * to push all packets out. + */ + ring->head = ring->cur = ring->tail; + + /* also set tail to what the bwrap expects */ + bkring = &vpna->up.tx_rings[ring_nr]; + ring->tail = bkring->nr_hwtail; // rtail too ? + + /* pass packets to the switch */ + nm_txsync_prologue(bkring); // XXX error checking ? + netmap_vp_txsync(vpna, ring_nr, flags); + + /* mark all buffers as released on this ring */ + ring->head = ring->cur = kring->nr_hwtail; + ring->tail = kring->rtail; + /* another call to actually release the buffers */ + if (!is_host_ring) { + error = kring->nm_sync(kring, 0); + } else { + /* mark all packets as released, as in the + * second part of netmap_rxsync_from_host() + */ + kring->nr_hwcur = kring->nr_hwtail; + nm_rxsync_finalize(kring); + } + +put_out: + nm_kr_put(kring); + return error; +} + + +static int +netmap_bwrap_register(struct netmap_adapter *na, int onoff) +{ + struct netmap_bwrap_adapter *bna = + (struct netmap_bwrap_adapter *)na; + struct netmap_adapter *hwna = bna->hwna; + struct netmap_vp_adapter *hostna = &bna->host; + int error; + + ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off"); + + if (onoff) { + int i; + + hwna->na_lut = na->na_lut; + hwna->na_lut_objtotal = na->na_lut_objtotal; + + if (hostna->na_bdg) { + hostna->up.na_lut = na->na_lut; + hostna->up.na_lut_objtotal = na->na_lut_objtotal; + } + + /* cross-link the netmap rings + * The original number of rings comes from hwna, + * rx rings on one side equals tx rings on the other. 
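+ * In other words, after the loops below hwna->tx_rings[i] shares its
+ * netmap_ring with the bwrap's rx_rings[i] (and hwna->rx_rings[i]
+ * with tx_rings[i]), so a buffer the switch delivers to the bwrap rx
+ * ring is what the NIC transmits, and a buffer the NIC receives shows
+ * up on the bwrap tx ring, ready to be flushed into the switch.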
+ */ + for (i = 0; i < na->num_rx_rings + 1; i++) { + hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots; + hwna->tx_rings[i].ring = na->rx_rings[i].ring; + } + for (i = 0; i < na->num_tx_rings + 1; i++) { + hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots; + hwna->rx_rings[i].ring = na->tx_rings[i].ring; + } + } + + if (hwna->ifp) { + error = hwna->nm_register(hwna, onoff); + if (error) + return error; + } + + bdg_netmap_reg(na, onoff); + + if (onoff) { + bna->save_notify = hwna->nm_notify; + hwna->nm_notify = netmap_bwrap_intr_notify; + } else { + hwna->nm_notify = bna->save_notify; + hwna->na_lut = NULL; + hwna->na_lut_objtotal = 0; + } + + return 0; +} + + +static int +netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd, + u_int *rxr, u_int *rxd) +{ + struct netmap_bwrap_adapter *bna = + (struct netmap_bwrap_adapter *)na; + struct netmap_adapter *hwna = bna->hwna; + + /* forward the request */ + netmap_update_config(hwna); + /* swap the results */ + *txr = hwna->num_rx_rings; + *txd = hwna->num_rx_desc; + *rxr = hwna->num_tx_rings; + *rxd = hwna->num_rx_desc; + + return 0; +} + + +static int +netmap_bwrap_krings_create(struct netmap_adapter *na) +{ + struct netmap_bwrap_adapter *bna = + (struct netmap_bwrap_adapter *)na; + struct netmap_adapter *hwna = bna->hwna; + struct netmap_adapter *hostna = &bna->host.up; + int error; + + ND("%s", NM_IFPNAME(na->ifp)); + + error = netmap_vp_krings_create(na); + if (error) + return error; + + error = hwna->nm_krings_create(hwna); + if (error) { + netmap_vp_krings_delete(na); + return error; + } + + if (na->na_flags & NAF_HOST_RINGS) { + hostna->tx_rings = na->tx_rings + na->num_tx_rings; + hostna->rx_rings = na->rx_rings + na->num_rx_rings; + } + + return 0; +} + + +static void +netmap_bwrap_krings_delete(struct netmap_adapter *na) +{ + struct netmap_bwrap_adapter *bna = + (struct netmap_bwrap_adapter *)na; + struct netmap_adapter *hwna = bna->hwna; + + ND("%s", NM_IFPNAME(na->ifp)); + + hwna->nm_krings_delete(hwna); + netmap_vp_krings_delete(na); +} + + +/* notify method for the bridge-->hwna direction */ +static int +netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags) +{ + struct netmap_bwrap_adapter *bna = + (struct netmap_bwrap_adapter *)na; + struct netmap_adapter *hwna = bna->hwna; + struct netmap_kring *kring, *hw_kring; + struct netmap_ring *ring; + u_int lim; + int error = 0; + + if (tx == NR_TX) + return EINVAL; + + kring = &na->rx_rings[ring_n]; + hw_kring = &hwna->tx_rings[ring_n]; + ring = kring->ring; + lim = kring->nkr_num_slots - 1; + + if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP)) + return 0; + mtx_lock(&kring->q_lock); + /* first step: simulate a user wakeup on the rx ring */ + netmap_vp_rxsync(na, ring_n, flags); + ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", + NM_IFPNAME(na->ifp), ring_n, + kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, + ring->head, ring->cur, ring->tail, + hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_ring->rtail); + /* second step: the simulated user consumes all new packets */ + ring->head = ring->cur = ring->tail; + + /* third step: the new packets are sent on the tx ring + * (which is actually the same ring) + */ + /* set tail to what the hw expects */ + ring->tail = hw_kring->rtail; + nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ? 
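+	/* hw_kring->nm_sync is the driver's txsync here: this is the
+	 * call that actually hands the frames to the NIC.
+	 */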
+ error = hw_kring->nm_sync(hw_kring, flags); + + /* fourth step: now we are back the rx ring */ + /* claim ownership on all hw owned bufs */ + ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */ + ring->tail = kring->rtail; /* restore saved value of tail, for safety */ + + /* fifth step: the user goes to sleep again, causing another rxsync */ + netmap_vp_rxsync(na, ring_n, flags); + ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)", + NM_IFPNAME(na->ifp), ring_n, + kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease, + ring->head, ring->cur, ring->tail, + hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail); + mtx_unlock(&kring->q_lock); + return error; +} + + +static int +netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags) +{ + struct netmap_bwrap_adapter *bna = na->na_private; + struct netmap_adapter *port_na = &bna->up.up; + if (tx == NR_TX || ring_n != 0) + return EINVAL; + return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags); +} + + +/* attach a bridge wrapper to the 'real' device */ +static int +netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real) +{ + struct netmap_bwrap_adapter *bna; + struct netmap_adapter *na; + struct netmap_adapter *hwna = NA(real); + struct netmap_adapter *hostna; + int error; + + + bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO); + if (bna == NULL) + return ENOMEM; + + na = &bna->up.up; + na->ifp = fake; + /* fill the ring data for the bwrap adapter with rx/tx meanings + * swapped. The real cross-linking will be done during register, + * when all the krings will have been created. + */ + na->num_rx_rings = hwna->num_tx_rings; + na->num_tx_rings = hwna->num_rx_rings; + na->num_tx_desc = hwna->num_rx_desc; + na->num_rx_desc = hwna->num_tx_desc; + na->nm_dtor = netmap_bwrap_dtor; + na->nm_register = netmap_bwrap_register; + // na->nm_txsync = netmap_bwrap_txsync; + // na->nm_rxsync = netmap_bwrap_rxsync; + na->nm_config = netmap_bwrap_config; + na->nm_krings_create = netmap_bwrap_krings_create; + na->nm_krings_delete = netmap_bwrap_krings_delete; + na->nm_notify = netmap_bwrap_notify; + na->nm_mem = hwna->nm_mem; + na->na_private = na; /* prevent NIOCREGIF */ + bna->up.retry = 1; /* XXX maybe this should depend on the hwna */ + + bna->hwna = hwna; + netmap_adapter_get(hwna); + hwna->na_private = bna; /* weak reference */ + + if (hwna->na_flags & NAF_HOST_RINGS) { + na->na_flags |= NAF_HOST_RINGS; + hostna = &bna->host.up; + hostna->ifp = hwna->ifp; + hostna->num_tx_rings = 1; + hostna->num_tx_desc = hwna->num_rx_desc; + hostna->num_rx_rings = 1; + hostna->num_rx_desc = hwna->num_tx_desc; + // hostna->nm_txsync = netmap_bwrap_host_txsync; + // hostna->nm_rxsync = netmap_bwrap_host_rxsync; + hostna->nm_notify = netmap_bwrap_host_notify; + hostna->nm_mem = na->nm_mem; + hostna->na_private = bna; + } + + ND("%s<->%s txr %d txd %d rxr %d rxd %d", + fake->if_xname, real->if_xname, + na->num_tx_rings, na->num_tx_desc, + na->num_rx_rings, na->num_rx_desc); + + error = netmap_attach_common(na); + if (error) { + netmap_adapter_put(hwna); + free(bna, M_DEVBUF); + return error; + } + return 0; +} + + +void +netmap_init_bridges(void) +{ + int i; + bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */ + for (i = 0; i < NM_BRIDGES; i++) + BDG_RWINIT(&nm_bridges[i]); +} +#endif /* WITH_VALE */ diff --git a/sys/modules/netmap/Makefile b/sys/modules/netmap/Makefile index b58bfdb25..647cd1036 100644 --- a/sys/modules/netmap/Makefile +++ 
b/sys/modules/netmap/Makefile @@ -5,10 +5,16 @@ .PATH: ${.CURDIR}/../../dev/netmap .PATH.h: ${.CURDIR}/../../net +CFLAGS += -I${.CURDIR}/../../ KMOD = netmap SRCS = device_if.h bus_if.h opt_netmap.h SRCS += netmap.c netmap.h netmap_kern.h - -netmap.o: netmap_mem2.c +SRCS += netmap_mem2.c netmap_mem2.h +SRCS += netmap_generic.c +SRCS += netmap_mbq.c netmap_mbq.h +SRCS += netmap_vale.c +SRCS += netmap_freebsd.c +SRCS += netmap_offloadings.c +SRCS += netmap_pipe.c .include diff --git a/sys/net/netmap.h b/sys/net/netmap.h index 0f2baebe1..f0b4c56d4 100644 --- a/sys/net/netmap.h +++ b/sys/net/netmap.h @@ -1,33 +1,27 @@ /* - * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved. - * + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. + * * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * + * modification, are permitted provided that the following conditions + * are met: + * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. - * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the - * distribution. - * - * 3. Neither the name of the authors nor the names of their contributors - * may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY MATTEO LANDI AND CONTRIBUTORS "AS IS" AND + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``S IS''AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL MATTEO LANDI OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ /* @@ -36,7 +30,7 @@ * Definitions of constants and the structures used by the netmap * framework, for the part visible to both kernel and userspace. 
* Detailed info on netmap is available with "man netmap" or at - * + * * http://info.iet.unipi.it/~luigi/netmap/ * * This API is also used to communicate with the VALE software switch @@ -45,6 +39,18 @@ #ifndef _NET_NETMAP_H_ #define _NET_NETMAP_H_ +#define NETMAP_API 11 /* current API version */ + +#define NETMAP_MIN_API 11 /* min and max versions accepted */ +#define NETMAP_MAX_API 15 +/* + * Some fields should be cache-aligned to reduce contention. + * The alignment is architecture and OS dependent, but rather than + * digging into OS headers to find the exact value we use an estimate + * that should cover most architectures. + */ +#define NM_CACHE_ALIGN 128 + /* * --- Netmap data structures --- * @@ -58,31 +64,32 @@ ==================================================================== | USERSPACE | struct netmap_ring - +---->+--------------+ - / | cur | - struct netmap_if (nifp, 1 per fd) / | avail | - +---------------+ / | buf_ofs | - | ni_tx_rings | / +==============+ - | ni_rx_rings | / | buf_idx, len | slot[0] - | | / | flags, ptr | - | | / +--------------+ - +===============+ / | buf_idx, len | slot[1] - | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | - | txring_ofs[1] | +--------------+ - (ni_tx_rings+1 entries) (num_slots entries) - | txring_ofs[t] | | buf_idx, len | slot[n-1] - +---------------+ | flags, ptr | - | rxring_ofs[0] | +--------------+ + +---->+---------------+ + / | head,cur,tail | + struct netmap_if (nifp, 1 per fd) / | buf_ofs | + +---------------+ / | other fields | + | ni_tx_rings | / +===============+ + | ni_rx_rings | / | buf_idx, len | slot[0] + | | / | flags, ptr | + | | / +---------------+ + +===============+ / | buf_idx, len | slot[1] + | txring_ofs[0] | (rel.to nifp)--' | flags, ptr | + | txring_ofs[1] | +---------------+ + (tx+1 entries) (num_slots entries) + | txring_ofs[t] | | buf_idx, len | slot[n-1] + +---------------+ | flags, ptr | + | rxring_ofs[0] | +---------------+ | rxring_ofs[1] | - (ni_rx_rings+1 entries) + (rx+1 entries) | rxring_ofs[r] | +---------------+ - * For each "interface" (NIC, host stack, VALE switch port) attached to a - * file descriptor, the mmap()ed region contains a (logically readonly) + * For each "interface" (NIC, host stack, PIPE, VALE switch port) bound to + * a file descriptor, the mmap()ed region contains a (logically readonly) * struct netmap_if pointing to struct netmap_ring's. + * * There is one netmap_ring per physical NIC ring, plus one tx/rx ring - * pair attached to the host stack (this pair is unused for VALE ports). + * pair attached to the host stack (this pair is unused for non-NIC ports). * * All physical/host stack ports share the same memory region, * so that zero-copy can be implemented between them. @@ -94,127 +101,155 @@ * is provided for user-supplied buffers in the tx path. * * In user space, the buffer address is computed as - * (char *)ring + buf_ofs + index*NETMAP_BUF_SIZE + * (char *)ring + buf_ofs + index * NETMAP_BUF_SIZE + * + * Added in NETMAP_API 11: + * + * + NIOCREGIF can request the allocation of extra spare buffers from + * the same memory pool. The desired number of buffers must be in + * nr_arg3. The ioctl may return fewer buffers, depending on memory + * availability. nr_arg3 will return the actual value, and, once + * mapped, nifp->ni_bufs_head will be the index of the first buffer. + * + * The buffers are linked to each other using the first uint32_t + * as the index. On close, ni_bufs_head must point to the list of + * buffers to be released. 
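As a concrete illustration of the extra-buffer list just described, the following userspace sketch (not part of the patch) walks the chain. It assumes the port was registered with a non-zero nr_arg3, that 'nifp' and one mapped 'ring' are already available through the netmap_user.h macros, and that the chain is terminated by buffer index 0 (index 0 is reserved, so this is a reasonable but unverified assumption):

	uint32_t idx, count = 0;

	for (idx = nifp->ni_bufs_head; idx != 0; count++) {
		char *buf = NETMAP_BUF(ring, idx);  /* any mapped ring works here */
		idx = *(uint32_t *)buf;             /* first word links to the next buffer */
	}
	/* 'count' spare buffers are now owned by the application; before
	 * close(), nifp->ni_bufs_head must again point to the chain of
	 * buffers being returned to the kernel. */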
+ * + * + NIOCREGIF can request space for extra rings (and buffers) + * allocated in the same memory space. The number of extra rings + * is in nr_arg1, and is advisory. This is a no-op on NICs where + * the size of the memory space is fixed. + * + * + NIOCREGIF can attach to PIPE rings sharing the same memory + * space with a parent device. The ifname indicates the parent device, + * which must already exist. Flags in nr_flags indicate if we want to + * bind the master or slave side, the index (from nr_ringid) + * is just a cookie and does need to be sequential. + * + * + NIOCREGIF can also attach to 'monitor' rings that replicate + * the content of specific rings, also from the same memory space. + * + * Extra flags in nr_flags support the above functions. + * Application libraries may use the following naming scheme: + * netmap:foo all NIC ring pairs + * netmap:foo^ only host ring pair + * netmap:foo+ all NIC ring + host ring pairs + * netmap:foo-k the k-th NIC ring pair + * netmap:foo{k PIPE ring pair k, master side + * netmap:foo}k PIPE ring pair k, slave side */ /* * struct netmap_slot is a buffer descriptor - * - * buf_idx the index of the buffer associated to the slot. - * len the length of the payload - * flags control operation on the slot, as defined below - * - * NS_BUF_CHANGED must be set whenever userspace wants - * to change buf_idx (it might be necessary to - * reprogram the NIC) - * - * NS_REPORT must be set if we want the NIC to generate an interrupt - * when this slot is used. Leaving it to 0 improves - * performance. - * - * NS_FORWARD if set on a receive ring, and the device is in - * transparent mode, buffers released with the flag set - * will be forwarded to the 'other' side (host stack - * or NIC, respectively) on the next select() or ioctl() - * - * NS_NO_LEARN on a VALE switch, do not 'learn' the source port for - * this packet. - * - * NS_INDIRECT (tx rings only) data is in a userspace buffer pointed - * by the ptr field in the slot. - * - * NS_MOREFRAG Part of a multi-segment frame. The last (or only) - * segment must not have this flag. - * Only supported on VALE ports. - * - * NS_PORT_MASK the high 8 bits of the flag, if not zero, indicate the - * destination port for the VALE switch, overriding - * the lookup table. */ - struct netmap_slot { uint32_t buf_idx; /* buffer index */ - uint16_t len; /* packet length */ + uint16_t len; /* length for this slot */ uint16_t flags; /* buf changed, etc. */ + uint64_t ptr; /* pointer for indirect buffers */ +}; + +/* + * The following flags control how the slot is used + */ + #define NS_BUF_CHANGED 0x0001 /* buf_idx changed */ -#define NS_REPORT 0x0002 /* ask the hardware to report results - * e.g. by generating an interrupt - */ -#define NS_FORWARD 0x0004 /* pass packet to the other endpoint - * (host stack or device) - */ -#define NS_NO_LEARN 0x0008 -#define NS_INDIRECT 0x0010 -#define NS_MOREFRAG 0x0020 + /* + * must be set whenever buf_idx is changed (as it might be + * necessary to recompute the physical address and mapping) + */ + +#define NS_REPORT 0x0002 /* ask the hardware to report results */ + /* + * Request notification when slot is used by the hardware. + * Normally transmit completions are handled lazily and + * may be unreported. This flag lets us know when a slot + * has been sent (e.g. to terminate the sender). + */ + +#define NS_FORWARD 0x0004 /* pass packet 'forward' */ + /* + * (Only for physical ports, rx rings with NR_FORWARD set). + * Slot released to the kernel (i.e. 
before ring->head) with + * this flag set are passed to the peer ring (host/NIC), + * thus restoring the host-NIC connection for these slots. + * This supports efficient traffic monitoring or firewalling. + */ + +#define NS_NO_LEARN 0x0008 /* disable bridge learning */ + /* + * On a VALE switch, do not 'learn' the source port for + * this buffer. + */ + +#define NS_INDIRECT 0x0010 /* userspace buffer */ + /* + * (VALE tx rings only) data is in a userspace buffer, + * whose address is in the 'ptr' field in the slot. + */ + +#define NS_MOREFRAG 0x0020 /* packet has more fragments */ + /* + * (VALE ports only) + * Set on all but the last slot of a multi-segment packet. + * The 'len' field refers to the individual fragment. + */ + #define NS_PORT_SHIFT 8 #define NS_PORT_MASK (0xff << NS_PORT_SHIFT) - /* - * in rx rings, the high 8 bits - * are the number of fragments. - */ + /* + * The high 8 bits of the flag, if not zero, indicate the + * destination port for the VALE switch, overriding + * the lookup table. + */ + #define NS_RFRAGS(_slot) ( ((_slot)->flags >> 8) & 0xff) - uint64_t ptr; /* pointer for indirect buffers */ -}; + /* + * (VALE rx rings only) the high 8 bits + * are the number of fragments. + */ + /* * struct netmap_ring * * Netmap representation of a TX or RX ring (also known as "queue"). * This is a queue implemented as a fixed-size circular array. - * At the software level, two fields are important: avail and cur. + * At the software level the important fields are: head, cur, tail. * * In TX rings: * - * avail tells how many slots are available for transmission. - * It is updated by the kernel in each netmap system call. - * It MUST BE decremented by the user when it - * adds a new packet to send. + * head first slot available for transmission. + * cur wakeup point. select() and poll() will unblock + * when 'tail' moves past 'cur' + * tail (readonly) first slot reserved to the kernel * - * cur indicates the slot to use for the next packet - * to send (i.e. the "tail" of the queue). - * It MUST BE incremented by the user before - * netmap system calls to reflect the number of newly - * sent packets. - * It is checked by the kernel on netmap system calls - * (normally unmodified by the kernel unless invalid). + * [head .. tail-1] can be used for new packets to send; + * 'head' and 'cur' must be incremented as slots are filled + * with new packets to be sent; + * 'cur' can be moved further ahead if we need more space + * for new transmissions. * * In RX rings: * - * avail is the number of packets available (possibly 0). - * It is updated by the kernel in each netmap system call. - * It MUST BE decremented by the user when it - * consumes a packet. - * - * cur indicates the first slot that contains a packet not - * yet processed (the "head" of the queue). - * It MUST BE incremented by the user when it consumes - * a packet. - * - * reserved indicates the number of buffers before 'cur' - * that the user has not released yet. Normally 0, - * it MUST BE incremented by the user when it - * does not return the buffer immediately, and decremented - * when the buffer is finally freed. + * head first valid received packet + * cur wakeup point. select() and poll() will unblock + * when 'tail' moves past 'cur' + * tail (readonly) first slot reserved to the kernel * + * [head .. 
tail-1] contain received packets; + * 'head' and 'cur' must be incremented as slots are consumed + * and can be returned to the kernel; + * 'cur' can be moved further ahead if we want to wait for + * new packets without returning the previous ones. * * DATA OWNERSHIP/LOCKING: - * The netmap_ring, all slots, and buffers in the range - * [reserved-cur , cur+avail[ are owned by the user program, - * and the kernel only touches them in the same thread context - * during a system call. - * Other buffers are reserved for use by the NIC's DMA engines. - * - * FLAGS - * NR_TIMESTAMP updates the 'ts' field on each syscall. This is - * a global timestamp for all packets. - * NR_RX_TSTMP if set, the last 64 byte in each buffer will - * contain a timestamp for the frame supplied by - * the hardware (if supported) - * NR_FORWARD if set, the NS_FORWARD flag in each slot of the - * RX ring is checked, and if set the packet is - * passed to the other side (host stack or device, - * respectively). This permits bpf-like behaviour - * or transparency for selected packets. + * The netmap_ring, and all slots and buffers in the range + * [head .. tail-1] are owned by the user program; + * the kernel only accesses them during a netmap system call + * and in the user thread context. + * + * Other slots and buffers are reserved for use by the kernel */ struct netmap_ring { /* @@ -222,25 +257,44 @@ struct netmap_ring { * It contains the offset of the buffer region from this * descriptor. */ - const ssize_t buf_ofs; + const int64_t buf_ofs; const uint32_t num_slots; /* number of slots in the ring. */ - uint32_t avail; /* number of usable slots */ - uint32_t cur; /* 'current' r/w position */ - uint32_t reserved; /* not refilled before current */ + const uint32_t nr_buf_size; + const uint16_t ringid; + const uint16_t dir; /* 0: tx, 1: rx */ - const uint16_t nr_buf_size; - uint16_t flags; -#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */ -#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */ -#define NR_RX_TSTMP 0x0008 /* set rx timestamp in slots */ + uint32_t head; /* (u) first user slot */ + uint32_t cur; /* (u) wakeup point */ + uint32_t tail; /* (k) first kernel slot */ + + uint32_t flags; + + struct timeval ts; /* (k) time of last *sync() */ - struct timeval ts; /* time of last *sync() */ + /* opaque room for a mutex or similar object */ + uint8_t sem[128] __attribute__((__aligned__(NM_CACHE_ALIGN))); /* the slots follow. This struct has variable size */ struct netmap_slot slot[0]; /* array of slots. */ }; +/* + * RING FLAGS + */ +#define NR_TIMESTAMP 0x0002 /* set timestamp on *sync() */ + /* + * updates the 'ts' field on each netmap syscall. This saves + * saves a separate gettimeofday(), and is not much worse than + * software timestamps generated in the interrupt handler. + */ + +#define NR_FORWARD 0x0004 /* enable NS_FORWARD for ring */ + /* + * Enables the NS_FORWARD slot flag for the ring. + */ + + /* * Netmap representation of an interface and its queue(s). * This is initialized by the kernel when binding a file @@ -258,103 +312,214 @@ struct netmap_if { const uint32_t ni_flags; /* properties */ #define NI_PRIV_MEM 0x1 /* private memory region */ - const uint32_t ni_rx_rings; /* number of rx rings */ - const uint32_t ni_tx_rings; /* number of tx rings */ + /* + * The number of packet rings available in netmap mode. + * Physical NICs can have different numbers of tx and rx rings. + * Physical NICs also have a 'host' ring pair. 
+ * Additionally, clients can request additional ring pairs to + * be used for internal communication. + */ + const uint32_t ni_tx_rings; /* number of HW tx rings */ + const uint32_t ni_rx_rings; /* number of HW rx rings */ + + uint32_t ni_bufs_head; /* head index for extra bufs */ + uint32_t ni_spare1[5]; /* * The following array contains the offset of each netmap ring - * from this structure. The first ni_tx_rings+1 entries refer - * to the tx rings, the next ni_rx_rings+1 refer to the rx rings - * (the last entry in each block refers to the host stack rings). + * from this structure, in the following order: + * NIC tx rings (ni_tx_rings); host tx ring (1); extra tx rings; + * NIC rx rings (ni_rx_rings); host tx ring (1); extra rx rings. + * * The area is filled up by the kernel on NIOCREGIF, * and then only read by userspace code. */ const ssize_t ring_ofs[0]; }; -#ifndef NIOCREGIF + +#ifndef NIOCREGIF /* * ioctl names and related fields * + * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, + * whose identity is set in NIOCREGIF through nr_ringid. + * These are non blocking and take no argument. + * * NIOCGINFO takes a struct ifreq, the interface name is the input, * the outputs are number of queues and number of descriptor * for each queue (useful to set number of threads etc.). * The info returned is only advisory and may change before * the interface is bound to a file descriptor. * - * NIOCREGIF takes an interface name within a struct ifreq, + * NIOCREGIF takes an interface name within a struct nmre, * and activates netmap mode on the interface (if possible). * - * nr_name is the name of the interface - * - * nr_tx_slots, nr_tx_slots, nr_tx_rings, nr_rx_rings - * indicate the configuration of the port on return. - * - * On input, non-zero values for nr_tx_rings, nr_tx_slots and the - * rx counterparts may be used to reconfigure the port according - * to the requested values, but this is not guaranteed. - * The actual values are returned on completion of the ioctl(). + * The argument to NIOCGINFO/NIOCREGIF overlays struct ifreq so we + * can pass it down to other NIC-related ioctls. + * + * The actual argument (struct nmreq) has a number of options to request + * different functions. + * The following are used in NIOCREGIF when nr_cmd == 0: + * + * nr_name (in) + * The name of the port (em0, valeXXX:YYY, etc.) + * limited to IFNAMSIZ for backward compatibility. + * + * nr_version (in/out) + * Must match NETMAP_API as used in the kernel, error otherwise. + * Always returns the desired value on output. + * + * nr_tx_slots, nr_tx_slots, nr_tx_rings, nr_rx_rings (in/out) + * On input, non-zero values may be used to reconfigure the port + * according to the requested values, but this is not guaranteed. + * On output the actual values in use are reported. + * + * nr_ringid (in) + * Indicates how rings should be bound to the file descriptors. + * If nr_flags != 0, then the low bits (in NETMAP_RING_MASK) + * are used to indicate the ring number, and nr_flags specifies + * the actual rings to bind. NETMAP_NO_TX_POLL is unaffected. + * + * NOTE: THE FOLLOWING (nr_flags == 0) IS DEPRECATED: + * If nr_flags == 0, NETMAP_HW_RING and NETMAP_SW_RING control + * the binding as follows: + * 0 (default) binds all physical rings + * NETMAP_HW_RING | ring number binds a single ring pair + * NETMAP_SW_RING binds only the host tx/rx rings + * + * NETMAP_NO_TX_POLL can be OR-ed to make select()/poll() push + * packets on tx rings only if POLLOUT is set. + * The default is to push any pending packet. 
+ * + * NETMAP_DO_RX_POLL can be OR-ed to make select()/poll() release + * packets on rx rings also when POLLIN is NOT set. + * The default is to touch the rx ring only with POLLIN. + * Note that this is the opposite of TX because it + * reflects the common usage. + * + * NOTE: NETMAP_PRIV_MEM IS DEPRECATED, use nr_arg2 instead. + * NETMAP_PRIV_MEM is set on return for ports that do not use + * the global memory allocator. + * This information is not significant and applications + * should look at the region id in nr_arg2 + * + * nr_flags is the recommended mode to indicate which rings should + * be bound to a file descriptor. Values are NR_REG_* + * + * nr_arg1 (in) The number of extra rings to be reserved. + * Especially when allocating a VALE port the system only + * allocates the amount of memory needed for the port. + * If more shared memory rings are desired (e.g. for pipes), + * the first invocation for the same basename/allocator + * should specify a suitable number. Memory cannot be + * extended after the first allocation without closing + * all ports on the same region. + * + * nr_arg2 (in/out) The identity of the memory region used. + * On input, 0 means the system decides autonomously, + * other values may try to select a specific region. + * On return the actual value is reported. + * Region '1' is the global allocator, normally shared + * by all interfaces. Other values are private regions. + * If two ports the same region zero-copy is possible. + * + * nr_arg3 (in/out) number of extra buffers to be allocated. + * + * + * + * nr_cmd (in) if non-zero indicates a special command: + * NETMAP_BDG_ATTACH and nr_name = vale*:ifname + * attaches the NIC to the switch; nr_ringid specifies + * which rings to use. Used by vale-ctl -a ... + * nr_arg1 = NETMAP_BDG_HOST also attaches the host port + * as in vale-ctl -h ... + * + * NETMAP_BDG_DETACH and nr_name = vale*:ifname + * disconnects a previously attached NIC. + * Used by vale-ctl -d ... + * + * NETMAP_BDG_LIST + * list the configuration of VALE switches. + * + * NETMAP_BDG_VNET_HDR + * Set the virtio-net header length used by the client + * of a VALE switch port. + * + * nr_arg1, nr_arg2, nr_arg3 (in/out) command specific * - * nr_ringid - * indicates how rings should be bound to the file descriptors. - * The default (0) means all physical rings of a NIC are bound. - * NETMAP_HW_RING plus a ring number lets you bind just - * a single ring pair. - * NETMAP_SW_RING binds only the host tx/rx rings - * NETMAP_NO_TX_POLL prevents select()/poll() from pushing - * out packets on the tx ring unless POLLOUT is specified. - * - * NETMAP_PRIV_MEM is a return value used to indicate that - * this ring is in a private memory region hence buffer - * swapping cannot be used - * - * nr_cmd is used to configure NICs attached to a VALE switch, - * or to dump the configuration of a VALE switch. * - * nr_cmd = NETMAP_BDG_ATTACH and nr_name = vale*:ifname - * attaches the NIC to the switch, with nr_ringid specifying - * which rings to use - * - * nr_cmd = NETMAP_BDG_DETACH and nr_name = vale*:ifname - * disconnects a previously attached NIC - * - * nr_cmd = NETMAP_BDG_LIST is used to list the configuration - * of VALE switches, with additional arguments. * - * NIOCTXSYNC, NIOCRXSYNC synchronize tx or rx queues, - * whose identity is set in NIOCREGIF through nr_ringid - * - * NETMAP_API is the API version. 
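Putting the pieces above together, here is a minimal sketch (not part of the patch; the interface name and the helper name are made up, error handling is reduced to err(3), and 'len' is assumed to fit in one buffer). It binds to all NIC ring pairs with the raw ioctl interface described above, maps the shared region, and pushes a single packet on TX ring 0 following the head/cur/tail rules documented earlier:

	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <net/netmap.h>
	#include <net/netmap_user.h>
	#include <fcntl.h>
	#include <string.h>
	#include <err.h>

	void
	send_one(const char *ifname, const void *pkt, uint16_t len)
	{
		struct nmreq req;
		struct netmap_if *nifp;
		struct netmap_ring *txr;
		struct netmap_slot *slot;
		void *mem;
		int fd;

		fd = open("/dev/netmap", O_RDWR);
		if (fd < 0)
			err(1, "/dev/netmap");
		memset(&req, 0, sizeof(req));
		strncpy(req.nr_name, ifname, sizeof(req.nr_name) - 1);
		req.nr_version = NETMAP_API;
		req.nr_flags = NR_REG_ALL_NIC;		/* all NIC ring pairs */
		if (ioctl(fd, NIOCREGIF, &req) == -1)
			err(1, "NIOCREGIF %s", ifname);

		mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
		if (mem == MAP_FAILED)
			err(1, "mmap");
		nifp = NETMAP_IF(mem, req.nr_offset);
		txr = NETMAP_TXRING(nifp, 0);

		if (nm_ring_empty(txr))			/* cur == tail: no free slots */
			errx(1, "tx ring full");
		slot = &txr->slot[txr->head];
		memcpy(NETMAP_BUF(txr, slot->buf_idx), pkt, len);
		slot->len = len;
		txr->head = txr->cur = nm_ring_next(txr, txr->head);
		if (ioctl(fd, NIOCTXSYNC, NULL) == -1)	/* tell the kernel */
			err(1, "NIOCTXSYNC");
	}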
*/ + /* - * struct nmreq overlays a struct ifreq + * struct nmreq overlays a struct ifreq (just the name) + * + * On input, nr_ringid indicates which rings we are requesting, + * with the low flags for the specific ring number. + * selection FLAGS RING INDEX + * + * all the NIC rings 0x0000 - + * only HOST ring 0x2000 ring index + * single NIC ring 0x4000 - + * all the NIC+HOST rings 0x6000 - + * one pipe ring, master 0x8000 ring index + * *** INVALID 0xA000 + * one pipe ring, slave 0xC000 ring index + * *** INVALID 0xE000 + * */ struct nmreq { char nr_name[IFNAMSIZ]; uint32_t nr_version; /* API version */ -#define NETMAP_API 5 /* current version */ uint32_t nr_offset; /* nifp offset in the shared region */ uint32_t nr_memsize; /* size of the shared region */ uint32_t nr_tx_slots; /* slots in tx rings */ uint32_t nr_rx_slots; /* slots in rx rings */ uint16_t nr_tx_rings; /* number of tx rings */ uint16_t nr_rx_rings; /* number of rx rings */ + uint16_t nr_ringid; /* ring(s) we care about */ -#define NETMAP_PRIV_MEM 0x8000 /* rings use private memory */ -#define NETMAP_HW_RING 0x4000 /* low bits indicate one hw ring */ -#define NETMAP_SW_RING 0x2000 /* process the sw ring */ +#define NETMAP_HW_RING 0x4000 /* single NIC ring pair */ +#define NETMAP_SW_RING 0x2000 /* only host ring pair */ + +#define NETMAP_RING_MASK 0x0fff /* the ring number */ + #define NETMAP_NO_TX_POLL 0x1000 /* no automatic txsync on poll */ -#define NETMAP_RING_MASK 0xfff /* the ring number */ + +#define NETMAP_DO_RX_POLL 0x8000 /* DO automatic rxsync on poll */ + uint16_t nr_cmd; #define NETMAP_BDG_ATTACH 1 /* attach the NIC */ #define NETMAP_BDG_DETACH 2 /* detach the NIC */ #define NETMAP_BDG_LOOKUP_REG 3 /* register lookup function */ #define NETMAP_BDG_LIST 4 /* get bridge's info */ - uint16_t nr_arg1; +#define NETMAP_BDG_VNET_HDR 5 /* set the port virtio-net-hdr length */ +#define NETMAP_BDG_OFFSET NETMAP_BDG_VNET_HDR /* deprecated alias */ + + uint16_t nr_arg1; /* reserve extra rings in NIOCREGIF */ #define NETMAP_BDG_HOST 1 /* attach the host stack on ATTACH */ + uint16_t nr_arg2; - uint32_t spare2[3]; + uint32_t nr_arg3; /* req. extra buffers in NIOCREGIF */ + uint32_t nr_flags; + /* various modes, extends nr_ringid */ + uint32_t spare2[1]; +}; + +#define NR_REG_MASK 0xf /* values for nr_flags */ +enum { NR_REG_DEFAULT = 0, /* backward compat, should not be used. */ + NR_REG_ALL_NIC = 1, + NR_REG_SW = 2, + NR_REG_NIC_SW = 3, + NR_REG_ONE_NIC = 4, + NR_REG_PIPE_MASTER = 5, + NR_REG_PIPE_SLAVE = 6, }; +/* monitor uses the NR_REG to select the rings to monitor */ +#define NR_MONITOR_TX 0x100 +#define NR_MONITOR_RX 0x200 + /* * FreeBSD uses the size value embedded in the _IOWR to determine @@ -364,9 +529,22 @@ struct nmreq { */ #define NIOCGINFO _IOWR('i', 145, struct nmreq) /* return IF info */ #define NIOCREGIF _IOWR('i', 146, struct nmreq) /* interface register */ -#define NIOCUNREGIF _IO('i', 147) /* deprecated. Was interface unregister */ #define NIOCTXSYNC _IO('i', 148) /* sync tx queues */ #define NIOCRXSYNC _IO('i', 149) /* sync rx queues */ #endif /* !NIOCREGIF */ + +/* + * Helper functions for kernel and userspace + */ + +/* + * check if space is available in the ring. 
+ */ +static inline int +nm_ring_empty(struct netmap_ring *ring) +{ + return (ring->cur == ring->tail); +} + #endif /* _NET_NETMAP_H_ */ diff --git a/sys/net/netmap_user.h b/sys/net/netmap_user.h index fcb5cb3ea..9c3a4c1e5 100644 --- a/sys/net/netmap_user.h +++ b/sys/net/netmap_user.h @@ -1,40 +1,34 @@ /* - * Copyright (C) 2011 Matteo Landi, Luigi Rizzo. All rights reserved. - * + * Copyright (C) 2011-2014 Universita` di Pisa. All rights reserved. + * * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * + * modification, are permitted provided that the following conditions + * are met: + * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. - * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the - * distribution. - * - * 3. Neither the name of the authors nor the names of their contributors - * may be used to endorse or promote products derived from this - * software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY MATTEO LANDI AND CONTRIBUTORS "AS IS" AND + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL MATTEO LANDI OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ /* * $FreeBSD$ * - * This header contains the macros used to manipulate netmap structures - * and packets in userspace. See netmap(4) for more information. + * Functions and macros to manipulate netmap structures and packets + * in userspace. See netmap(4) for more information. * * The address of the struct netmap_if, say nifp, is computed from the * value returned from ioctl(.., NIOCREG, ...) 
and the mmap region: @@ -49,22 +43,44 @@ * we can access ring->nr_cur, ring->nr_avail, ring->nr_flags * * ring->slot[i] gives us the i-th slot (we can access - * directly plen, flags, bufindex) + * directly len, flags, buf_idx) * * char *buf = NETMAP_BUF(ring, x) returns a pointer to * the buffer numbered x * - * Since rings are circular, we have macros to compute the next index - * i = NETMAP_RING_NEXT(ring, i); + * All ring indexes (head, cur, tail) should always move forward. + * To compute the next index in a circular ring you can use + * i = nm_ring_next(ring, i); + * + * To ease porting apps from pcap to netmap we supply a few fuctions + * that can be called to open, close, read and write on netmap in a way + * similar to libpcap. Note that the read/write function depend on + * an ioctl()/select()/poll() being issued to refill rings or push + * packets out. + * + * In order to use these, include #define NETMAP_WITH_LIBS + * in the source file that invokes these functions. */ #ifndef _NET_NETMAP_USER_H_ #define _NET_NETMAP_USER_H_ +#include +#include /* apple needs sockaddr */ +#include /* IFNAMSIZ */ + +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif /* likely and unlikely */ + +#include + +/* helper macro */ #define _NETMAP_OFFSET(type, ptr, offset) \ ((type)(void *)((char *)(ptr) + (offset))) -#define NETMAP_IF(b, o) _NETMAP_OFFSET(struct netmap_if *, b, o) +#define NETMAP_IF(_base, _ofs) _NETMAP_OFFSET(struct netmap_if *, _base, _ofs) #define NETMAP_TXRING(nifp, index) _NETMAP_OFFSET(struct netmap_ring *, \ nifp, (nifp)->ring_ofs[index] ) @@ -77,19 +93,585 @@ #define NETMAP_BUF_IDX(ring, buf) \ ( ((char *)(buf) - ((char *)(ring) + (ring)->buf_ofs) ) / \ - (ring)->nr_buf_size ) + (ring)->nr_buf_size ) + + +static inline uint32_t +nm_ring_next(struct netmap_ring *r, uint32_t i) +{ + return ( unlikely(i + 1 == r->num_slots) ? 0 : i + 1); +} + + +/* + * Return 1 if we have pending transmissions in the tx ring. + * When everything is complete ring->head = ring->tail + 1 (modulo ring size) + */ +static inline int +nm_tx_pending(struct netmap_ring *r) +{ + return nm_ring_next(r, r->tail) != r->head; +} + -#define NETMAP_RING_NEXT(r, i) \ - ((i)+1 == (r)->num_slots ? 0 : (i) + 1 ) +static inline uint32_t +nm_ring_space(struct netmap_ring *ring) +{ + int ret = ring->tail - ring->cur; + if (ret < 0) + ret += ring->num_slots; + return ret; +} -#define NETMAP_RING_FIRST_RESERVED(r) \ - ( (r)->cur < (r)->reserved ? \ - (r)->cur + (r)->num_slots - (r)->reserved : \ - (r)->cur - (r)->reserved ) +#ifdef NETMAP_WITH_LIBS /* - * Return 1 if the given tx ring is empty. + * Support for simple I/O libraries. + * Include other system headers required for compiling this. */ -#define NETMAP_TX_RING_EMPTY(r) ((r)->avail >= (r)->num_slots - 1) + +#ifndef HAVE_NETMAP_WITH_LIBS +#define HAVE_NETMAP_WITH_LIBS + +#include +#include +#include /* memset */ +#include +#include /* EINVAL */ +#include /* O_RDWR */ +#include /* close() */ +#include +#include + +#ifndef ND /* debug macros */ +/* debug support */ +#define ND(_fmt, ...) do {} while(0) +#define D(_fmt, ...) \ + do { \ + struct timeval t0; \ + gettimeofday(&t0, NULL); \ + fprintf(stderr, "%03d.%06d %s [%d] " _fmt "\n", \ + (int)(t0.tv_sec % 1000), (int)t0.tv_usec, \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + } while (0) + +/* Rate limited version of "D", lps indicates how many per second */ +#define RD(lps, format, ...) 
\ + do { \ + static int t0, __cnt; \ + struct timeval __xxts; \ + gettimeofday(&__xxts, NULL); \ + if (t0 != __xxts.tv_sec) { \ + t0 = __xxts.tv_sec; \ + __cnt = 0; \ + } \ + if (__cnt++ < lps) { \ + D(format, ##__VA_ARGS__); \ + } \ + } while (0) +#endif + +struct nm_pkthdr { /* same as pcap_pkthdr */ + struct timeval ts; + uint32_t caplen; + uint32_t len; +}; + +struct nm_stat { /* same as pcap_stat */ + u_int ps_recv; + u_int ps_drop; + u_int ps_ifdrop; +#ifdef WIN32 + u_int bs_capt; +#endif /* WIN32 */ +}; + +#define NM_ERRBUF_SIZE 512 + +struct nm_desc { + struct nm_desc *self; /* point to self if netmap. */ + int fd; + void *mem; + int memsize; + int done_mmap; /* set if mem is the result of mmap */ + struct netmap_if * const nifp; + uint16_t first_tx_ring, last_tx_ring, cur_tx_ring; + uint16_t first_rx_ring, last_rx_ring, cur_rx_ring; + struct nmreq req; /* also contains the nr_name = ifname */ + struct nm_pkthdr hdr; + + /* + * The memory contains netmap_if, rings and then buffers. + * Given a pointer (e.g. to nm_inject) we can compare with + * mem/buf_start/buf_end to tell if it is a buffer or + * some other descriptor in our region. + * We also store a pointer to some ring as it helps in the + * translation from buffer indexes to addresses. + */ + struct netmap_ring * const some_ring; + void * const buf_start; + void * const buf_end; + /* parameters from pcap_open_live */ + int snaplen; + int promisc; + int to_ms; + char *errbuf; + + /* save flags so we can restore them on close */ + uint32_t if_flags; + uint32_t if_reqcap; + uint32_t if_curcap; + + struct nm_stat st; + char msg[NM_ERRBUF_SIZE]; +}; + +/* + * when the descriptor is open correctly, d->self == d + * Eventually we should also use some magic number. + */ +#define P2NMD(p) ((struct nm_desc *)(p)) +#define IS_NETMAP_DESC(d) ((d) && P2NMD(d)->self == P2NMD(d)) +#define NETMAP_FD(d) (P2NMD(d)->fd) + + +/* + * this is a slightly optimized copy routine which rounds + * to multiple of 64 bytes and is often faster than dealing + * with other odd sizes. We assume there is enough room + * in the source and destination buffers. + * + * XXX only for multiples of 64 bytes, non overlapped. + */ +static inline void +nm_pkt_copy(const void *_src, void *_dst, int l) +{ + const uint64_t *src = (const uint64_t *)_src; + uint64_t *dst = (uint64_t *)_dst; + + if (unlikely(l >= 1024)) { + memcpy(dst, src, l); + return; + } + for (; likely(l > 0); l-=64) { + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + *dst++ = *src++; + } +} + + +/* + * The callback, invoked on each received packet. Same as libpcap + */ +typedef void (*nm_cb_t)(u_char *, const struct nm_pkthdr *, const u_char *d); + +/* + *--- the pcap-like API --- + * + * nm_open() opens a file descriptor, binds to a port and maps memory. + * + * ifname (netmap:foo or vale:foo) is the port name + * a suffix can indicate the follwing: + * ^ bind the host (sw) ring pair + * * bind host and NIC ring pairs (transparent) + * -NN bind individual NIC ring pair + * {NN bind master side of pipe NN + * }NN bind slave side of pipe NN + * + * req provides the initial values of nmreq before parsing ifname. 
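As an aside, the port-name suffixes listed above map to nm_open() calls like the following sketch (the port names are made up; for the pipe cases the parent port must already exist, as noted earlier):

	struct nm_desc *all  = nm_open("netmap:em0",   NULL, 0, NULL);	/* all NIC ring pairs   */
	struct nm_desc *sw   = nm_open("netmap:em0^",  NULL, 0, NULL);	/* host (sw) ring pair  */
	struct nm_desc *both = nm_open("netmap:em0*",  NULL, 0, NULL);	/* NIC + host rings     */
	struct nm_desc *one  = nm_open("netmap:em0-2", NULL, 0, NULL);	/* NIC ring pair 2 only */
	struct nm_desc *m    = nm_open("netmap:em0{3", NULL, 0, NULL);	/* master side, pipe 3  */
	struct nm_desc *s    = nm_open("netmap:em0}3", NULL, 0, NULL);	/* slave side, pipe 3   */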
+ * Remember that the ifname parsing will override the ring + * number in nm_ringid, and part of nm_flags; + * flags special functions, normally 0 + * indicates which fields of *arg are significant + * arg special functions, normally NULL + * if passed a netmap_desc with mem != NULL, + * use that memory instead of mmap. + */ + +static struct nm_desc *nm_open(const char *ifname, const struct nmreq *req, + uint64_t flags, const struct nm_desc *arg); + +/* + * nm_open can import some fields from the parent descriptor. + * These flags control which ones. + * Also in flags you can specify NETMAP_NO_TX_POLL and NETMAP_DO_RX_POLL, + * which set the initial value for these flags. + * Note that the 16 low bits of the flags are reserved for data + * that may go into the nmreq. + */ +enum { + NM_OPEN_NO_MMAP = 0x040000, /* reuse mmap from parent */ + NM_OPEN_IFNAME = 0x080000, /* nr_name, nr_ringid, nr_flags */ + NM_OPEN_ARG1 = 0x100000, + NM_OPEN_ARG2 = 0x200000, + NM_OPEN_ARG3 = 0x400000, + NM_OPEN_RING_CFG = 0x800000, /* tx|rx rings|slots */ +}; + + +/* + * nm_close() closes and restores the port to its previous state + */ + +static int nm_close(struct nm_desc *); + +/* + * nm_inject() is the same as pcap_inject() + * nm_dispatch() is the same as pcap_dispatch() + * nm_nextpkt() is the same as pcap_next() + */ + +static int nm_inject(struct nm_desc *, const void *, size_t); +static int nm_dispatch(struct nm_desc *, int, nm_cb_t, u_char *); +static u_char *nm_nextpkt(struct nm_desc *, struct nm_pkthdr *); + + +/* + * Try to open, return descriptor if successful, NULL otherwise. + * An invalid netmap name will return errno = 0; + * You can pass a pointer to a pre-filled nm_desc to add special + * parameters. Flags is used as follows + * NM_OPEN_NO_MMAP use the memory from arg, only + * if the nr_arg2 (memory block) matches. 
+ * NM_OPEN_ARG1 use req.nr_arg1 from arg + * NM_OPEN_ARG2 use req.nr_arg2 from arg + * NM_OPEN_RING_CFG user ring config from arg + */ +static struct nm_desc * +nm_open(const char *ifname, const struct nmreq *req, + uint64_t new_flags, const struct nm_desc *arg) +{ + struct nm_desc *d = NULL; + const struct nm_desc *parent = arg; + u_int namelen; + uint32_t nr_ringid = 0, nr_flags; + const char *port = NULL; + const char *errmsg = NULL; + + if (strncmp(ifname, "netmap:", 7) && strncmp(ifname, "vale", 4)) { + errno = 0; /* name not recognised, not an error */ + return NULL; + } + if (ifname[0] == 'n') + ifname += 7; + /* scan for a separator */ + for (port = ifname; *port && !index("-*^{}", *port); port++) + ; + namelen = port - ifname; + if (namelen >= sizeof(d->req.nr_name)) { + errmsg = "name too long"; + goto fail; + } + switch (*port) { + default: /* '\0', no suffix */ + nr_flags = NR_REG_ALL_NIC; + break; + case '-': /* one NIC */ + nr_flags = NR_REG_ONE_NIC; + nr_ringid = atoi(port + 1); + break; + case '*': /* NIC and SW, ignore port */ + nr_flags = NR_REG_NIC_SW; + if (port[1]) { + errmsg = "invalid port for nic+sw"; + goto fail; + } + break; + case '^': /* only sw ring */ + nr_flags = NR_REG_SW; + if (port[1]) { + errmsg = "invalid port for sw ring"; + goto fail; + } + break; + case '{': + nr_flags = NR_REG_PIPE_MASTER; + nr_ringid = atoi(port + 1); + break; + case '}': + nr_flags = NR_REG_PIPE_SLAVE; + nr_ringid = atoi(port + 1); + break; + } + + if (nr_ringid >= NETMAP_RING_MASK) { + errmsg = "invalid ringid"; + goto fail; + } + /* add the *XPOLL flags */ + nr_ringid |= new_flags & (NETMAP_NO_TX_POLL | NETMAP_DO_RX_POLL); + + d = (struct nm_desc *)calloc(1, sizeof(*d)); + if (d == NULL) { + errmsg = "nm_desc alloc failure"; + errno = ENOMEM; + return NULL; + } + d->self = d; /* set this early so nm_close() works */ + d->fd = open("/dev/netmap", O_RDWR); + if (d->fd < 0) { + errmsg = "cannot open /dev/netmap"; + goto fail; + } + + if (req) + d->req = *req; + d->req.nr_version = NETMAP_API; + d->req.nr_ringid &= ~NETMAP_RING_MASK; + + /* these fields are overridden by ifname and flags processing */ + d->req.nr_ringid |= nr_ringid; + d->req.nr_flags = nr_flags; + memcpy(d->req.nr_name, ifname, namelen); + d->req.nr_name[namelen] = '\0'; + /* optionally import info from parent */ + if (IS_NETMAP_DESC(parent) && new_flags) { + if (new_flags & NM_OPEN_ARG1) + D("overriding ARG1 %d", parent->req.nr_arg1); + d->req.nr_arg1 = new_flags & NM_OPEN_ARG1 ? + parent->req.nr_arg1 : 4; + if (new_flags & NM_OPEN_ARG2) + D("overriding ARG2 %d", parent->req.nr_arg2); + d->req.nr_arg2 = new_flags & NM_OPEN_ARG2 ? + parent->req.nr_arg2 : 0; + if (new_flags & NM_OPEN_ARG3) + D("overriding ARG3 %d", parent->req.nr_arg3); + d->req.nr_arg3 = new_flags & NM_OPEN_ARG3 ? 
+ parent->req.nr_arg3 : 0; + if (new_flags & NM_OPEN_RING_CFG) { + D("overriding RING_CFG"); + d->req.nr_tx_slots = parent->req.nr_tx_slots; + d->req.nr_rx_slots = parent->req.nr_rx_slots; + d->req.nr_tx_rings = parent->req.nr_tx_rings; + d->req.nr_rx_rings = parent->req.nr_rx_rings; + } + if (new_flags & NM_OPEN_IFNAME) { + D("overriding ifname %s ringid 0x%x flags 0x%x", + parent->req.nr_name, parent->req.nr_ringid, + parent->req.nr_flags); + memcpy(d->req.nr_name, parent->req.nr_name, + sizeof(d->req.nr_name)); + d->req.nr_ringid = parent->req.nr_ringid; + d->req.nr_flags = parent->req.nr_flags; + } + } + if (ioctl(d->fd, NIOCREGIF, &d->req)) { + errmsg = "NIOCREGIF failed"; + goto fail; + } + + if (IS_NETMAP_DESC(parent) && parent->mem && + parent->req.nr_arg2 == d->req.nr_arg2) { + /* do not mmap, inherit from parent */ + d->memsize = parent->memsize; + d->mem = parent->mem; + } else { + d->memsize = d->req.nr_memsize; + d->mem = mmap(0, d->memsize, PROT_WRITE | PROT_READ, MAP_SHARED, + d->fd, 0); + if (d->mem == NULL) { + errmsg = "mmap failed"; + goto fail; + } + d->done_mmap = 1; + } + { + struct netmap_if *nifp = NETMAP_IF(d->mem, d->req.nr_offset); + struct netmap_ring *r = NETMAP_RXRING(nifp, ); + + *(struct netmap_if **)(uintptr_t)&(d->nifp) = nifp; + *(struct netmap_ring **)(uintptr_t)&d->some_ring = r; + *(void **)(uintptr_t)&d->buf_start = NETMAP_BUF(r, 0); + *(void **)(uintptr_t)&d->buf_end = + (char *)d->mem + d->memsize; + } + + if (nr_flags == NR_REG_SW) { /* host stack */ + d->first_tx_ring = d->last_tx_ring = d->req.nr_tx_rings; + d->first_rx_ring = d->last_rx_ring = d->req.nr_rx_rings; + } else if (nr_flags == NR_REG_ALL_NIC) { /* only nic */ + d->first_tx_ring = 0; + d->first_rx_ring = 0; + d->last_tx_ring = d->req.nr_tx_rings - 1; + d->last_rx_ring = d->req.nr_rx_rings - 1; + } else if (nr_flags == NR_REG_NIC_SW) { + d->first_tx_ring = 0; + d->first_rx_ring = 0; + d->last_tx_ring = d->req.nr_tx_rings; + d->last_rx_ring = d->req.nr_rx_rings; + } else if (nr_flags == NR_REG_ONE_NIC) { + /* XXX check validity */ + d->first_tx_ring = d->last_tx_ring = + d->first_rx_ring = d->last_rx_ring = nr_ringid; + } else { /* pipes */ + d->first_tx_ring = d->last_tx_ring = 0; + d->first_rx_ring = d->last_rx_ring = 0; + } + +#ifdef DEBUG_NETMAP_USER + { /* debugging code */ + int i; + + D("%s tx %d .. %d %d rx %d .. %d %d", ifname, + d->first_tx_ring, d->last_tx_ring, d->req.nr_tx_rings, + d->first_rx_ring, d->last_rx_ring, d->req.nr_rx_rings); + for (i = 0; i <= d->req.nr_tx_rings; i++) { + struct netmap_ring *r = NETMAP_TXRING(d->nifp, i); + D("TX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail); + } + for (i = 0; i <= d->req.nr_rx_rings; i++) { + struct netmap_ring *r = NETMAP_RXRING(d->nifp, i); + D("RX%d %p h %d c %d t %d", i, r, r->head, r->cur, r->tail); + } + } +#endif /* debugging */ + + d->cur_tx_ring = d->first_tx_ring; + d->cur_rx_ring = d->first_rx_ring; + return d; + +fail: + nm_close(d); + if (errmsg) + D("%s %s", errmsg, ifname); + errno = EINVAL; + return NULL; +} + + +static int +nm_close(struct nm_desc *d) +{ + /* + * ugly trick to avoid unused warnings + */ + static void *__xxzt[] __attribute__ ((unused)) = + { (void *)nm_open, (void *)nm_inject, + (void *)nm_dispatch, (void *)nm_nextpkt } ; + + if (d == NULL || d->self != d) + return EINVAL; + if (d->done_mmap && d->mem) + munmap(d->mem, d->memsize); + if (d->fd != -1) + close(d->fd); + bzero(d, sizeof(*d)); + free(d); + return 0; +} + + +/* + * Same prototype as pcap_inject(), only need to cast. 
+ */ +static int +nm_inject(struct nm_desc *d, const void *buf, size_t size) +{ + u_int c, n = d->last_tx_ring - d->first_tx_ring + 1; + + for (c = 0; c < n ; c++) { + /* compute current ring to use */ + struct netmap_ring *ring; + uint32_t i, idx; + uint32_t ri = d->cur_tx_ring + c; + + if (ri > d->last_tx_ring) + ri = d->first_tx_ring; + ring = NETMAP_TXRING(d->nifp, ri); + if (nm_ring_empty(ring)) { + continue; + } + i = ring->cur; + idx = ring->slot[i].buf_idx; + ring->slot[i].len = size; + nm_pkt_copy(buf, NETMAP_BUF(ring, idx), size); + d->cur_tx_ring = ri; + ring->head = ring->cur = nm_ring_next(ring, i); + return size; + } + return 0; /* fail */ +} + + +/* + * Same prototype as pcap_dispatch(), only need to cast. + */ +static int +nm_dispatch(struct nm_desc *d, int cnt, nm_cb_t cb, u_char *arg) +{ + int n = d->last_rx_ring - d->first_rx_ring + 1; + int c, got = 0, ri = d->cur_rx_ring; + + if (cnt == 0) + cnt = -1; + /* cnt == -1 means infinite, but rings have a finite amount + * of buffers and the int is large enough that we never wrap, + * so we can omit checking for -1 + */ + for (c=0; c < n && cnt != got; c++) { + /* compute current ring to use */ + struct netmap_ring *ring; + + ri = d->cur_rx_ring + c; + if (ri > d->last_rx_ring) + ri = d->first_rx_ring; + ring = NETMAP_RXRING(d->nifp, ri); + for ( ; !nm_ring_empty(ring) && cnt != got; got++) { + u_int i = ring->cur; + u_int idx = ring->slot[i].buf_idx; + u_char *buf = (u_char *)NETMAP_BUF(ring, idx); + + // __builtin_prefetch(buf); + d->hdr.len = d->hdr.caplen = ring->slot[i].len; + d->hdr.ts = ring->ts; + cb(arg, &d->hdr, buf); + ring->head = ring->cur = nm_ring_next(ring, i); + } + } + d->cur_rx_ring = ri; + return got; +} + +static u_char * +nm_nextpkt(struct nm_desc *d, struct nm_pkthdr *hdr) +{ + int ri = d->cur_rx_ring; + + do { + /* compute current ring to use */ + struct netmap_ring *ring = NETMAP_RXRING(d->nifp, ri); + if (!nm_ring_empty(ring)) { + u_int i = ring->cur; + u_int idx = ring->slot[i].buf_idx; + u_char *buf = (u_char *)NETMAP_BUF(ring, idx); + + // __builtin_prefetch(buf); + hdr->ts = ring->ts; + hdr->len = hdr->caplen = ring->slot[i].len; + ring->cur = nm_ring_next(ring, i); + /* we could postpone advancing head if we want + * to hold the buffer. This can be supported in + * the future. + */ + ring->head = ring->cur; + d->cur_rx_ring = ri; + return buf; + } + ri++; + if (ri > d->last_rx_ring) + ri = d->first_rx_ring; + } while (ri != d->cur_rx_ring); + return NULL; /* nothing found */ +} + +#endif /* !HAVE_NETMAP_WITH_LIBS */ + +#endif /* NETMAP_WITH_LIBS */ #endif /* _NET_NETMAP_USER_H_ */ diff --git a/tools/tools/netmap/Makefile b/tools/tools/netmap/Makefile index d737bac71..c50247366 100644 --- a/tools/tools/netmap/Makefile +++ b/tools/tools/netmap/Makefile @@ -3,26 +3,30 @@ # # For multiple programs using a single source file each, # we can just define 'progs' and create custom targets. 
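The pcap-like calls defined above (nm_open/nm_dispatch/nm_close) are easiest to see in a tiny receive loop. The sketch below is illustrative only, with a made-up interface name, and assumes NETMAP_WITH_LIBS as described in the header:

	#define NETMAP_WITH_LIBS
	#include <net/netmap_user.h>
	#include <poll.h>

	static void
	rx_handler(u_char *arg, const struct nm_pkthdr *h, const u_char *buf)
	{
		(void)arg; (void)buf;
		D("received %u bytes", h->len);	/* D() is provided by NETMAP_WITH_LIBS */
	}

	int
	main(void)
	{
		struct nm_desc *d = nm_open("netmap:em0", NULL, 0, NULL);
		struct pollfd pfd;

		if (d == NULL)
			return 1;
		pfd.fd = NETMAP_FD(d);
		pfd.events = POLLIN;
		for (;;) {
			if (poll(&pfd, 1, 1000) > 0)
				nm_dispatch(d, 0, rx_handler, NULL);	/* 0: no packet limit */
		}
		/* not reached */
		return nm_close(d);
	}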
-PROGS = pkt-gen bridge vale-ctl testpcap libnetmap.so +PROGS = pkt-gen bridge vale-ctl -CLEANFILES = $(PROGS) pcap.o nm_util.o +CLEANFILES = $(PROGS) *.o NO_MAN= -CFLAGS += -Werror -Wall -nostdinc -I/usr/include -I../../../sys +CFLAGS += -Werror -Wall # -nostdinc -I/usr/include -I../../../sys CFLAGS += -Wextra -LDFLAGS += -lpthread -lpcap +LDFLAGS += -lpthread +.ifdef WITHOUT_PCAP +CFLAGS += -DNO_PCAP +.else +LDFLAGS += -lpcap +.endif .include .include all: $(PROGS) -pkt-gen bridge: nm_util.o - $(CC) $(CFLAGS) -o ${.TARGET} ${.TARGET:=.c} nm_util.o $(LDFLAGS) +pkt-gen: pkt-gen.o + $(CC) $(CFLAGS) -o pkt-gen pkt-gen.o $(LDFLAGS) -testpcap: pcap.c libnetmap.so - $(CC) $(CFLAGS) -DTEST -L. -lnetmap -o ${.TARGET} pcap.c - -libnetmap.so: pcap.c nm_util.c - $(CC) $(CFLAGS) -fpic -c ${.ALLSRC} - $(CC) -shared -o ${.TARGET} ${.ALLSRC:.c=.o} +bridge: bridge.o + $(CC) $(CFLAGS) -o bridge bridge.o + +vale-ctl: vale-ctl.o + $(CC) $(CFLAGS) -o vale-ctl vale-ctl.o diff --git a/tools/tools/netmap/README b/tools/tools/netmap/README index 2bde6f2ab..40378e62b 100644 --- a/tools/tools/netmap/README +++ b/tools/tools/netmap/README @@ -6,19 +6,4 @@ This directory contains examples that use netmap bridge a two-port jumper wire, also using the native API - testpcap a jumper wire using libnetmap (or libpcap) - - click* various click examples - ------------------------------------------------------------- -Some performance data as of may 2012 for applications using libpcap. -Throughput is generally in Mpps computed with the 64-byte frames, -using 1 core on a 2.9GHz CPU and 10Gbit/s interface - -Libpcap version -- Application --------------------- -BSD netmap ---------------------------------------------------- - 0.77 3.82 ports/trafshow (version 5) - 0.94 7.7 net-mgmt/ipcad (ip accounting daemon) - 0.9 5.0 net-mgmt/darkstat (ip accounting + graphing) - 0.83 2.45 net-mgmt/iftop (curses traffic display) + vale-ctl the program to control VALE bridges diff --git a/tools/tools/netmap/bridge.c b/tools/tools/netmap/bridge.c index 0aca44d44..0895d4ede 100644 --- a/tools/tools/netmap/bridge.c +++ b/tools/tools/netmap/bridge.c @@ -1,5 +1,5 @@ /* - * (C) 2011 Luigi Rizzo, Matteo Landi + * (C) 2011-2014 Luigi Rizzo, Matteo Landi * * BSD license * @@ -9,14 +9,15 @@ * $FreeBSD$ */ -#include "nm_util.h" - +#include +#define NETMAP_WITH_LIBS +#include +#include int verbose = 0; -char *version = "$Id$"; - static int do_abort = 0; +static int zerocopy = 1; /* enable zerocopy if possible */ static void sigint_h(int sig) @@ -27,6 +28,26 @@ sigint_h(int sig) } +/* + * how many packets on this set of queues ? + */ +int +pkt_queued(struct nm_desc *d, int tx) +{ + u_int i, tot = 0; + + if (tx) { + for (i = d->first_tx_ring; i <= d->last_tx_ring; i++) { + tot += nm_ring_space(NETMAP_TXRING(d->nifp, i)); + } + } else { + for (i = d->first_rx_ring; i <= d->last_rx_ring; i++) { + tot += nm_ring_space(NETMAP_RXRING(d->nifp, i)); + } + } + return tot; +} + /* * move up to 'limit' pkts from rxring to txring swapping buffers. 
*/ @@ -42,20 +63,16 @@ process_rings(struct netmap_ring *rxring, struct netmap_ring *txring, msg, rxring->flags, txring->flags); j = rxring->cur; /* RX */ k = txring->cur; /* TX */ - if (rxring->avail < limit) - limit = rxring->avail; - if (txring->avail < limit) - limit = txring->avail; + m = nm_ring_space(rxring); + if (m < limit) + limit = m; + m = nm_ring_space(txring); + if (m < limit) + limit = m; m = limit; while (limit-- > 0) { struct netmap_slot *rs = &rxring->slot[j]; struct netmap_slot *ts = &txring->slot[k]; -#ifdef NO_SWAP - char *rxbuf = NETMAP_BUF(rxring, rs->buf_idx); - char *txbuf = NETMAP_BUF(txring, ts->buf_idx); -#else - uint32_t pkt; -#endif /* swap packets */ if (ts->buf_idx < 2 || rs->buf_idx < 2) { @@ -63,31 +80,31 @@ process_rings(struct netmap_ring *rxring, struct netmap_ring *txring, j, rs->buf_idx, k, ts->buf_idx); sleep(2); } -#ifndef NO_SWAP - pkt = ts->buf_idx; - ts->buf_idx = rs->buf_idx; - rs->buf_idx = pkt; -#endif /* copy the packet length. */ - if (rs->len < 14 || rs->len > 2048) + if (rs->len > 2048) { D("wrong len %d rx[%d] -> tx[%d]", rs->len, j, k); - else if (verbose > 1) + rs->len = 0; + } else if (verbose > 1) { D("%s send len %d rx[%d] -> tx[%d]", msg, rs->len, j, k); + } ts->len = rs->len; -#ifdef NO_SWAP - pkt_copy(rxbuf, txbuf, ts->len); -#else - /* report the buffer change. */ - ts->flags |= NS_BUF_CHANGED; - rs->flags |= NS_BUF_CHANGED; -#endif /* NO_SWAP */ - j = NETMAP_RING_NEXT(rxring, j); - k = NETMAP_RING_NEXT(txring, k); + if (zerocopy) { + uint32_t pkt = ts->buf_idx; + ts->buf_idx = rs->buf_idx; + rs->buf_idx = pkt; + /* report the buffer change. */ + ts->flags |= NS_BUF_CHANGED; + rs->flags |= NS_BUF_CHANGED; + } else { + char *rxbuf = NETMAP_BUF(rxring, rs->buf_idx); + char *txbuf = NETMAP_BUF(txring, ts->buf_idx); + nm_pkt_copy(rxbuf, txbuf, ts->len); + } + j = nm_ring_next(rxring, j); + k = nm_ring_next(txring, k); } - rxring->avail -= m; - txring->avail -= m; - rxring->cur = j; - txring->cur = k; + rxring->head = rxring->cur = j; + txring->head = txring->cur = k; if (verbose && m > 0) D("%s sent %d packets to %p", msg, m, txring); @@ -96,22 +113,22 @@ process_rings(struct netmap_ring *rxring, struct netmap_ring *txring, /* move packts from src to destination */ static int -move(struct my_ring *src, struct my_ring *dst, u_int limit) +move(struct nm_desc *src, struct nm_desc *dst, u_int limit) { struct netmap_ring *txring, *rxring; - u_int m = 0, si = src->begin, di = dst->begin; - const char *msg = (src->queueid & NETMAP_SW_RING) ? + u_int m = 0, si = src->first_rx_ring, di = dst->first_tx_ring; + const char *msg = (src->req.nr_ringid & NETMAP_SW_RING) ? "host->net" : "net->host"; - while (si < src->end && di < dst->end) { + while (si <= src->last_rx_ring && di <= dst->last_tx_ring) { rxring = NETMAP_RXRING(src->nifp, si); txring = NETMAP_TXRING(dst->nifp, di); ND("txring %p rxring %p", txring, rxring); - if (rxring->avail == 0) { + if (nm_ring_empty(rxring)) { si++; continue; } - if (txring->avail == 0) { + if (nm_ring_empty(txring)) { di++; continue; } @@ -121,28 +138,6 @@ move(struct my_ring *src, struct my_ring *dst, u_int limit) return (m); } -/* - * how many packets on this set of queues ? - */ -static int -pkt_queued(struct my_ring *me, int tx) -{ - u_int i, tot = 0; - - ND("me %p begin %d end %d", me, me->begin, me->end); - for (i = me->begin; i < me->end; i++) { - struct netmap_ring *ring = tx ? 
- NETMAP_TXRING(me->nifp, i) : NETMAP_RXRING(me->nifp, i); - tot += ring->avail; - } - if (0 && verbose && tot && !tx) - D("ring %s %s %s has %d avail at %d", - me->ifname, tx ? "tx": "rx", - me->end >= me->nifp->ni_tx_rings ? // XXX who comes first ? - "host":"net", - tot, NETMAP_TXRING(me->nifp, me->begin)->cur); - return tot; -} static void usage(void) @@ -163,17 +158,16 @@ int main(int argc, char **argv) { struct pollfd pollfd[2]; - int i, ch; + int ch; u_int burst = 1024, wait_link = 4; - struct my_ring me[2]; + struct nm_desc *pa = NULL, *pb = NULL; char *ifa = NULL, *ifb = NULL; + char ifabuf[64] = { 0 }; - fprintf(stderr, "%s %s built %s %s\n", - argv[0], version, __DATE__, __TIME__); - - bzero(me, sizeof(me)); + fprintf(stderr, "%s built %s %s\n", + argv[0], __DATE__, __TIME__); - while ( (ch = getopt(argc, argv, "b:i:vw:")) != -1) { + while ( (ch = getopt(argc, argv, "b:ci:vw:")) != -1) { switch (ch) { default: D("bad option %c %s", ch, optarg); @@ -191,6 +185,9 @@ main(int argc, char **argv) D("%s ignored, already have 2 interfaces", optarg); break; + case 'c': + zerocopy = 0; /* do not zerocopy */ + break; case 'v': verbose++; break; @@ -224,34 +221,38 @@ main(int argc, char **argv) D("invalid wait_link %d, set to 4", wait_link); wait_link = 4; } - /* setup netmap interface #1. */ - me[0].ifname = ifa; - me[1].ifname = ifb; if (!strcmp(ifa, ifb)) { D("same interface, endpoint 0 goes to host"); - i = NETMAP_SW_RING; + snprintf(ifabuf, sizeof(ifabuf) - 1, "%s^", ifa); + ifa = ifabuf; } else { /* two different interfaces. Take all rings on if1 */ - i = 0; // all hw rings } - if (netmap_open(me, i, 1)) + pa = nm_open(ifa, NULL, 0, NULL); + if (pa == NULL) { + D("cannot open %s", ifa); return (1); - me[1].mem = me[0].mem; /* copy the pointer, so only one mmap */ - if (netmap_open(me+1, 0, 1)) + } + // XXX use a single mmap ? + pb = nm_open(ifb, NULL, NM_OPEN_NO_MMAP, pa); + if (pb == NULL) { + D("cannot open %s", ifb); + nm_close(pa); return (1); + } + zerocopy = zerocopy && (pa->mem == pb->mem); + D("------- zerocopy %ssupported", zerocopy ? "" : "NOT "); /* setup poll(2) variables. */ memset(pollfd, 0, sizeof(pollfd)); - for (i = 0; i < 2; i++) { - pollfd[i].fd = me[i].fd; - pollfd[i].events = (POLLIN); - } + pollfd[0].fd = pa->fd; + pollfd[1].fd = pb->fd; D("Wait %d secs for link to come up...", wait_link); sleep(wait_link); D("Ready to go, %s 0x%x/%d <-> %s 0x%x/%d.", - me[0].ifname, me[0].queueid, me[0].nifp->ni_rx_rings, - me[1].ifname, me[1].queueid, me[1].nifp->ni_rx_rings); + pa->req.nr_name, pa->first_rx_ring, pa->req.nr_rx_rings, + pb->req.nr_name, pb->first_rx_ring, pb->req.nr_rx_rings); /* main loop */ signal(SIGINT, sigint_h); @@ -259,8 +260,8 @@ main(int argc, char **argv) int n0, n1, ret; pollfd[0].events = pollfd[1].events = 0; pollfd[0].revents = pollfd[1].revents = 0; - n0 = pkt_queued(me, 0); - n1 = pkt_queued(me + 1, 0); + n0 = pkt_queued(pa, 0); + n1 = pkt_queued(pb, 0); if (n0) pollfd[1].events |= POLLOUT; else @@ -276,39 +277,41 @@ main(int argc, char **argv) ret <= 0 ? 
"timeout" : "ok", pollfd[0].events, pollfd[0].revents, - pkt_queued(me, 0), - me[0].rx->cur, - pkt_queued(me, 1), + pkt_queued(pa, 0), + NETMAP_RXRING(pa->nifp, pa->cur_rx_ring)->cur, + pkt_queued(pa, 1), pollfd[1].events, pollfd[1].revents, - pkt_queued(me+1, 0), - me[1].rx->cur, - pkt_queued(me+1, 1) + pkt_queued(pb, 0), + NETMAP_RXRING(pb->nifp, pb->cur_rx_ring)->cur, + pkt_queued(pb, 1) ); if (ret < 0) continue; if (pollfd[0].revents & POLLERR) { - D("error on fd0, rxcur %d@%d", - me[0].rx->avail, me[0].rx->cur); + struct netmap_ring *rx = NETMAP_RXRING(pa->nifp, pa->cur_rx_ring); + D("error on fd0, rx [%d,%d,%d)", + rx->head, rx->cur, rx->tail); } if (pollfd[1].revents & POLLERR) { - D("error on fd1, rxcur %d@%d", - me[1].rx->avail, me[1].rx->cur); + struct netmap_ring *rx = NETMAP_RXRING(pb->nifp, pb->cur_rx_ring); + D("error on fd1, rx [%d,%d,%d)", + rx->head, rx->cur, rx->tail); } if (pollfd[0].revents & POLLOUT) { - move(me + 1, me, burst); + move(pb, pa, burst); // XXX we don't need the ioctl */ // ioctl(me[0].fd, NIOCTXSYNC, NULL); } if (pollfd[1].revents & POLLOUT) { - move(me, me + 1, burst); + move(pa, pb, burst); // XXX we don't need the ioctl */ // ioctl(me[1].fd, NIOCTXSYNC, NULL); } } D("exiting"); - netmap_close(me + 1); - netmap_close(me + 0); + nm_close(pb); + nm_close(pa); return (0); } diff --git a/tools/tools/netmap/click-test.cfg b/tools/tools/netmap/click-test.cfg deleted file mode 100644 index fc5759f88..000000000 --- a/tools/tools/netmap/click-test.cfg +++ /dev/null @@ -1,19 +0,0 @@ -// -// $FreeBSD$ -// -// A sample test configuration for click -// -// -// create a switch - -myswitch :: EtherSwitch; - -// two input devices - -c0 :: FromDevice(ix0, PROMISC true); -c1 :: FromDevice(ix1, PROMISC true); - -// and now pass packets around - -c0[0] -> [0]sw[0] -> Queue(10000) -> ToDevice(ix0); -c1[0] -> [1]sw[1] -> Queue(10000) -> ToDevice(ix1); diff --git a/tools/tools/netmap/nm_util.c b/tools/tools/netmap/nm_util.c deleted file mode 100644 index 195b68776..000000000 --- a/tools/tools/netmap/nm_util.c +++ /dev/null @@ -1,244 +0,0 @@ -/* - * Copyright (C) 2012-2013 Luigi Rizzo. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $FreeBSD$ - * $Id$ - * - * utilities to use netmap devices. 
- * This does the basic functions of opening a device and issuing - * ioctls() - */ - -#include "nm_util.h" - -extern int verbose; - -int -nm_do_ioctl(struct my_ring *me, u_long what, int subcmd) -{ - struct ifreq ifr; - int error; -#if defined( __FreeBSD__ ) || defined (__APPLE__) - int fd = me->fd; -#endif -#ifdef linux - struct ethtool_value eval; - int fd; - fd = socket(AF_INET, SOCK_DGRAM, 0); - if (fd < 0) { - printf("Error: cannot get device control socket.\n"); - return -1; - } -#endif /* linux */ - - (void)subcmd; // unused - bzero(&ifr, sizeof(ifr)); - strncpy(ifr.ifr_name, me->ifname, sizeof(ifr.ifr_name)); - switch (what) { - case SIOCSIFFLAGS: -#ifndef __APPLE__ - ifr.ifr_flagshigh = me->if_flags >> 16; -#endif - ifr.ifr_flags = me->if_flags & 0xffff; - break; - -#if defined( __FreeBSD__ ) - case SIOCSIFCAP: - ifr.ifr_reqcap = me->if_reqcap; - ifr.ifr_curcap = me->if_curcap; - break; -#endif -#ifdef linux - case SIOCETHTOOL: - eval.cmd = subcmd; - eval.data = 0; - ifr.ifr_data = (caddr_t)&eval; - break; -#endif /* linux */ - } - error = ioctl(fd, what, &ifr); - if (error) - goto done; - switch (what) { - case SIOCGIFFLAGS: -#ifndef __APPLE__ - me->if_flags = (ifr.ifr_flagshigh << 16) | - (0xffff & ifr.ifr_flags); -#endif - if (verbose) - D("flags are 0x%x", me->if_flags); - break; - -#if defined( __FreeBSD__ ) - case SIOCGIFCAP: - me->if_reqcap = ifr.ifr_reqcap; - me->if_curcap = ifr.ifr_curcap; - if (verbose) - D("curcap are 0x%x", me->if_curcap); - break; -#endif /* __FreeBSD__ */ - } -done: -#ifdef linux - close(fd); -#endif - if (error) - D("ioctl error %d %lu", error, what); - return error; -} - -/* - * open a device. if me->mem is null then do an mmap. - * Returns the file descriptor. - * The extra flag checks configures promisc mode. - */ -int -netmap_open(struct my_ring *me, int ringid, int promisc) -{ - int fd, err, l; - struct nmreq req; - - me->fd = fd = open("/dev/netmap", O_RDWR); - if (fd < 0) { - D("Unable to open /dev/netmap"); - return (-1); - } - bzero(&req, sizeof(req)); - req.nr_version = NETMAP_API; - strncpy(req.nr_name, me->ifname, sizeof(req.nr_name)); - req.nr_ringid = ringid; - err = ioctl(fd, NIOCREGIF, &req); - if (err) { - D("Unable to register %s", me->ifname); - goto error; - } - me->memsize = l = req.nr_memsize; - if (verbose) - D("memsize is %d MB", l>>20); - - if (me->mem == NULL) { - me->mem = mmap(0, l, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0); - if (me->mem == MAP_FAILED) { - D("Unable to mmap"); - me->mem = NULL; - goto error; - } - } - - - /* Set the operating mode. */ - if (ringid != NETMAP_SW_RING) { - nm_do_ioctl(me, SIOCGIFFLAGS, 0); - if ((me[0].if_flags & IFF_UP) == 0) { - D("%s is down, bringing up...", me[0].ifname); - me[0].if_flags |= IFF_UP; - } - if (promisc) { - me[0].if_flags |= IFF_PPROMISC; - nm_do_ioctl(me, SIOCSIFFLAGS, 0); - } - -#ifdef __FreeBSD__ - /* also disable checksums etc. */ - nm_do_ioctl(me, SIOCGIFCAP, 0); - me[0].if_reqcap = me[0].if_curcap; - me[0].if_reqcap &= ~(IFCAP_HWCSUM | IFCAP_TSO | IFCAP_TOE); - nm_do_ioctl(me+0, SIOCSIFCAP, 0); -#endif -#ifdef linux - /* disable: - * - generic-segmentation-offload - * - tcp-segmentation-offload - * - rx-checksumming - * - tx-checksumming - * XXX check how to set back the caps. 
- */ - nm_do_ioctl(me, SIOCETHTOOL, ETHTOOL_SGSO); - nm_do_ioctl(me, SIOCETHTOOL, ETHTOOL_STSO); - nm_do_ioctl(me, SIOCETHTOOL, ETHTOOL_SRXCSUM); - nm_do_ioctl(me, SIOCETHTOOL, ETHTOOL_STXCSUM); -#endif /* linux */ - } - - me->nifp = NETMAP_IF(me->mem, req.nr_offset); - me->queueid = ringid; - if (ringid & NETMAP_SW_RING) { - me->begin = req.nr_rx_rings; - me->end = me->begin + 1; - me->tx = NETMAP_TXRING(me->nifp, req.nr_tx_rings); - me->rx = NETMAP_RXRING(me->nifp, req.nr_rx_rings); - } else if (ringid & NETMAP_HW_RING) { - D("XXX check multiple threads"); - me->begin = ringid & NETMAP_RING_MASK; - me->end = me->begin + 1; - me->tx = NETMAP_TXRING(me->nifp, me->begin); - me->rx = NETMAP_RXRING(me->nifp, me->begin); - } else { - me->begin = 0; - me->end = req.nr_rx_rings; // XXX max of the two - me->tx = NETMAP_TXRING(me->nifp, 0); - me->rx = NETMAP_RXRING(me->nifp, 0); - } - return (0); -error: - close(me->fd); - return -1; -} - - -int -netmap_close(struct my_ring *me) -{ - D(""); - if (me->mem) - munmap(me->mem, me->memsize); - close(me->fd); - return (0); -} - - -/* - * how many packets on this set of queues ? - */ -int -pkt_queued(struct my_ring *me, int tx) -{ - u_int i, tot = 0; - - ND("me %p begin %d end %d", me, me->begin, me->end); - for (i = me->begin; i < me->end; i++) { - struct netmap_ring *ring = tx ? - NETMAP_TXRING(me->nifp, i) : NETMAP_RXRING(me->nifp, i); - tot += ring->avail; - } - if (0 && verbose && tot && !tx) - D("ring %s %s %s has %d avail at %d", - me->ifname, tx ? "tx": "rx", - me->end >= me->nifp->ni_tx_rings ? // XXX who comes first ? - "host":"net", - tot, NETMAP_TXRING(me->nifp, me->begin)->cur); - return tot; -} diff --git a/tools/tools/netmap/nm_util.h b/tools/tools/netmap/nm_util.h deleted file mode 100644 index 0d64f131f..000000000 --- a/tools/tools/netmap/nm_util.h +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (C) 2012 Luigi Rizzo. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -/* - * $FreeBSD$ - * $Id$ - * - * Some utilities to build netmap-based programs. 
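
nm_util.h, whose removal begins just above, mainly bundled debug macros (D, RD, ND), a prefetch helper, pkt_copy() and the struct my_ring state. With NETMAP_WITH_LIBS defined, equivalent helpers (D(), RD(), nm_pkt_copy(), nm_open(), nm_close()) come from net/netmap_user.h directly. A short sketch of the include pattern the updated tools rely on, assuming a NIC named em0:

    #define NETMAP_WITH_LIBS    /* pull in nm_open(), nm_close(), nm_pkt_copy(), D(), RD() */
    #include <net/netmap_user.h>

    int
    main(void)
    {
        struct nm_desc *d = nm_open("netmap:em0", NULL, 0, NULL);

        if (d == NULL) {
            D("cannot open netmap:em0");
            return 1;
        }
        D("%s: %d tx rings, %d rx rings", d->req.nr_name,
            d->req.nr_tx_rings, d->req.nr_rx_rings);
        nm_close(d);
        return 0;
    }
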
- */ - -#ifndef _NM_UTIL_H -#define _NM_UTIL_H -#include -#include /* signal */ -#include -#include -#include /* PRI* macros */ -#include /* strcmp */ -#include /* open */ -#include /* close */ -#include /* getifaddrs */ - -#include /* PROT_* */ -#include /* ioctl */ -#include -#include /* sockaddr.. */ -#include /* ntohs */ -#include -#include /* sysctl */ -#include /* timersub */ - -#include -#include /* ifreq */ - -#include -#include -#include - -#include -#include - -#ifndef MY_PCAP /* use the system's pcap if available */ - -#ifdef NO_PCAP -#define PCAP_ERRBUF_SIZE 512 -typedef void pcap_t; -struct pcap_pkthdr; -#define pcap_inject(a,b,c) ((void)a, (void)b, (void)c, -1) -#define pcap_dispatch(a, b, c, d) (void)c -#define pcap_open_live(a, b, c, d, e) ((void)e, NULL) -#else /* !NO_PCAP */ -#include // XXX do we need it ? -#endif /* !NO_PCAP */ - -#endif // XXX hack - -#include /* pthread_* */ - -#ifdef linux -#define ifr_flagshigh ifr_flags -#define ifr_curcap ifr_flags -#define ifr_reqcap ifr_flags -#define IFF_PPROMISC IFF_PROMISC -#include -#include - -#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME -#include /* ether_aton */ -#include /* sockaddr_ll */ -#endif /* linux */ - -#ifdef __FreeBSD__ -#include /* le64toh */ -#include - -#include /* pthread w/ affinity */ -#include /* cpu_set */ -#include /* LLADDR */ -#endif /* __FreeBSD__ */ - -#ifdef __APPLE__ -#define ifr_flagshigh ifr_flags // XXX -#define IFF_PPROMISC IFF_PROMISC -#include /* LLADDR */ -#define clock_gettime(a,b) \ - do {struct timespec t0 = {0,0}; *(b) = t0; } while (0) -#endif /* __APPLE__ */ - -static inline int min(int a, int b) { return a < b ? a : b; } -extern int time_second; - -/* debug support */ -#define ND(format, ...) do {} while(0) -#define D(format, ...) \ - fprintf(stderr, "%s [%d] " format "\n", \ - __FUNCTION__, __LINE__, ##__VA_ARGS__) - -#define RD(lps, format, ...) \ - do { \ - static int t0, cnt; \ - if (t0 != time_second) { \ - t0 = time_second; \ - cnt = 0; \ - } \ - if (cnt++ < lps) \ - D(format, ##__VA_ARGS__); \ - } while (0) - - - -// XXX does it work on 32-bit machines ? -static inline void prefetch (const void *x) -{ - __asm volatile("prefetcht0 %0" :: "m" (*(const unsigned long *)x)); -} - -// XXX only for multiples of 64 bytes, non overlapped. 
-static inline void -pkt_copy(const void *_src, void *_dst, int l) -{ - const uint64_t *src = _src; - uint64_t *dst = _dst; -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) - if (unlikely(l >= 1024)) { - bcopy(src, dst, l); - return; - } - for (; l > 0; l-=64) { - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - *dst++ = *src++; - } -} - -/* - * info on a ring we handle - */ -struct my_ring { - const char *ifname; - int fd; - char *mem; /* userspace mmap address */ - u_int memsize; - u_int queueid; - u_int begin, end; /* first..last+1 rings to check */ - struct netmap_if *nifp; - struct netmap_ring *tx, *rx; /* shortcuts */ - - uint32_t if_flags; - uint32_t if_reqcap; - uint32_t if_curcap; -}; -int netmap_open(struct my_ring *me, int ringid, int promisc); -int netmap_close(struct my_ring *me); -int nm_do_ioctl(struct my_ring *me, u_long what, int subcmd); -#endif /* _NM_UTIL_H */ diff --git a/tools/tools/netmap/pcap.c b/tools/tools/netmap/pcap.c deleted file mode 100644 index f30f57bf8..000000000 --- a/tools/tools/netmap/pcap.c +++ /dev/null @@ -1,654 +0,0 @@ -/* - * (C) 2011-2012 Luigi Rizzo - * - * BSD license - * - * A simple library that maps some pcap functions onto netmap - * This is not 100% complete but enough to let tcpdump, trafshow - * and other apps work. - * - * $FreeBSD$ - */ - -#define MY_PCAP -#include "nm_util.h" - -char *version = "$Id$"; -int verbose = 0; - -/* - * We redefine here a number of structures that are in pcap.h - * so we can compile this file without the system header. - */ -#ifndef PCAP_ERRBUF_SIZE -#define PCAP_ERRBUF_SIZE 128 -/* - * Each packet is accompanied by a header including the timestamp, - * captured size and actual size. - */ -struct pcap_pkthdr { - struct timeval ts; /* time stamp */ - uint32_t caplen; /* length of portion present */ - uint32_t len; /* length this packet (off wire) */ -}; - -typedef struct pcap_if pcap_if_t; - -/* - * Representation of an interface address. - */ -struct pcap_addr { - struct pcap_addr *next; - struct sockaddr *addr; /* address */ - struct sockaddr *netmask; /* netmask for the above */ - struct sockaddr *broadaddr; /* broadcast addr for the above */ - struct sockaddr *dstaddr; /* P2P dest. address for the above */ -}; - -struct pcap_if { - struct pcap_if *next; - char *name; /* name to hand to "pcap_open_live()" */ - char *description; /* textual description of interface, or NULL */ - struct pcap_addr *addresses; - uint32_t flags; /* PCAP_IF_ interface flags */ -}; - -/* - * We do not support stats (yet) - */ -struct pcap_stat { - u_int ps_recv; /* number of packets received */ - u_int ps_drop; /* number of packets dropped */ - u_int ps_ifdrop; /* drops by interface XXX not yet supported */ -#ifdef WIN32 - u_int bs_capt; /* number of packets that reach the app. 
*/ -#endif /* WIN32 */ -}; - -typedef void pcap_t; -typedef enum { - PCAP_D_INOUT = 0, - PCAP_D_IN, - PCAP_D_OUT -} pcap_direction_t; - - - -typedef void (*pcap_handler)(u_char *user, - const struct pcap_pkthdr *h, const u_char *bytes); - -char errbuf[PCAP_ERRBUF_SIZE]; - -pcap_t *pcap_open_live(const char *device, int snaplen, - int promisc, int to_ms, char *errbuf); - -int pcap_findalldevs(pcap_if_t **alldevsp, char *errbuf); -void pcap_close(pcap_t *p); -int pcap_get_selectable_fd(pcap_t *p); -int pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user); -int pcap_setnonblock(pcap_t *p, int nonblock, char *errbuf); -int pcap_setdirection(pcap_t *p, pcap_direction_t d); -char *pcap_lookupdev(char *errbuf); -int pcap_inject(pcap_t *p, const void *buf, size_t size); -int pcap_fileno(pcap_t *p); -const char *pcap_lib_version(void); - - -struct eproto { - const char *s; - u_short p; -}; -#endif /* !PCAP_ERRBUF_SIZE */ - -#ifndef TEST -/* - * build as a shared library - */ - -char pcap_version[] = "libnetmap version 0.3"; - -/* - * Our equivalent of pcap_t - */ -struct pcap_ring { - struct my_ring me; -#if 0 - const char *ifname; - - //struct nmreq nmr; - - int fd; - char *mem; /* userspace mmap address */ - u_int memsize; - u_int queueid; - u_int begin, end; /* first..last+1 rings to check */ - struct netmap_if *nifp; - - uint32_t if_flags; - uint32_t if_reqcap; - uint32_t if_curcap; -#endif - int snaplen; - char *errbuf; - int promisc; - int to_ms; - - struct pcap_pkthdr hdr; - - - struct pcap_stat st; - - char msg[PCAP_ERRBUF_SIZE]; -}; - - - -/* - * There is a set of functions that tcpdump expects even if probably - * not used - */ -struct eproto eproto_db[] = { - { "ip", ETHERTYPE_IP }, - { "arp", ETHERTYPE_ARP }, - { (char *)0, 0 } -}; - - -const char *pcap_lib_version(void) -{ - return pcap_version; -} - -int -pcap_findalldevs(pcap_if_t **alldevsp, char *errbuf) -{ - pcap_if_t *top = NULL; -#ifndef linux - struct ifaddrs *i_head, *i; - pcap_if_t *cur; - struct pcap_addr *tail = NULL; - int l; - - D("listing all devs"); - *alldevsp = NULL; - i_head = NULL; - - if (getifaddrs(&i_head)) { - D("cannot get if addresses"); - return -1; - } - for (i = i_head; i; i = i->ifa_next) { - //struct ifaddrs *ifa; - struct pcap_addr *pca; - //struct sockaddr *sa; - - D("got interface %s", i->ifa_name); - if (!top || strcmp(top->name, i->ifa_name)) { - /* new interface */ - l = sizeof(*top) + strlen(i->ifa_name) + 1; - cur = calloc(1, l); - if (cur == NULL) { - D("no space for if descriptor"); - continue; - } - cur->name = (char *)(cur + 1); - //cur->flags = i->ifa_flags; - strcpy(cur->name, i->ifa_name); - cur->description = NULL; - cur->next = top; - top = cur; - tail = NULL; - } - /* now deal with addresses */ - D("%s addr family %d len %d %s %s", - top->name, - i->ifa_addr->sa_family, i->ifa_addr->sa_len, - i->ifa_netmask ? "Netmask" : "", - i->ifa_broadaddr ? "Broadcast" : ""); - l = sizeof(struct pcap_addr) + - (i->ifa_addr ? i->ifa_addr->sa_len:0) + - (i->ifa_netmask ? i->ifa_netmask->sa_len:0) + - (i->ifa_broadaddr? 
i->ifa_broadaddr->sa_len:0); - pca = calloc(1, l); - if (pca == NULL) { - D("no space for if addr"); - continue; - } -#define SA_NEXT(x) ((struct sockaddr *)((char *)(x) + (x)->sa_len)) - pca->addr = (struct sockaddr *)(pca + 1); - pkt_copy(i->ifa_addr, pca->addr, i->ifa_addr->sa_len); - if (i->ifa_netmask) { - pca->netmask = SA_NEXT(pca->addr); - bcopy(i->ifa_netmask, pca->netmask, i->ifa_netmask->sa_len); - if (i->ifa_broadaddr) { - pca->broadaddr = SA_NEXT(pca->netmask); - bcopy(i->ifa_broadaddr, pca->broadaddr, i->ifa_broadaddr->sa_len); - } - } - if (tail == NULL) { - top->addresses = pca; - } else { - tail->next = pca; - } - tail = pca; - - } - freeifaddrs(i_head); -#endif /* !linux */ - (void)errbuf; /* UNUSED */ - *alldevsp = top; - return 0; -} - -void pcap_freealldevs(pcap_if_t *alldevs) -{ - (void)alldevs; /* UNUSED */ - D("unimplemented"); -} - -char * -pcap_lookupdev(char *buf) -{ - D("%s", buf); - strcpy(buf, "/dev/netmap"); - return buf; -} - -pcap_t * -pcap_create(const char *source, char *errbuf) -{ - D("src %s (call open liveted)", source); - return pcap_open_live(source, 0, 1, 100, errbuf); -} - -int -pcap_activate(pcap_t *p) -{ - D("pcap %p running", p); - return 0; -} - -int -pcap_can_set_rfmon(pcap_t *p) -{ - (void)p; /* UNUSED */ - D(""); - return 0; /* no we can't */ -} - -int -pcap_set_snaplen(pcap_t *p, int snaplen) -{ - struct pcap_ring *me = p; - - D("len %d", snaplen); - me->snaplen = snaplen; - return 0; -} - -int -pcap_snapshot(pcap_t *p) -{ - struct pcap_ring *me = p; - - D("len %d", me->snaplen); - return me->snaplen; -} - -int -pcap_lookupnet(const char *device, uint32_t *netp, - uint32_t *maskp, char *errbuf) -{ - - (void)errbuf; /* UNUSED */ - D("device %s", device); - inet_aton("10.0.0.255", (struct in_addr *)netp); - inet_aton("255.255.255.0",(struct in_addr *) maskp); - return 0; -} - -int -pcap_set_promisc(pcap_t *p, int promisc) -{ - struct pcap_ring *me = p; - - D("promisc %d", promisc); - if (nm_do_ioctl(&me->me, SIOCGIFFLAGS, 0)) - D("SIOCGIFFLAGS failed"); - if (promisc) { - me->me.if_flags |= IFF_PPROMISC; - } else { - me->me.if_flags &= ~IFF_PPROMISC; - } - if (nm_do_ioctl(&me->me, SIOCSIFFLAGS, 0)) - D("SIOCSIFFLAGS failed"); - return 0; -} - -int -pcap_set_timeout(pcap_t *p, int to_ms) -{ - struct pcap_ring *me = p; - - D("%d ms", to_ms); - me->to_ms = to_ms; - return 0; -} - -struct bpf_program; - -int -pcap_compile(pcap_t *p, struct bpf_program *fp, - const char *str, int optimize, uint32_t netmask) -{ - (void)p; /* UNUSED */ - (void)fp; /* UNUSED */ - (void)optimize; /* UNUSED */ - (void)netmask; /* UNUSED */ - D("%s", str); - return 0; -} - -int -pcap_setfilter(pcap_t *p, struct bpf_program *fp) -{ - (void)p; /* UNUSED */ - (void)fp; /* UNUSED */ - D(""); - return 0; -} - -int -pcap_datalink(pcap_t *p) -{ - (void)p; /* UNUSED */ - D("returns 1"); - return 1; // ethernet -} - -const char * -pcap_datalink_val_to_name(int dlt) -{ - D("%d returns DLT_EN10MB", dlt); - return "DLT_EN10MB"; -} - -const char * -pcap_datalink_val_to_description(int dlt) -{ - D("%d returns Ethernet link", dlt); - return "Ethernet link"; -} - -struct pcap_stat; -int -pcap_stats(pcap_t *p, struct pcap_stat *ps) -{ - struct pcap_ring *me = p; - ND(""); - - *ps = me->st; - return 0; /* accumulate from pcap_dispatch() */ -}; - -char * -pcap_geterr(pcap_t *p) -{ - struct pcap_ring *me = p; - - D(""); - return me->msg; -} - -pcap_t * -pcap_open_live(const char *device, int snaplen, - int promisc, int to_ms, char *errbuf) -{ - struct pcap_ring *me; - int l; - - 
(void)snaplen; /* UNUSED */ - (void)errbuf; /* UNUSED */ - if (!device) { - D("missing device name"); - return NULL; - } - - l = strlen(device) + 1; - D("request to open %s snaplen %d promisc %d timeout %dms", - device, snaplen, promisc, to_ms); - me = calloc(1, sizeof(*me) + l); - if (me == NULL) { - D("failed to allocate struct for %s", device); - return NULL; - } - me->me.ifname = (char *)(me + 1); - strcpy((char *)me->me.ifname, device); - if (netmap_open(&me->me, 0, promisc)) { - D("error opening %s", device); - free(me); - return NULL; - } - me->to_ms = to_ms; - - return (pcap_t *)me; -} - -void -pcap_close(pcap_t *p) -{ - struct my_ring *me = p; - - D(""); - if (!me) - return; - if (me->mem) - munmap(me->mem, me->memsize); - /* restore original flags ? */ - close(me->fd); - bzero(me, sizeof(*me)); - free(me); -} - -int -pcap_fileno(pcap_t *p) -{ - struct my_ring *me = p; - D("returns %d", me->fd); - return me->fd; -} - -int -pcap_get_selectable_fd(pcap_t *p) -{ - struct my_ring *me = p; - - ND(""); - return me->fd; -} - -int -pcap_setnonblock(pcap_t *p, int nonblock, char *errbuf) -{ - (void)p; /* UNUSED */ - (void)errbuf; /* UNUSED */ - D("mode is %d", nonblock); - return 0; /* ignore */ -} - -int -pcap_setdirection(pcap_t *p, pcap_direction_t d) -{ - (void)p; /* UNUSED */ - (void)d; /* UNUSED */ - D(""); - return 0; /* ignore */ -}; - -int -pcap_dispatch(pcap_t *p, int cnt, pcap_handler callback, u_char *user) -{ - struct pcap_ring *pme = p; - struct my_ring *me = &pme->me; - int got = 0; - u_int si; - - ND("cnt %d", cnt); - if (cnt == 0) - cnt = -1; - /* scan all rings */ - for (si = me->begin; si < me->end; si++) { - struct netmap_ring *ring = NETMAP_RXRING(me->nifp, si); - ND("ring has %d pkts", ring->avail); - if (ring->avail == 0) - continue; - pme->hdr.ts = ring->ts; - /* - * XXX a proper prefetch should be done as - * prefetch(i); callback(i-1); ... 
- */ - while ((cnt == -1 || cnt != got) && ring->avail > 0) { - u_int i = ring->cur; - u_int idx = ring->slot[i].buf_idx; - if (idx < 2) { - D("%s bogus RX index %d at offset %d", - me->nifp->ni_name, idx, i); - sleep(2); - } - u_char *buf = (u_char *)NETMAP_BUF(ring, idx); - prefetch(buf); - pme->hdr.len = pme->hdr.caplen = ring->slot[i].len; - // D("call %p len %d", p, me->hdr.len); - callback(user, &pme->hdr, buf); - ring->cur = NETMAP_RING_NEXT(ring, i); - ring->avail--; - got++; - } - } - pme->st.ps_recv += got; - return got; -} - -int -pcap_inject(pcap_t *p, const void *buf, size_t size) -{ - struct my_ring *me = p; - u_int si; - - ND("cnt %d", cnt); - /* scan all rings */ - for (si = me->begin; si < me->end; si++) { - struct netmap_ring *ring = NETMAP_TXRING(me->nifp, si); - - ND("ring has %d pkts", ring->avail); - if (ring->avail == 0) - continue; - u_int i = ring->cur; - u_int idx = ring->slot[i].buf_idx; - if (idx < 2) { - D("%s bogus TX index %d at offset %d", - me->nifp->ni_name, idx, i); - sleep(2); - } - u_char *dst = (u_char *)NETMAP_BUF(ring, idx); - ring->slot[i].len = size; - pkt_copy(buf, dst, size); - ring->cur = NETMAP_RING_NEXT(ring, i); - ring->avail--; - // if (ring->avail == 0) ioctl(me->fd, NIOCTXSYNC, NULL); - return size; - } - errno = ENOBUFS; - return -1; -} - -int -pcap_loop(pcap_t *p, int cnt, pcap_handler callback, u_char *user) -{ - struct pcap_ring *me = p; - struct pollfd fds[1]; - int i; - - ND("cnt %d", cnt); - memset(fds, 0, sizeof(fds)); - fds[0].fd = me->me.fd; - fds[0].events = (POLLIN); - - while (cnt == -1 || cnt > 0) { - if (poll(fds, 1, me->to_ms) <= 0) { - D("poll error/timeout"); - continue; - } - i = pcap_dispatch(p, cnt, callback, user); - if (cnt > 0) - cnt -= i; - } - return 0; -} - -#endif /* !TEST */ - -#ifdef TEST /* build test code */ -void do_send(u_char *user, const struct pcap_pkthdr *h, const u_char *buf) -{ - pcap_inject((pcap_t *)user, buf, h->caplen); -} - -/* - * a simple pcap test program, bridge between two interfaces. - */ -int -main(int argc, char **argv) -{ - pcap_t *p0, *p1; - int burst = 1024; - struct pollfd pollfd[2]; - - fprintf(stderr, "%s %s built %s %s\n", - argv[0], version, __DATE__, __TIME__); - - while (argc > 1 && !strcmp(argv[1], "-v")) { - verbose++; - argv++; - argc--; - } - - if (argc < 3 || argc > 4 || !strcmp(argv[1], argv[2])) { - D("Usage: %s IFNAME1 IFNAME2 [BURST]", argv[0]); - return (1); - } - if (argc > 3) - burst = atoi(argv[3]); - - p0 = pcap_open_live(argv[1], 0, 1, 100, NULL); - p1 = pcap_open_live(argv[2], 0, 1, 100, NULL); - D("%s", version); - D("open returns %p %p", p0, p1); - if (!p0 || !p1) - return(1); - bzero(pollfd, sizeof(pollfd)); - pollfd[0].fd = pcap_fileno(p0); - pollfd[1].fd = pcap_fileno(p1); - pollfd[0].events = pollfd[1].events = POLLIN; - for (;;) { - /* do i need to reset ? */ - pollfd[0].revents = pollfd[1].revents = 0; - int ret = poll(pollfd, 2, 1000); - if (ret <= 0 || verbose) - D("poll %s [0] ev %x %x [1] ev %x %x", - ret <= 0 ? 
"timeout" : "ok", - pollfd[0].events, - pollfd[0].revents, - pollfd[1].events, - pollfd[1].revents); - if (ret < 0) - continue; - if (pollfd[0].revents & POLLIN) - pcap_dispatch(p0, burst, do_send, p1); - if (pollfd[1].revents & POLLIN) - pcap_dispatch(p1, burst, do_send, p0); - } - - return (0); -} -#endif /* TEST */ diff --git a/tools/tools/netmap/pkt-gen.c b/tools/tools/netmap/pkt-gen.c index 82b57c43e..8e78fa8e2 100644 --- a/tools/tools/netmap/pkt-gen.c +++ b/tools/tools/netmap/pkt-gen.c @@ -1,5 +1,6 @@ /* - * Copyright (C) 2011-2012 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved. + * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -25,7 +26,7 @@ /* * $FreeBSD$ - * $Id: pkt-gen.c 12024 2013-01-25 05:41:51Z luigi $ + * $Id: pkt-gen.c 12346 2013-06-12 17:36:25Z luigi $ * * Example program to show how to build a multithreaded packet * source/sink using the netmap device. @@ -36,9 +37,73 @@ * */ -#include "nm_util.h" +#define _GNU_SOURCE /* for CPU_SET() */ +#include +#define NETMAP_WITH_LIBS +#include + #include // isprint() +#include // sysconf() +#include +#include /* ntohs */ +#include /* sysctl */ +#include /* getifaddrs */ +#include +#include +#include +#include + +#include + +#ifndef NO_PCAP +#include +#endif + +#ifdef linux + +#define cpuset_t cpu_set_t + +#define ifr_flagshigh ifr_flags /* only the low 16 bits here */ +#define IFF_PPROMISC IFF_PROMISC /* IFF_PPROMISC does not exist */ +#include +#include + +#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME +#include /* ether_aton */ +#include /* sockaddr_ll */ +#endif /* linux */ + +#ifdef __FreeBSD__ +#include /* le64toh */ +#include + +#include /* pthread w/ affinity */ +#include /* cpu_set */ +#include /* LLADDR */ +#endif /* __FreeBSD__ */ + +#ifdef __APPLE__ + +#define cpuset_t uint64_t // XXX +static inline void CPU_ZERO(cpuset_t *p) +{ + *p = 0; +} + +static inline void CPU_SET(uint32_t i, cpuset_t *p) +{ + *p |= 1<< (i & 0x3f); +} + +#define pthread_setaffinity_np(a, b, c) ((void)a, 0) + +#define ifr_flagshigh ifr_flags // XXX +#define IFF_PPROMISC IFF_PROMISC +#include /* LLADDR */ +#define clock_gettime(a,b) \ + do {struct timespec t0 = {0,0}; *(b) = t0; } while (0) +#endif /* __APPLE__ */ const char *default_payload="netmap pkt-gen DIRECT payload\n" "http://info.iet.unipi.it/~luigi/netmap/ "; @@ -46,13 +111,20 @@ const char *default_payload="netmap pkt-gen DIRECT payload\n" const char *indirect_payload="netmap pkt-gen indirect payload\n" "http://info.iet.unipi.it/~luigi/netmap/ "; -int time_second; // support for RD() debugging macro - int verbose = 0; -#define SKIP_PAYLOAD 1 /* do not check payload. */ +#define SKIP_PAYLOAD 1 /* do not check payload. 
XXX unused */ + + +#define VIRT_HDR_1 10 /* length of a base vnet-hdr */ +#define VIRT_HDR_2 12 /* length of the extenede vnet-hdr */ +#define VIRT_HDR_MAX VIRT_HDR_2 +struct virt_header { + uint8_t fields[VIRT_HDR_MAX]; +}; struct pkt { + struct virt_header vh; struct ether_header eh; struct ip ip; struct udphdr udp; @@ -70,6 +142,8 @@ struct mac_range { struct ether_addr start, end; }; +/* ifname can be netmap:foo-xxxx */ +#define MAX_IFNAMELEN 64 /* our buffer for ifname */ /* * global arguments for all threads */ @@ -95,20 +169,25 @@ struct glob_arg { #define OPT_INDIRECT 32 /* use indirect buffers, tx only */ #define OPT_DUMP 64 /* dump rx/tx traffic */ int dev_type; +#ifndef NO_PCAP pcap_t *p; +#endif int tx_rate; struct timespec tx_period; int affinity; int main_fd; - int report_interval; + struct nm_desc *nmd; + uint64_t nmd_flags; + int report_interval; /* milliseconds between prints */ void *(*td_body)(void *); void *mmap_addr; - int mmap_size; - char *ifname; + char ifname[MAX_IFNAMELEN]; char *nmr_config; int dummy_send; + int virt_header; /* send also the virt_header */ + int extra_bufs; /* goes in nr_arg3 */ }; enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP }; @@ -123,9 +202,7 @@ struct targ { int completed; int cancel; int fd; - struct nmreq nmr; - struct netmap_if *nifp; - uint16_t qfirst, qlast; /* range of queues to scan */ + struct nm_desc *nmd; volatile uint64_t count; struct timespec tic, toc; int me; @@ -146,7 +223,8 @@ extract_ip_range(struct ip_range *r) char *ap, *pp; struct in_addr a; - D("extract IP range from %s", r->name); + if (verbose) + D("extract IP range from %s", r->name); r->port0 = r->port1 = 0; r->start = r->end = 0; @@ -167,7 +245,7 @@ extract_ip_range(struct ip_range *r) pp = index(ap, ':'); if (pp) { *pp++ = '\0'; - if (*pp) + if (*pp) r->port1 = strtol(pp, NULL, 0); } if (*ap) { @@ -192,7 +270,8 @@ extract_ip_range(struct ip_range *r) a.s_addr = htonl(r->end); strncpy(buf1, inet_ntoa(a), sizeof(buf1)); a.s_addr = htonl(r->start); - D("range is %s:%d to %s:%d", + if (1) + D("range is %s:%d to %s:%d", inet_ntoa(a), r->port0, buf1, r->port1); } } @@ -200,7 +279,8 @@ extract_ip_range(struct ip_range *r) static void extract_mac_range(struct mac_range *r) { - D("extract MAC range from %s", r->name); + if (verbose) + D("extract MAC range from %s", r->name); bcopy(ether_aton(r->name), &r->start, 6); bcopy(ether_aton(r->name), &r->end, 6); #if 0 @@ -215,7 +295,8 @@ extract_mac_range(struct mac_range *r) if (p) targ->dst_mac_range = atoi(p+1); #endif - D("%s starts at %s", r->name, ether_ntoa(&r->start)); + if (verbose) + D("%s starts at %s", r->name, ether_ntoa(&r->start)); } static struct targ *targs; @@ -238,19 +319,17 @@ sigint_h(int sig) static int system_ncpus(void) { -#ifdef __FreeBSD__ - int mib[2], ncpus; - size_t len; - - mib[0] = CTL_HW; - mib[1] = HW_NCPU; - len = sizeof(mib); + int ncpus; +#if defined (__FreeBSD__) + int mib[2] = { CTL_HW, HW_NCPU }; + size_t len = sizeof(mib); sysctl(mib, 2, &ncpus, &len, NULL, 0); - +#elif defined(linux) + ncpus = sysconf(_SC_NPROCESSORS_ONLN); +#else /* others */ + ncpus = 1; +#endif /* others */ return (ncpus); -#else - return 1; -#endif /* !__FreeBSD__ */ } #ifdef __linux__ @@ -276,15 +355,17 @@ system_ncpus(void) /* * parse the vale configuration in conf and put it in nmr. + * Return the flag set if necessary. * The configuration may consist of 0 to 4 numbers separated - * by commas: #tx-slots,#rx-slots,#tx-rinzgs,#rx-rings. + * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings. 
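
As a concrete sketch of the format described in this comment, a few hypothetical -C strings and how parse_nmr_config() (defined just below) fills a struct nmreq req:

    parse_nmr_config("1024", &req);        /* 1024 slots on both tx and rx rings */
    parse_nmr_config("2048,512,4", &req);  /* 2048 tx slots, 512 rx slots, 4 tx and 4 rx rings */
    parse_nmr_config("0,0,2,8", &req);     /* default slot counts, 2 tx rings, 8 rx rings */
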
* Missing numbers or zeroes stand for default values. * As an additional convenience, if exactly one number - * is specified, then this is assigned to bot #tx-slots and #rx-slots. - * If there is no 4th number, then the 3rd is assigned to bot #tx-rings + * is specified, then this is assigned to both #tx-slots and #rx-slots. + * If there is no 4th number, then the 3rd is assigned to both #tx-rings * and #rx-rings. */ -void parse_nmr_config(const char* conf, struct nmreq *nmr) +int +parse_nmr_config(const char* conf, struct nmreq *nmr) { char *w, *tok; int i, v; @@ -292,7 +373,7 @@ void parse_nmr_config(const char* conf, struct nmreq *nmr) nmr->nr_tx_rings = nmr->nr_rx_rings = 0; nmr->nr_tx_slots = nmr->nr_rx_slots = 0; if (conf == NULL || ! *conf) - return; + return 0; w = strdup(conf); for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) { v = atoi(tok); @@ -318,6 +399,9 @@ void parse_nmr_config(const char* conf, struct nmreq *nmr) nmr->nr_tx_rings, nmr->nr_tx_slots, nmr->nr_rx_rings, nmr->nr_rx_slots); free(w); + return (nmr->nr_tx_rings || nmr->nr_tx_slots || + nmr->nr_rx_rings || nmr->nr_rx_slots) ? + NM_OPEN_RING_CFG : 0; } @@ -362,7 +446,6 @@ source_hwaddr(const char *ifname, char *buf) static int setaffinity(pthread_t me, int i) { -#ifdef __FreeBSD__ cpuset_t cpumask; if (i == -1) @@ -373,13 +456,9 @@ setaffinity(pthread_t me, int i) CPU_SET(i, &cpumask); if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) { - D("Unable to set affinity"); + D("Unable to set affinity: %s", strerror(errno)); return 1; } -#else - (void)me; /* suppress 'unused' warnings */ - (void)i; -#endif /* __FreeBSD__ */ return 0; } @@ -426,7 +505,7 @@ dump_payload(char *p, int len, struct netmap_ring *ring, int cur) int i, j, i0; /* get the length in ASCII of the length of the packet. */ - + printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n", ring, cur, ring->slot[cur].buf_idx, ring->slot[cur].flags, len); @@ -470,17 +549,18 @@ update_addresses(struct pkt *pkt, struct glob_arg *g) struct ip *ip = &pkt->ip; struct udphdr *udp = &pkt->udp; + do { p = ntohs(udp->uh_sport); if (p < g->src_ip.port1) { /* just inc, no wrap */ udp->uh_sport = htons(p + 1); - return; + break; } udp->uh_sport = htons(g->src_ip.port0); a = ntohl(ip->ip_src.s_addr); if (a < g->src_ip.end) { /* just inc, no wrap */ ip->ip_src.s_addr = htonl(a + 1); - return; + break; } ip->ip_src.s_addr = htonl(g->src_ip.start); @@ -488,17 +568,18 @@ update_addresses(struct pkt *pkt, struct glob_arg *g) p = ntohs(udp->uh_dport); if (p < g->dst_ip.port1) { /* just inc, no wrap */ udp->uh_dport = htons(p + 1); - return; + break; } udp->uh_dport = htons(g->dst_ip.port0); a = ntohl(ip->ip_dst.s_addr); if (a < g->dst_ip.end) { /* just inc, no wrap */ ip->ip_dst.s_addr = htonl(a + 1); - return; + break; } ip->ip_dst.s_addr = htonl(g->dst_ip.start); - + } while (0); + // update checksum } /* @@ -515,13 +596,13 @@ initialize_packet(struct targ *targ) uint16_t paylen = targ->g->pkt_size - sizeof(*eh) - sizeof(struct ip); const char *payload = targ->g->options & OPT_INDIRECT ? 
indirect_payload : default_payload; - int i, l, l0 = strlen(payload); + int i, l0 = strlen(payload); /* create a nice NUL-terminated string */ - for (i = 0; i < paylen;) { - l = min(l0, paylen - i); - bcopy(payload, pkt->body + i, l); - i += l; + for (i = 0; i < paylen; i += l0) { + if (l0 > paylen - i) + l0 = paylen - i; // last round + bcopy(payload, pkt->body + i, l0); } pkt->body[i-1] = '\0'; ip = &pkt->ip; @@ -559,6 +640,8 @@ initialize_packet(struct targ *targ) bcopy(&targ->g->src_mac.start, eh->ether_shost, 6); bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6); eh->ether_type = htons(ETHERTYPE_IP); + + bzero(&pkt->vh, sizeof(pkt->vh)); // dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0); } @@ -570,18 +653,19 @@ initialize_packet(struct targ *targ) * an interrupt when done. */ static int -send_packets(struct netmap_ring *ring, struct pkt *pkt, - struct glob_arg *g, u_int count, int options, u_int nfrags) +send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame, + int size, struct glob_arg *g, u_int count, int options, + u_int nfrags) { - u_int sent, cur = ring->cur; - int fcnt; - int size = g->pkt_size; + u_int n, sent, cur = ring->cur; + u_int fcnt; - if (ring->avail < count) - count = ring->avail; + n = nm_ring_space(ring); + if (n < count) + count = n; if (count < nfrags) { D("truncating packet, no room for frags %d %d", - count, nfrags); + count, nfrags); } #if 0 if (options & (OPT_COPY | OPT_PREFETCH) ) { @@ -589,8 +673,8 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt, struct netmap_slot *slot = &ring->slot[cur]; char *p = NETMAP_BUF(ring, slot->buf_idx); - prefetch(p); - cur = NETMAP_RING_NEXT(ring, cur); + __builtin_prefetch(p); + cur = nm_ring_next(ring, cur); } cur = ring->cur; } @@ -602,17 +686,17 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt, slot->flags = 0; if (options & OPT_INDIRECT) { slot->flags |= NS_INDIRECT; - slot->ptr = (uint64_t)pkt; + slot->ptr = (uint64_t)frame; } else if (options & OPT_COPY) { - pkt_copy(pkt, p, size); - if (fcnt == 1) + nm_pkt_copy(frame, p, size); + if (fcnt == nfrags) update_addresses(pkt, g); } else if (options & OPT_MEMCPY) { - memcpy(p, pkt, size); - if (fcnt == 1) + memcpy(p, frame, size); + if (fcnt == nfrags) update_addresses(pkt, g); } else if (options & OPT_PREFETCH) { - prefetch(p); + __builtin_prefetch(p); } if (options & OPT_DUMP) dump_payload(p, size, ring, cur); @@ -625,10 +709,9 @@ send_packets(struct netmap_ring *ring, struct pkt *pkt, slot->flags &= ~NS_MOREFRAG; slot->flags |= NS_REPORT; } - cur = NETMAP_RING_NEXT(ring, cur); + cur = nm_ring_next(ring, cur); } - ring->avail -= sent; - ring->cur = cur; + ring->head = ring->cur = cur; return (sent); } @@ -644,52 +727,58 @@ static void * pinger_body(void *data) { struct targ *targ = (struct targ *) data; - struct pollfd fds[1]; - struct netmap_if *nifp = targ->nifp; + struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; + struct netmap_if *nifp = targ->nmd->nifp; int i, rx = 0, n = targ->g->npackets; - - fds[0].fd = targ->fd; - fds[0].events = (POLLIN); - static uint32_t sent; + void *frame; + int size; + uint32_t sent = 0; struct timespec ts, now, last_print; uint32_t count = 0, min = 1000000000, av = 0; + frame = &targ->pkt; + frame += sizeof(targ->pkt.vh) - targ->g->virt_header; + size = targ->g->pkt_size + targ->g->virt_header; + + if (targ->g->nthreads > 1) { D("can only ping with 1 thread"); return NULL; } clock_gettime(CLOCK_REALTIME_PRECISE, &last_print); + now = last_print; while (n == 0 || (int)sent < n) { struct 
netmap_ring *ring = NETMAP_TXRING(nifp, 0); struct netmap_slot *slot; char *p; - for (i = 0; i < 1; i++) { + for (i = 0; i < 1; i++) { /* XXX why the loop for 1 pkt ? */ slot = &ring->slot[ring->cur]; - slot->len = targ->g->pkt_size; + slot->len = size; p = NETMAP_BUF(ring, slot->buf_idx); - if (ring->avail == 0) { + if (nm_ring_empty(ring)) { D("-- ouch, cannot send"); } else { - pkt_copy(&targ->pkt, p, targ->g->pkt_size); + nm_pkt_copy(frame, p, size); clock_gettime(CLOCK_REALTIME_PRECISE, &ts); bcopy(&sent, p+42, sizeof(sent)); bcopy(&ts, p+46, sizeof(ts)); sent++; - ring->cur = NETMAP_RING_NEXT(ring, ring->cur); - ring->avail--; + ring->head = ring->cur = nm_ring_next(ring, ring->cur); } } /* should use a parameter to decide how often to send */ - if (poll(fds, 1, 3000) <= 0) { - D("poll error/timeout on queue %d", targ->me); + if (poll(&pfd, 1, 3000) <= 0) { + D("poll error/timeout on queue %d: %s", targ->me, + strerror(errno)); continue; } /* see what we got back */ - for (i = targ->qfirst; i < targ->qlast; i++) { + for (i = targ->nmd->first_tx_ring; + i <= targ->nmd->last_tx_ring; i++) { ring = NETMAP_RXRING(nifp, i); - while (ring->avail > 0) { + while (!nm_ring_empty(ring)) { uint32_t seq; slot = &ring->slot[ring->cur]; p = NETMAP_BUF(ring, slot->buf_idx); @@ -709,8 +798,7 @@ pinger_body(void *data) min = ts.tv_nsec; count ++; av += ts.tv_nsec; - ring->avail--; - ring->cur = NETMAP_RING_NEXT(ring, ring->cur); + ring->head = ring->cur = nm_ring_next(ring, ring->cur); rx++; } } @@ -742,12 +830,10 @@ static void * ponger_body(void *data) { struct targ *targ = (struct targ *) data; - struct pollfd fds[1]; - struct netmap_if *nifp = targ->nifp; + struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; + struct netmap_if *nifp = targ->nmd->nifp; struct netmap_ring *txring, *rxring; int i, rx = 0, sent = 0, n = targ->g->npackets; - fds[0].fd = targ->fd; - fds[0].events = (POLLIN); if (targ->g->nthreads > 1) { D("can only reply ping with 1 thread"); @@ -758,28 +844,28 @@ ponger_body(void *data) uint32_t txcur, txavail; //#define BUSYWAIT #ifdef BUSYWAIT - ioctl(fds[0].fd, NIOCRXSYNC, NULL); + ioctl(pfd.fd, NIOCRXSYNC, NULL); #else - if (poll(fds, 1, 1000) <= 0) { - D("poll error/timeout on queue %d", targ->me); + if (poll(&pfd, 1, 1000) <= 0) { + D("poll error/timeout on queue %d: %s", targ->me, + strerror(errno)); continue; } #endif txring = NETMAP_TXRING(nifp, 0); txcur = txring->cur; - txavail = txring->avail; + txavail = nm_ring_space(txring); /* see what we got back */ - for (i = targ->qfirst; i < targ->qlast; i++) { + for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) { rxring = NETMAP_RXRING(nifp, i); - while (rxring->avail > 0) { + while (!nm_ring_empty(rxring)) { uint16_t *spkt, *dpkt; uint32_t cur = rxring->cur; struct netmap_slot *slot = &rxring->slot[cur]; char *src, *dst; src = NETMAP_BUF(rxring, slot->buf_idx); //D("got pkt %p of size %d", src, slot->len); - rxring->avail--; - rxring->cur = NETMAP_RING_NEXT(rxring, cur); + rxring->head = rxring->cur = nm_ring_next(rxring, cur); rx++; if (txavail == 0) continue; @@ -788,7 +874,7 @@ ponger_body(void *data) /* copy... 
*/ dpkt = (uint16_t *)dst; spkt = (uint16_t *)src; - pkt_copy(src, dst, slot->len); + nm_pkt_copy(src, dst, slot->len); dpkt[0] = spkt[3]; dpkt[1] = spkt[4]; dpkt[2] = spkt[5]; @@ -797,16 +883,15 @@ ponger_body(void *data) dpkt[5] = spkt[2]; txring->slot[txcur].len = slot->len; /* XXX swap src dst mac */ - txcur = NETMAP_RING_NEXT(txring, txcur); + txcur = nm_ring_next(txring, txcur); txavail--; sent++; } } - txring->cur = txcur; - txring->avail = txavail; + txring->head = txring->cur = txcur; targ->count = sent; #ifdef BUSYWAIT - ioctl(fds[0].fd, NIOCTXSYNC, NULL); + ioctl(pfd.fd, NIOCTXSYNC, NULL); #endif //D("tx %d rx %d", sent, rx); } @@ -847,85 +932,84 @@ timespec2val(const struct timespec *a) } -static int -wait_time(struct timespec ts, struct timespec *wakeup_ts, long long *waited) +static __inline struct timespec +timespec_add(struct timespec a, struct timespec b) { - struct timespec curtime; - - curtime.tv_sec = 0; - curtime.tv_nsec = 0; - - if (clock_gettime(CLOCK_REALTIME_PRECISE, &curtime) == -1) { - D("clock_gettime: %s", strerror(errno)); - return (-1); - } - while (timespec_ge(&ts, &curtime)) { - if (waited != NULL) - (*waited)++; - if (clock_gettime(CLOCK_REALTIME_PRECISE, &curtime) == -1) { - D("clock_gettime"); - return (-1); - } + struct timespec ret = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec }; + if (ret.tv_nsec >= 1000000000) { + ret.tv_sec++; + ret.tv_nsec -= 1000000000; } - if (wakeup_ts != NULL) - *wakeup_ts = curtime; - return (0); + return ret; } -static __inline void -timespec_add(struct timespec *tsa, struct timespec *tsb) +static __inline struct timespec +timespec_sub(struct timespec a, struct timespec b) { - tsa->tv_sec += tsb->tv_sec; - tsa->tv_nsec += tsb->tv_nsec; - if (tsa->tv_nsec >= 1000000000) { - tsa->tv_sec++; - tsa->tv_nsec -= 1000000000; + struct timespec ret = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec }; + if (ret.tv_nsec < 0) { + ret.tv_sec--; + ret.tv_nsec += 1000000000; } + return ret; } +/* + * wait until ts, either busy or sleeping if more than 1ms. + * Return wakeup time. + */ +static struct timespec +wait_time(struct timespec ts) +{ + for (;;) { + struct timespec w, cur; + clock_gettime(CLOCK_REALTIME_PRECISE, &cur); + w = timespec_sub(ts, cur); + if (w.tv_sec < 0) + return cur; + else if (w.tv_sec > 0 || w.tv_nsec > 1000000) + poll(NULL, 0, 1); + } +} + static void * sender_body(void *data) { struct targ *targ = (struct targ *) data; - - struct pollfd fds[1]; - struct netmap_if *nifp = targ->nifp; + struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT }; + struct netmap_if *nifp = targ->nmd->nifp; struct netmap_ring *txring; - int i, n = targ->g->npackets / targ->g->nthreads, sent = 0; + int i, n = targ->g->npackets / targ->g->nthreads; + int64_t sent = 0; int options = targ->g->options | OPT_COPY; - struct timespec tmptime, nexttime = { 0, 0}; // XXX silence compiler + struct timespec nexttime = { 0, 0}; // XXX silence compiler int rate_limit = targ->g->tx_rate; - long long waited = 0; + struct pkt *pkt = &targ->pkt; + void *frame; + int size; + + frame = pkt; + frame += sizeof(pkt->vh) - targ->g->virt_header; + size = targ->g->pkt_size + targ->g->virt_header; D("start"); if (setaffinity(targ->thread, targ->affinity)) goto quit; - /* setup poll(2) mechanism. 
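
The value-returning timespec_add()/timespec_sub() above, together with the new wait_time() (which busy-waits for sub-millisecond intervals and otherwise sleeps via poll(NULL, 0, 1)), drive the -R rate limiter in sender_body(). A compressed sketch of the pacing loop, with send_burst() as a hypothetical stand-in for the real per-ring transmit path:

    struct timespec next, period = { 0, 0 };
    long rate = 1000000, burst = 512;              /* as set by -R and -b */

    period.tv_nsec = (long)(1e9 * burst / rate);   /* time budget for one burst */
    clock_gettime(CLOCK_REALTIME_PRECISE, &next);
    for (;;) {
        next = timespec_add(next, period);         /* when the next burst is due */
        wait_time(next);                           /* spin or sleep until then */
        send_burst(burst);                         /* hypothetical TX routine */
    }
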
*/ - memset(fds, 0, sizeof(fds)); - fds[0].fd = targ->fd; - fds[0].events = (POLLOUT); /* main loop.*/ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); if (rate_limit) { - tmptime.tv_sec = 2; - tmptime.tv_nsec = 0; - timespec_add(&targ->tic, &tmptime); + targ->tic = timespec_add(targ->tic, (struct timespec){2,0}); targ->tic.tv_nsec = 0; - if (wait_time(targ->tic, NULL, NULL) == -1) { - D("wait_time: %s", strerror(errno)); - goto quit; - } + wait_time(targ->tic); nexttime = targ->tic; } - if (targ->g->dev_type == DEV_PCAP) { - int size = targ->g->pkt_size; - void *pkt = &targ->pkt; - pcap_t *p = targ->g->p; + if (targ->g->dev_type == DEV_TAP) { + D("writing to file desc %d", targ->g->main_fd); for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) { - if (pcap_inject(p, pkt, size) != -1) + if (write(targ->g->main_fd, frame, size) != -1) sent++; update_addresses(pkt, targ->g); if (i > 10000) { @@ -933,13 +1017,12 @@ sender_body(void *data) i = 0; } } - } else if (targ->g->dev_type == DEV_TAP) { /* tap */ - int size = targ->g->pkt_size; - void *pkt = &targ->pkt; - D("writing to file desc %d", targ->g->main_fd); +#ifndef NO_PCAP + } else if (targ->g->dev_type == DEV_PCAP) { + pcap_t *p = targ->g->p; for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) { - if (write(targ->g->main_fd, pkt, size) != -1) + if (pcap_inject(p, frame, size) != -1) sent++; update_addresses(pkt, targ->g); if (i > 10000) { @@ -947,6 +1030,7 @@ sender_body(void *data) i = 0; } } +#endif /* NO_PCAP */ } else { int tosend = 0; int frags = targ->g->frags; @@ -955,20 +1039,22 @@ sender_body(void *data) if (rate_limit && tosend <= 0) { tosend = targ->g->burst; - timespec_add(&nexttime, &targ->g->tx_period); - if (wait_time(nexttime, &tmptime, &waited) == -1) { - D("wait_time"); - goto quit; - } + nexttime = timespec_add(nexttime, targ->g->tx_period); + wait_time(nexttime); } /* * wait for available room in the send queue(s) */ - if (poll(fds, 1, 2000) <= 0) { + if (poll(&pfd, 1, 2000) <= 0) { if (targ->cancel) break; - D("poll error/timeout on queue %d", targ->me); + D("poll error/timeout on queue %d: %s", targ->me, + strerror(errno)); + // goto quit; + } + if (pfd.revents & POLLERR) { + D("poll error"); goto quit; } /* @@ -978,20 +1064,20 @@ sender_body(void *data) D("drop copy"); options &= ~OPT_COPY; } - for (i = targ->qfirst; i < targ->qlast; i++) { + for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { int m, limit = rate_limit ? tosend : targ->g->burst; if (n > 0 && n - sent < limit) limit = n - sent; txring = NETMAP_TXRING(nifp, i); - if (txring->avail == 0) + if (nm_ring_empty(txring)) continue; if (frags > 1) limit = ((limit + frags - 1) / frags) * frags; - - m = send_packets(txring, &targ->pkt, targ->g, + + m = send_packets(txring, pkt, frame, size, targ->g, limit, options, frags); - ND("limit %d avail %d frags %d m %d", - limit, txring->avail, frags, m); + ND("limit %d tail %d frags %d m %d", + limit, txring->tail, frags, m); sent += m; targ->count = sent; if (rate_limit) { @@ -1002,17 +1088,17 @@ sender_body(void *data) } } /* flush any remaining packets */ - ioctl(fds[0].fd, NIOCTXSYNC, NULL); + ioctl(pfd.fd, NIOCTXSYNC, NULL); /* final part: wait all the TX queues to be empty. 
*/ - for (i = targ->qfirst; i < targ->qlast; i++) { + for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) { txring = NETMAP_TXRING(nifp, i); - while (!NETMAP_TX_RING_EMPTY(txring)) { - ioctl(fds[0].fd, NIOCTXSYNC, NULL); + while (nm_tx_pending(txring)) { + ioctl(pfd.fd, NIOCTXSYNC, NULL); usleep(1); /* wait 1 tick */ } } - } + } /* end DEV_NETMAP */ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->completed = 1; @@ -1026,6 +1112,7 @@ quit: } +#ifndef NO_PCAP static void receive_pcap(u_char *user, const struct pcap_pkthdr * h, const u_char * bytes) @@ -1035,15 +1122,17 @@ receive_pcap(u_char *user, const struct pcap_pkthdr * h, (void)bytes; /* UNUSED */ (*count)++; } +#endif /* !NO_PCAP */ static int receive_packets(struct netmap_ring *ring, u_int limit, int dump) { - u_int cur, rx; + u_int cur, rx, n; cur = ring->cur; - if (ring->avail < limit) - limit = ring->avail; + n = nm_ring_space(ring); + if (n < limit) + limit = n; for (rx = 0; rx < limit; rx++) { struct netmap_slot *slot = &ring->slot[cur]; char *p = NETMAP_BUF(ring, slot->buf_idx); @@ -1051,10 +1140,9 @@ receive_packets(struct netmap_ring *ring, u_int limit, int dump) if (dump) dump_payload(p, slot->len, ring, cur); - cur = NETMAP_RING_NEXT(ring, cur); + cur = nm_ring_next(ring, cur); } - ring->avail -= rx; - ring->cur = cur; + ring->head = ring->cur = cur; return (rx); } @@ -1063,8 +1151,8 @@ static void * receiver_body(void *data) { struct targ *targ = (struct targ *) data; - struct pollfd fds[1]; - struct netmap_if *nifp = targ->nifp; + struct pollfd pfd = { .fd = targ->fd, .events = POLLIN }; + struct netmap_if *nifp = targ->nmd->nifp; struct netmap_ring *rxring; int i; uint64_t received = 0; @@ -1072,27 +1160,18 @@ receiver_body(void *data) if (setaffinity(targ->thread, targ->affinity)) goto quit; - /* setup poll(2) mechanism. */ - memset(fds, 0, sizeof(fds)); - fds[0].fd = targ->fd; - fds[0].events = (POLLIN); - /* unbounded wait for the first packet. */ for (;;) { - i = poll(fds, 1, 1000); - if (i > 0 && !(fds[0].revents & POLLERR)) + i = poll(&pfd, 1, 1000); + if (i > 0 && !(pfd.revents & POLLERR)) break; - D("waiting for initial packets, poll returns %d %d", i, fds[0].revents); + RD(1, "waiting for initial packets, poll returns %d %d", + i, pfd.revents); } /* main loop, exit after 1s silence */ clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic); - if (targ->g->dev_type == DEV_PCAP) { - while (!targ->cancel) { - /* XXX should we poll ? */ - pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap, NULL); - } - } else if (targ->g->dev_type == DEV_TAP) { + if (targ->g->dev_type == DEV_TAP) { D("reading from %s fd %d", targ->g->ifname, targ->g->main_fd); while (!targ->cancel) { char buf[2048]; @@ -1100,34 +1179,46 @@ receiver_body(void *data) if (read(targ->g->main_fd, buf, sizeof(buf)) > 0) targ->count++; } +#ifndef NO_PCAP + } else if (targ->g->dev_type == DEV_PCAP) { + while (!targ->cancel) { + /* XXX should we poll ? */ + pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap, NULL); + } +#endif /* !NO_PCAP */ } else { int dump = targ->g->options & OPT_DUMP; while (!targ->cancel) { /* Once we started to receive packets, wait at most 1 seconds before quitting. */ - if (poll(fds, 1, 1 * 1000) <= 0 && !targ->g->forever) { + if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) { clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); targ->toc.tv_sec -= 1; /* Subtract timeout time. 
*/ - break; + goto out; } - for (i = targ->qfirst; i < targ->qlast; i++) { + if (pfd.revents & POLLERR) { + D("poll err"); + goto quit; + } + + for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) { int m; rxring = NETMAP_RXRING(nifp, i); - if (rxring->avail == 0) + if (nm_ring_empty(rxring)) continue; m = receive_packets(rxring, targ->g->burst, dump); received += m; } targ->count = received; - - // tell the card we have read the data - //ioctl(fds[0].fd, NIOCRXSYNC, NULL); } } + clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc); + +out: targ->completed = 1; targ->count = received; @@ -1144,10 +1235,10 @@ quit: static const char * norm(char *buf, double val) { - char *units[] = { "", "K", "M", "G" }; + char *units[] = { "", "K", "M", "G", "T" }; u_int i; - for (i = 0; val >=1000 && i < sizeof(units)/sizeof(char *); i++) + for (i = 0; val >=1000 && i < sizeof(units)/sizeof(char *) - 1; i++) val /= 1000; sprintf(buf, "%.2f %s", val, units[i]); return buf; @@ -1159,8 +1250,8 @@ tx_output(uint64_t sent, int size, double delta) double bw, raw_bw, pps; char b1[40], b2[80], b3[80]; - printf("Sent %" PRIu64 " packets, %d bytes each, in %.2f seconds.\n", - sent, size, delta); + printf("Sent %llu packets, %d bytes each, in %.2f seconds.\n", + (unsigned long long)sent, size, delta); if (delta == 0) delta = 1e-6; if (size < 60) /* correct for min packet size */ @@ -1181,7 +1272,8 @@ rx_output(uint64_t received, double delta) double pps; char b1[40]; - printf("Received %" PRIu64 " packets, in %.2f seconds.\n", received, delta); + printf("Received %llu packets, in %.2f seconds.\n", + (unsigned long long) received, delta); if (delta == 0) delta = 1e-6; @@ -1215,6 +1307,7 @@ usage(void) "\t-w wait_for_link_time in seconds\n" "\t-R rate in packets per second\n" "\t-X dump payload\n" + "\t-H len add empty virtio-net-header with size 'len'\n" "", cmd); @@ -1232,68 +1325,57 @@ start_threads(struct glob_arg *g) * using a single descriptor. */ for (i = 0; i < g->nthreads; i++) { - bzero(&targs[i], sizeof(targs[i])); - targs[i].fd = -1; /* default, with pcap */ - targs[i].g = g; + struct targ *t = &targs[i]; + + bzero(t, sizeof(*t)); + t->fd = -1; /* default, with pcap */ + t->g = g; if (g->dev_type == DEV_NETMAP) { - struct nmreq tifreq; - int tfd; + struct nm_desc nmd = *g->nmd; /* copy, we overwrite ringid */ - /* register interface. */ - tfd = open("/dev/netmap", O_RDWR); - if (tfd == -1) { - D("Unable to open /dev/netmap"); - continue; + if (g->nthreads > 1) { + if (nmd.req.nr_flags != NR_REG_ALL_NIC) { + D("invalid nthreads mode %d", nmd.req.nr_flags); + continue; + } + nmd.req.nr_flags = NR_REG_ONE_NIC; + nmd.req.nr_ringid = i; } - targs[i].fd = tfd; - - bzero(&tifreq, sizeof(tifreq)); - strncpy(tifreq.nr_name, g->ifname, sizeof(tifreq.nr_name)); - tifreq.nr_version = NETMAP_API; - tifreq.nr_ringid = (g->nthreads > 1) ? (i | NETMAP_HW_RING) : 0; - parse_nmr_config(g->nmr_config, &tifreq); + /* Only touch one of the rings (rx is already ok) */ + if (g->td_body == receiver_body) + nmd.req.nr_ringid |= NETMAP_NO_TX_POLL; - /* - * if we are acting as a receiver only, do not touch the transmit ring. - * This is not the default because many apps may use the interface - * in both directions, but a pure receiver does not. - */ - if (g->td_body == receiver_body) { - tifreq.nr_ringid |= NETMAP_NO_TX_POLL; - } + /* register interface. Override ifname and ringid etc. 
*/ - if ((ioctl(tfd, NIOCREGIF, &tifreq)) == -1) { - D("Unable to register %s", g->ifname); + t->nmd = nm_open(t->g->ifname, NULL, g->nmd_flags | + NM_OPEN_IFNAME | NM_OPEN_NO_MMAP, g->nmd); + if (t->nmd == NULL) { + D("Unable to open %s: %s", + t->g->ifname, strerror(errno)); continue; } - D("memsize is %d MB", tifreq.nr_memsize >> 20); - targs[i].nmr = tifreq; - targs[i].nifp = NETMAP_IF(g->mmap_addr, tifreq.nr_offset); - D("nifp flags 0x%x", targs[i].nifp->ni_flags); - /* start threads. */ - targs[i].qfirst = (g->nthreads > 1) ? i : 0; - targs[i].qlast = (g->nthreads > 1) ? i+1 : - (g->td_body == receiver_body ? tifreq.nr_rx_rings : tifreq.nr_tx_rings); + t->fd = t->nmd->fd; + } else { targs[i].fd = g->main_fd; } - targs[i].used = 1; - targs[i].me = i; + t->used = 1; + t->me = i; if (g->affinity >= 0) { if (g->affinity < g->cpus) - targs[i].affinity = g->affinity; + t->affinity = g->affinity; else - targs[i].affinity = i % g->cpus; - } else - targs[i].affinity = -1; + t->affinity = i % g->cpus; + } else { + t->affinity = -1; + } /* default, init packets */ - initialize_packet(&targs[i]); + initialize_packet(t); - if (pthread_create(&targs[i].thread, NULL, g->td_body, - &targs[i]) == -1) { - D("Unable to create thread %d", i); - targs[i].used = 0; + if (pthread_create(&t->thread, NULL, g->td_body, t) == -1) { + D("Unable to create thread %d: %s", i, strerror(errno)); + t->used = 0; } } } @@ -1318,7 +1400,6 @@ main_thread(struct glob_arg *g) delta.tv_usec = (g->report_interval%1000)*1000; select(0, NULL, NULL, NULL, &delta); gettimeofday(&now, NULL); - time_second = now.tv_sec; timersub(&now, &toc, &toc); my_count = 0; for (i = 0; i < g->nthreads; i++) { @@ -1331,8 +1412,10 @@ main_thread(struct glob_arg *g) continue; npkts = my_count - prev; pps = (npkts*1000000 + usec/2) / usec; - D("%" PRIu64 " pps (%" PRIu64 " pkts in %" PRIu64 " usec)", - pps, npkts, usec); + D("%llu pps (%llu pkts in %llu usec)", + (unsigned long long)pps, + (unsigned long long)npkts, + (unsigned long long)usec); prev = my_count; toc = now; if (done == g->nthreads) @@ -1376,7 +1459,7 @@ main_thread(struct glob_arg *g) rx_output(count, delta_t); if (g->dev_type == DEV_NETMAP) { - munmap(g->mmap_addr, g->mmap_size); + munmap(g->nmd->mem, g->nmd->req.nr_memsize); close(g->main_fd); } } @@ -1439,7 +1522,7 @@ tap_alloc(char *dev) /* try to create the device */ if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) { - D("failed to to a TUNSETIFF"); + D("failed to to a TUNSETIFF: %s", strerror(errno)); close(fd); return err; } @@ -1464,7 +1547,6 @@ main(int arc, char **argv) struct glob_arg g; - struct nmreq nmr; int ch; int wait_link = 2; int devqueues = 1; /* how many device queues */ @@ -1488,9 +1570,10 @@ main(int arc, char **argv) g.tx_rate = 0; g.frags = 1; g.nmr_config = ""; + g.virt_header = 0; while ( (ch = getopt(arc, argv, - "a:f:F:n:i:It:r:l:d:s:D:S:b:c:o:p:PT:w:WvR:XC:")) != -1) { + "a:f:F:n:i:Il:d:s:D:S:b:c:o:p:T:w:WvR:XC:H:e:")) != -1) { struct sf *fn; switch(ch) { @@ -1532,31 +1615,39 @@ main(int arc, char **argv) break; case 'i': /* interface */ - g.ifname = optarg; - if (!strncmp(optarg, "tap", 3)) - g.dev_type = DEV_TAP; - else + /* a prefix of tap: netmap: or pcap: forces the mode. 
+ * otherwise we guess + */ + D("interface is %s", optarg); + if (strlen(optarg) > MAX_IFNAMELEN - 8) { + D("ifname too long %s", optarg); + break; + } + strcpy(g.ifname, optarg); + if (!strcmp(optarg, "null")) { g.dev_type = DEV_NETMAP; - if (!strcmp(g.ifname, "null")) g.dummy_send = 1; + } else if (!strncmp(optarg, "tap:", 4)) { + g.dev_type = DEV_TAP; + strcpy(g.ifname, optarg + 4); + } else if (!strncmp(optarg, "pcap:", 5)) { + g.dev_type = DEV_PCAP; + strcpy(g.ifname, optarg + 5); + } else if (!strncmp(optarg, "netmap:", 7) || + !strncmp(optarg, "vale", 4)) { + g.dev_type = DEV_NETMAP; + } else if (!strncmp(optarg, "tap", 3)) { + g.dev_type = DEV_TAP; + } else { /* prepend netmap: */ + g.dev_type = DEV_NETMAP; + sprintf(g.ifname, "netmap:%s", optarg); + } break; case 'I': g.options |= OPT_INDIRECT; /* XXX use indirect buffer */ break; - case 't': /* send, deprecated */ - D("-t deprecated, please use -f tx -n %s", optarg); - g.td_body = sender_body; - g.npackets = atoi(optarg); - break; - - case 'r': /* receive */ - D("-r deprecated, please use -f rx -n %s", optarg); - g.td_body = receiver_body; - g.npackets = atoi(optarg); - break; - case 'l': /* pkt_size */ g.pkt_size = atoi(optarg); break; @@ -1591,10 +1682,6 @@ main(int arc, char **argv) g.nthreads = atoi(optarg); break; - case 'P': - g.dev_type = DEV_PCAP; - break; - case 'D': /* destination mac */ g.dst_mac.name = optarg; break; @@ -1613,6 +1700,13 @@ main(int arc, char **argv) break; case 'C': g.nmr_config = strdup(optarg); + break; + case 'H': + g.virt_header = atoi(optarg); + break; + case 'e': /* extra bufs */ + g.extra_bufs = atoi(optarg); + break; } } @@ -1649,6 +1743,18 @@ main(int arc, char **argv) extract_mac_range(&g.src_mac); extract_mac_range(&g.dst_mac); + if (g.src_ip.start != g.src_ip.end || + g.src_ip.port0 != g.src_ip.port1 || + g.dst_ip.start != g.dst_ip.end || + g.dst_ip.port0 != g.dst_ip.port1) + g.options |= OPT_COPY; + + if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1 + && g.virt_header != VIRT_HDR_2) { + D("bad virtio-net-header length"); + usage(); + } + if (g.dev_type == DEV_TAP) { D("want to use tap %s", g.ifname); g.main_fd = tap_alloc(g.ifname); @@ -1656,7 +1762,8 @@ main(int arc, char **argv) D("cannot open tap %s", g.ifname); usage(); } - } else if (g.dev_type > DEV_NETMAP) { +#ifndef NO_PCAP + } else if (g.dev_type == DEV_PCAP) { char pcap_errbuf[PCAP_ERRBUF_SIZE]; D("using pcap on %s", g.ifname); @@ -1666,54 +1773,37 @@ main(int arc, char **argv) D("cannot open pcap on %s", g.ifname); usage(); } - } else if (g.dummy_send) { +#endif /* !NO_PCAP */ + } else if (g.dummy_send) { /* but DEV_NETMAP */ D("using a dummy send routine"); } else { - bzero(&nmr, sizeof(nmr)); - nmr.nr_version = NETMAP_API; + struct nm_desc base_nmd; + + bzero(&base_nmd, sizeof(base_nmd)); + + g.nmd_flags = 0; + g.nmd_flags |= parse_nmr_config(g.nmr_config, &base_nmd.req); + if (g.extra_bufs) { + base_nmd.req.nr_arg3 = g.extra_bufs; + g.nmd_flags |= NM_OPEN_ARG3; + } + /* - * Open the netmap device to fetch the number of queues of our - * interface. + * Open the netmap device using nm_open(). * - * The first NIOCREGIF also detaches the card from the * protocol stack and may cause a reset of the card, * which in turn may take some time for the PHY to - * reconfigure. + * reconfigure. We do the open here to have time to reset. 
*/ - g.main_fd = open("/dev/netmap", O_RDWR); - if (g.main_fd == -1) { - D("Unable to open /dev/netmap"); - // fail later + g.nmd = nm_open(g.ifname, NULL, g.nmd_flags, &base_nmd); + if (g.nmd == NULL) { + D("Unable to open %s: %s", g.ifname, strerror(errno)); + goto out; } - /* - * Register the interface on the netmap device: from now on, - * we can operate on the network interface without any - * interference from the legacy network stack. - * - * We decide to put the first interface registration here to - * give time to cards that take a long time to reset the PHY. - */ - bzero(&nmr, sizeof(nmr)); - nmr.nr_version = NETMAP_API; - strncpy(nmr.nr_name, g.ifname, sizeof(nmr.nr_name)); - nmr.nr_version = NETMAP_API; - parse_nmr_config(g.nmr_config, &nmr); - if (ioctl(g.main_fd, NIOCREGIF, &nmr) == -1) { - D("Unable to register interface %s", g.ifname); - //continue, fail later - } - ND("%s: txr %d txd %d rxr %d rxd %d", g.ifname, - nmr.nr_tx_rings, nmr.nr_tx_slots, - nmr.nr_rx_rings, nmr.nr_rx_slots); - //if ((ioctl(g.main_fd, NIOCGINFO, &nmr)) == -1) { - // D("Unable to get if info without name"); - //} else { - // D("map size is %d Kb", nmr.nr_memsize >> 10); - //} - if ((ioctl(g.main_fd, NIOCGINFO, &nmr)) == -1) { - D("Unable to get if info for %s", g.ifname); - } - devqueues = nmr.nr_rx_rings; + g.main_fd = g.nmd->fd; + D("mapped %dKB at %p", g.nmd->req.nr_memsize>>10, g.nmd->mem); + + devqueues = g.nmd->req.nr_rx_rings; /* validate provided nthreads. */ if (g.nthreads < 1 || g.nthreads > devqueues) { @@ -1721,22 +1811,22 @@ main(int arc, char **argv) // continue, fail later } - /* - * Map the netmap shared memory: instead of issuing mmap() - * inside the body of the threads, we prefer to keep this - * operation here to simplify the thread logic. - */ - D("mapping %d Kbytes", nmr.nr_memsize>>10); - g.mmap_size = nmr.nr_memsize; - g.mmap_addr = (struct netmap_d *) mmap(0, nmr.nr_memsize, - PROT_WRITE | PROT_READ, - MAP_SHARED, g.main_fd, 0); - if (g.mmap_addr == MAP_FAILED) { - D("Unable to mmap %d KB", nmr.nr_memsize >> 10); - // continue, fail later - } - + if (verbose) { + struct netmap_if *nifp = g.nmd->nifp; + struct nmreq *req = &g.nmd->req; + D("nifp at offset %d, %d tx %d rx region %d", + req->nr_offset, req->nr_tx_rings, req->nr_rx_rings, + req->nr_arg2); + for (i = 0; i <= req->nr_tx_rings; i++) { + D(" TX%d at 0x%lx", i, + (char *)NETMAP_TXRING(nifp, i) - (char *)nifp); + } + for (i = 0; i <= req->nr_rx_rings; i++) { + D(" RX%d at 0x%lx", i, + (char *)NETMAP_RXRING(nifp, i) - (char *)nifp); + } + } /* Print some debug information. */ fprintf(stdout, @@ -1751,7 +1841,8 @@ main(int arc, char **argv) g.src_ip.name, g.dst_ip.name, g.src_mac.name, g.dst_mac.name); } - + +out: /* Exit if something went wrong. */ if (g.main_fd < 0) { D("aborting"); @@ -1759,7 +1850,7 @@ main(int arc, char **argv) } } - + if (g.options) { D("--- SPECIAL OPTIONS:%s%s%s%s%s\n", g.options & OPT_PREFETCH ? 
" prefetch" : "", @@ -1772,14 +1863,17 @@ main(int arc, char **argv) g.tx_period.tv_sec = g.tx_period.tv_nsec = 0; if (g.tx_rate > 0) { /* try to have at least something every second, - * reducing the burst size to 0.5s worth of data + * reducing the burst size to some 0.01s worth of data * (but no less than one full set of fragments) */ - if (g.burst > g.tx_rate/2) - g.burst = g.tx_rate/2; + uint64_t x; + int lim = (g.tx_rate)/300; + if (g.burst > lim) + g.burst = lim; if (g.burst < g.frags) g.burst = g.frags; - g.tx_period.tv_nsec = (1e9 / g.tx_rate) * g.burst; + x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate; + g.tx_period.tv_nsec = x; g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000; g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000; } @@ -1795,16 +1889,6 @@ main(int arc, char **argv) global_nthreads = g.nthreads; signal(SIGINT, sigint_h); -#if 0 // XXX this is not needed, i believe - if (g.dev_type > DEV_NETMAP) { - g.p = pcap_open_live(g.ifname, 0, 1, 100, NULL); - if (g.p == NULL) { - D("cannot open pcap on %s", g.ifname); - usage(); - } else - D("using pcap %p on %s", g.p, g.ifname); - } -#endif // XXX start_threads(&g); main_thread(&g); return 0; diff --git a/tools/tools/netmap/vale-ctl.c b/tools/tools/netmap/vale-ctl.c index 0a478ba08..e1d8da568 100644 --- a/tools/tools/netmap/vale-ctl.c +++ b/tools/tools/netmap/vale-ctl.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013 Michio Honda. All rights reserved. + * Copyright (C) 2013-2014 Michio Honda. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -33,6 +33,7 @@ #include /* close */ #include /* ioctl */ #include +#include /* apple needs sockaddr */ #include /* ifreq */ #include #include @@ -69,20 +70,22 @@ bdg_ctl(const char *name, int nr_cmd, int nr_arg) nr_arg = 0; nmr.nr_arg1 = nr_arg; error = ioctl(fd, NIOCREGIF, &nmr); - if (error == -1) - D("Unable to %s %s to the bridge", nr_cmd == + if (error == -1) { + ND("Unable to %s %s to the bridge", nr_cmd == NETMAP_BDG_DETACH?"detach":"attach", name); - else - D("Success to %s %s to the bridge\n", nr_cmd == + perror(name); + } else + ND("Success to %s %s to the bridge", nr_cmd == NETMAP_BDG_DETACH?"detach":"attach", name); break; case NETMAP_BDG_LIST: if (strlen(nmr.nr_name)) { /* name to bridge/port info */ error = ioctl(fd, NIOCGINFO, &nmr); - if (error) - D("Unable to obtain info for %s", name); - else + if (error) { + ND("Unable to obtain info for %s", name); + perror(name); + } else D("%s at bridge:%d port:%d", name, nmr.nr_arg1, nmr.nr_arg2); break; @@ -101,9 +104,10 @@ bdg_ctl(const char *name, int nr_cmd, int nr_arg) default: /* GINFO */ nmr.nr_cmd = nmr.nr_arg1 = nmr.nr_arg2 = 0; error = ioctl(fd, NIOCGINFO, &nmr); - if (error) - D("Unable to get if info for %s", name); - else + if (error) { + ND("Unable to get if info for %s", name); + perror(name); + } else D("%s: %d queues.", name, nmr.nr_rx_rings); break; } @@ -118,7 +122,7 @@ main(int argc, char *argv[]) const char *command = basename(argv[0]); char *name = NULL; - if (argc != 3 && argc != 1 /* list all */ ) { + if (argc > 3) { usage: fprintf(stderr, "Usage:\n" @@ -127,12 +131,13 @@ usage: "\t-d interface interface name to be detached\n" "\t-a interface interface name to be attached\n" "\t-h interface interface name to be attached with the host stack\n" - "\t-l list all or specified bridge's interfaces\n" + "\t-l list all or specified bridge's interfaces (default)\n" "", command); 
return 0; } - while ((ch = getopt(argc, argv, "d:a:h:g:l:")) != -1) { + while ((ch = getopt(argc, argv, "d:a:h:g:l")) != -1) { + name = optarg; /* default */ switch (ch) { default: fprintf(stderr, "bad option %c %s", ch, optarg); @@ -152,12 +157,16 @@ usage: break; case 'l': nr_cmd = NETMAP_BDG_LIST; + if (optind < argc && argv[optind][0] == '-') + name = NULL; break; } - name = optarg; + if (optind != argc) { + // fprintf(stderr, "optind %d argc %d\n", optind, argc); + goto usage; + } } if (argc == 1) nr_cmd = NETMAP_BDG_LIST; - bdg_ctl(name, nr_cmd, nr_arg); - return 0; + return bdg_ctl(name, nr_cmd, nr_arg) ? 1 : 0; } -- 2.45.0
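
For readers who want to try the nm_open() interface that pkt-gen is migrated to
above, the following is a minimal, self-contained receive loop written against
net/netmap_user.h as shipped by this sync. It is only a sketch: "netmap:em0" is
a placeholder port name, error handling is reduced to the bare minimum, and the
loop stops after a fixed packet count instead of running forever.

/*
 * Sketch: open a port with nm_open(), poll its file descriptor and
 * drain all receive rings, in the same style as receiver_body() above.
 * "netmap:em0" is a placeholder; any NIC, VALE or pipe port name works.
 */
#include <sys/types.h>
#include <stdio.h>
#include <poll.h>

#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>

int
main(void)
{
	struct nm_desc *d;
	struct pollfd pfd;
	u_int i, got = 0;

	d = nm_open("netmap:em0", NULL, 0, NULL);
	if (d == NULL) {
		fprintf(stderr, "nm_open failed\n");
		return 1;
	}
	pfd.fd = d->fd;		/* one descriptor covers all bound rings */
	pfd.events = POLLIN;

	while (got < 1000) {
		if (poll(&pfd, 1, 1000) <= 0)
			continue;	/* timeout or signal */
		for (i = d->first_rx_ring; i <= d->last_rx_ring; i++) {
			struct netmap_ring *ring = NETMAP_RXRING(d->nifp, i);

			while (!nm_ring_empty(ring)) {
				u_int cur = ring->cur;
				struct netmap_slot *slot = &ring->slot[cur];
				char *buf = NETMAP_BUF(ring, slot->buf_idx);

				/* consume the packet; here we only count it */
				(void)buf;
				got++;
				ring->head = ring->cur = nm_ring_next(ring, cur);
			}
		}
	}
	fprintf(stderr, "received %u packets\n", got);
	nm_close(d);
	return 0;
}

pkt-gen itself additionally passes NM_OPEN_IFNAME | NM_OPEN_NO_MMAP together
with a parent descriptor so that the per-thread descriptors reuse the first
mapping; the sketch above simply lets nm_open() map the region on its own.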