2 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3 * Copyright (C) 2013-2015 Universita` di Pisa. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * $Id: pkt-gen.c 12346 2013-06-12 17:36:25Z luigi $
31 * Example program to show how to build a multithreaded packet
32 * source/sink using the netmap device.
34 * In this example we create a programmable number of threads
35 * to take care of all the queues of the interface used to
36 * send or receive traffic.
40 #define _GNU_SOURCE /* for CPU_SET() */
41 #include <arpa/inet.h> /* ntohs */
43 #include <ctype.h> // isprint()
46 #include <ifaddrs.h> /* getifaddrs */
47 #include <libnetmap.h>
49 #include <net/ethernet.h>
50 #include <netinet/in.h>
51 #include <netinet/ip.h>
52 #include <netinet/ip6.h>
53 #include <netinet/udp.h>
55 #include <pcap/pcap.h>
62 #include <sys/ioctl.h>
65 #if !defined(_WIN32) && !defined(linux)
66 #include <sys/sysctl.h> /* sysctl */
68 #include <sys/types.h>
69 #include <unistd.h> // sysconf()
71 #define IPV6_VERSION 0x60
72 #define IPV6_DEFHLIM 64
77 static void usage(int);
/*
 * Windows compatibility shims: fake cpuset type and CPU affinity macros,
 * plus local ether_aton()/ether_ntoa() replacements.
 * NOTE(review): this extract is missing intermediate source lines
 * (function bodies are truncated); code kept verbatim.
 */
80 #define cpuset_t        DWORD_PTR   //uint64_t
81 static inline void CPU_ZERO(cpuset_t *p)
86 static inline void CPU_SET(uint32_t i, cpuset_t *p)
91 #define pthread_setaffinity_np(a, b, c) !SetThreadAffinityMask(a, *c)    //((void)a, 0)
92 #define TAP_CLONEDEV	"/dev/tap"
93 #define AF_LINK	18	//defined in winsocks.h
94 #define CLOCK_REALTIME_PRECISE CLOCK_REALTIME
95 #include <net/if_dl.h>
/*
 * Parse "xx:xx:xx:xx:xx:xx" into a static struct ether_addr.
 * Returns a pointer to static storage (not thread-safe).
 */
98 * Convert an ASCII representation of an ethernet address to
102 ether_aton(const char *a)
105 	static struct ether_addr o;
106 	unsigned int o0, o1, o2, o3, o4, o5;
108 	i = sscanf(a, "%x:%x:%x:%x:%x:%x", &o0, &o1, &o2, &o3, &o4, &o5);
120 	return ((struct ether_addr *)&o);
/*
 * Format a MAC address as "xx:xx:xx:xx:xx:xx"; NULL if sprintf produced
 * fewer than the expected 17 characters.
 */
124 * Convert a binary representation of an ethernet address to
128 ether_ntoa(const struct ether_addr *n)
133 	i = sprintf(a, "%02x:%02x:%02x:%02x:%02x:%02x",
134 	    n->octet[0], n->octet[1], n->octet[2],
135 	    n->octet[3], n->octet[4], n->octet[5]);
136 	return (i < 17 ? NULL : (char *)&a);
/*
 * Per-OS compatibility: Linux uses cpu_set_t and has no IFF_PPROMISC;
 * FreeBSD provides pthread affinity natively; Apple gets stub shims
 * (affinity and clock_gettime are no-ops there).
 * NOTE(review): the surrounding #ifdef structure is partially elided
 * in this extract; code kept verbatim.
 */
142 #define cpuset_t        cpu_set_t
144 #define ifr_flagshigh  ifr_flags        /* only the low 16 bits here */
145 #define IFF_PPROMISC   IFF_PROMISC      /* IFF_PPROMISC does not exist */
146 #include <linux/ethtool.h>
147 #include <linux/sockios.h>
149 #define CLOCK_REALTIME_PRECISE CLOCK_REALTIME
150 #include <netinet/ether.h>      /* ether_aton */
151 #include <linux/if_packet.h>    /* sockaddr_ll */
155 #include <sys/endian.h> /* le64toh */
156 #include <machine/param.h>
158 #include <pthread_np.h> /* pthread w/ affinity */
159 #include <sys/cpuset.h> /* cpu_set */
160 #include <net/if_dl.h>  /* LLADDR */
161 #endif  /* __FreeBSD__ */
/* Apple: fake 64-bit cpuset; affinity and precise clock are stubbed out. */
165 #define cpuset_t        uint64_t        // XXX
166 static inline void CPU_ZERO(cpuset_t *p)
171 static inline void CPU_SET(uint32_t i, cpuset_t *p)
173 	*p |= 1<< (i & 0x3f);
176 #define pthread_setaffinity_np(a, b, c) ((void)a, 0)
178 #define ifr_flagshigh  ifr_flags        // XXX
179 #define IFF_PPROMISC   IFF_PROMISC
180 #include <net/if_dl.h>  /* LLADDR */
/* NOTE(review): this stub zeroes *b instead of reading the clock — Apple only. */
181 #define clock_gettime(a,b)      \
182 	do {struct timespec t0 = {0,0}; *(b) = t0; } while (0)
183 #endif  /* __APPLE__ */
/* Template payloads copied into generated packets (direct vs NS_INDIRECT). */
185 static const char *default_payload = "netmap pkt-gen DIRECT payload\n"
186 	"http://info.iet.unipi.it/~luigi/netmap/ ";
188 static const char *indirect_payload = "netmap pkt-gen indirect payload\n"
189 	"http://info.iet.unipi.it/~luigi/netmap/ ";
191 static int verbose = 0;
192 static int normalize = 1;
/* Supported virtio-net header lengths; VIRT_HDR_MAX sizes the template. */
194 #define VIRT_HDR_1	10	/* length of a base vnet-hdr */
195 #define VIRT_HDR_2	12	/* length of the extenede vnet-hdr */
196 #define VIRT_HDR_MAX	VIRT_HDR_2
198 	uint8_t fields[VIRT_HDR_MAX];
200 
201 #define MAX_BODYSIZE	65536
/*
 * Template packet: optional vnet header, ethernet header, then an
 * IPv4 or IPv6 + UDP header and body (union members elided in this
 * extract). Packed so it can be copied byte-for-byte into tx buffers.
 */
204 	struct virt_header vh;
205 	struct ether_header eh;
210 			uint8_t body[MAX_BODYSIZE];	/* hardwired */
215 			uint8_t body[MAX_BODYSIZE];	/* hardwired */
218 } __attribute__((__packed__));
/* Pick the IPv4 or IPv6 variant of field f from a struct pkt. */
220 #define	PKT(p, f, af)	\
221     ((af) == AF_INET ? (p)->ipv4.f: (p)->ipv6.f)
/* Address ranges used to sweep src/dst addresses and ports. */
227 	uint32_t start, end; /* same as struct in_addr */
230 	struct in6_addr start, end;
231 	uint8_t sgroup, egroup;
234 	uint16_t port0, port1;
239 	struct ether_addr start, end;
242 /* ifname can be netmap:foo-xxxx */
243 #define MAX_IFNAMELEN	512	/* our buffer for ifname */
244 //#define MAX_PKTSIZE	1536
245 #define MAX_PKTSIZE	MAX_BODYSIZE	/* XXX: + IP_HDR + ETH_HDR */
247 /* compact timestamp to fit into 60 byte packet. (enough to obtain RTT) */
/*
 * glob_arg: configuration shared by all worker threads (fields partially
 * elided in this extract).
 */
254  * global arguments for all threads
258 	int af;		/* address family AF_INET/AF_INET6 */
259 	struct ip_range src_ip;
260 	struct ip_range dst_ip;
261 	struct mac_range dst_mac;
262 	struct mac_range src_mac;
267 	uint64_t npackets;	/* total packets to send */
268 	int frags;		/* fragments per packet */
269 	u_int frag_size;	/* size of each fragment */
271 	int cpus;	/* cpus used for running */
272 	int system_cpus;	/* cpus on the system */
274 	int options;	/* testing */
/* Option bit flags stored in glob_arg.options. */
275 #define OPT_PREFETCH	1
279 #define OPT_TS		16	/* add a timestamp */
280 #define OPT_INDIRECT	32	/* use indirect buffers, tx only */
281 #define OPT_DUMP	64	/* dump rx/tx traffic */
282 #define OPT_RUBBISH	256	/* send whatever the buffers contain */
283 #define OPT_RANDOM_SRC  512
284 #define OPT_RANDOM_DST  1024
285 #define OPT_PPS_STATS   2048
286 #define OPT_UPDATE_CSUM 4096
293 	struct timespec tx_period;
297 	struct nmport_d *nmd;
299 	int report_interval;		/* milliseconds between prints */
300 	void *(*td_body)(void *);
303 	char ifname[MAX_IFNAMELEN];
304 	const char *nmr_config;
306 	int virt_header;	/* send also the virt_header */
307 	char *packet_file;	/* -P option */
310 	int64_t win[STATS_WIN];
312 	int framing;	/* #bits of framing (for bw output) */
/* Backend device types selectable at run time. */
314 enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP };
/* Per-thread state; shared between packet source and sink threads. */
323  * Arguments for a new thread. The same structure is used by
324  * the source and the sink
332 	struct nmport_d *nmd;
333 	/* these ought to be volatile, but they are
334 	 * only sampled and errors should not accumulate
338 	struct timespec tic, toc;
/*
 * One's-complement add with end-around carry, the building block for
 * incremental Internet checksum updates (RFC 1071/1624 style).
 * NOTE(review): the body computing `res` is elided in this extract.
 */
350 static __inline uint16_t
351 cksum_add(uint16_t sum, uint16_t a)
356 	return (res + (res < a));
/*
 * Parse "a.b.c.d[:port]" into host-order *addr and *port.
 * NOTE(review): strtol failure is not checked here; garbage ports parse as 0.
 */
360 extract_ipv4_addr(char *name, uint32_t *addr, uint16_t *port)
365 	pp = strchr(name, ':');
366 	if (pp != NULL) {	/* do we have ports ? */
368 		*port = (uint16_t)strtol(pp, NULL, 0);
371 	inet_pton(AF_INET, name, &a);
372 	*addr = ntohl(a.s_addr);
/*
 * Parse an IPv6 endpoint with an optional group prefix and optional
 * bracketed port, in the forms documented below.
 */
376 extract_ipv6_addr(char *name, struct in6_addr *addr, uint16_t *port,
382 	 * We accept IPv6 address in the following form:
383 	 *  group@[2001:DB8::1001]:port	(w/ brackets and port)
384 	 *  group@[2001:DB8::1]		(w/ brackets and w/o port)
385 	 *  group@2001:DB8::1234		(w/o brackets and w/o port)
387 	pp = strchr(name, '@');
390 		*group = (uint8_t)strtol(name, NULL, 0);
397 		pp = strchr(name, ']');
400 		if (pp != NULL && *pp != ':')
402 	if (pp != NULL) {	/* do we have ports ? */
404 		*port = (uint16_t)strtol(pp, NULL, 0);
406 	inet_pton(AF_INET6, name, addr);
/*
 * Parse r->name ("addr_lo[-addr_hi][:port_lo[-port_hi]]") into the
 * start/end addresses and ports of *r, for either AF_INET or AF_INET6.
 * Defaults: port 1234, IPv6 group 7. Swapped extremes are normalized
 * (end forced to start).
 */
409  * extract the extremes from a range of ipv4 addresses.
410  * addr_lo[-addr_hi][:port_lo[-port_hi]]
413 extract_ip_range(struct ip_range *r, int af)
415 	char *name, *ap, start[INET6_ADDRSTRLEN];
416 	char end[INET6_ADDRSTRLEN];
421 		D("extract IP range from %s", r->name);
/* NOTE(review): strdup result is parsed in place; free() not visible here. */
423 	name = strdup(r->name);
428 	/* the first - splits start/end of range */
429 	ap = strchr(name, '-');
432 	r->port0 = 1234;	/* default port */
433 	if (af == AF_INET6) {
434 		r->ipv6.sgroup = 7; /* default group */
435 		extract_ipv6_addr(name, &r->ipv6.start, &r->port0,
438 		extract_ipv4_addr(name, &r->ipv4.start, &r->port0);
441 		if (af == AF_INET6) {
443 			r->ipv6.egroup = r->ipv6.sgroup;
444 			extract_ipv6_addr(ap, &r->ipv6.end, &r->port1,
447 			r->ipv6.end = r->ipv6.start;
448 			r->ipv6.egroup = r->ipv6.sgroup;
452 			extract_ipv4_addr(ap, &r->ipv4.end, &r->port1);
453 			if (r->ipv4.start > r->ipv4.end) {
455 				r->ipv4.end = r->ipv4.start;
459 			r->ipv4.end = r->ipv4.start;
462 	if (r->port0 > r->port1) {
/* Pretty-print the parsed range for diagnostics. */
468 		a.s_addr = htonl(r->ipv4.start);
469 		inet_ntop(af, &a, start, sizeof(start));
470 		a.s_addr = htonl(r->ipv4.end);
471 		inet_ntop(af, &a, end, sizeof(end));
473 		inet_ntop(af, &r->ipv6.start, start, sizeof(start));
474 		inet_ntop(af, &r->ipv6.end, end, sizeof(end));
477 		D("range is %s:%d to %s:%d", start, r->port0, end, r->port1);
479 		D("range is %d@[%s]:%d to %d@[%s]:%d", r->ipv6.sgroup,
480 		    start, r->port0, r->ipv6.egroup, end, r->port1);
/* A non-degenerate range presumably enables per-packet address updates. */
483 	if (r->port0 != r->port1 ||
484 	    (af == AF_INET && r->ipv4.start != r->ipv4.end) ||
486 	     !IN6_ARE_ADDR_EQUAL(&r->ipv6.start, &r->ipv6.end)))
/*
 * Parse r->name as a MAC address; start and end both get the parsed
 * value (single-address "range").
 */
492 extract_mac_range(struct mac_range *r)
494 	struct ether_addr *e;
496 		D("extract MAC range from %s", r->name);
498 	e = ether_aton(r->name);
500 		D("invalid MAC address '%s'", r->name);
503 	bcopy(e, &r->start, 6);
504 	bcopy(e, &r->end, 6);
/*
 * NOTE(review): the lines below reference targ/eh/p which are not in
 * scope of this function — presumably a disabled (#if 0) remnant in the
 * original source; verify against upstream pkt-gen.c.
 */
506 	bcopy(targ->src_mac, eh->ether_shost, 6);
507 	p = index(targ->g->src_mac, '-');
509 		targ->src_mac_range = atoi(p+1);
511 	bcopy(ether_aton(targ->g->dst_mac), targ->dst_mac, 6);
512 	bcopy(targ->dst_mac, eh->ether_dhost, 6);
513 	p = index(targ->g->dst_mac, '-');
515 		targ->dst_mac_range = atoi(p+1);
518 		D("%s starts at %s", r->name, ether_ntoa(&r->start));
/*
 * Return the MTU of the underlying interface via SIOCGIFMTU, but only
 * for plain "netmap:<if>" ports (no '{'/'}' pipe syntax). Pipes and
 * VALE ports fall through to a practical fixed limit.
 */
523 get_if_mtu(const struct glob_arg *g)
527 	const char *ifname = g->nmd->hdr.nr_name;
530 	if (!strncmp(g->ifname, "netmap:", 7) && !strchr(ifname, '{')
531 			&& !strchr(ifname, '}')) {
533 		len = strlen(ifname);
535 		if (len > IFNAMSIZ) {
536 			D("'%s' too long, cannot ask for MTU", ifname);
540 		s = socket(AF_INET, SOCK_DGRAM, 0);
542 			D("socket() failed: %s", strerror(errno));
546 		memset(&ifreq, 0, sizeof(ifreq));
547 		memcpy(ifreq.ifr_name, ifname, len);
549 		ret = ioctl(s, SIOCGIFMTU, &ifreq);
551 			D("ioctl(SIOCGIFMTU) failed: %s", strerror(errno));
/* NOTE(review): socket close before return is elided in this extract. */
556 		return ifreq.ifr_mtu;
559 	/* This is a pipe or a VALE port, where the MTU is very large,
560 	 * so we use some practical limit. */
/* Global thread table, used by the signal handler to cancel workers. */
564 static struct targ *targs;
565 static int global_nthreads;
/* Ctrl-C: mark every worker thread cancelled (fields elided here). */
567 /* control-C handler */
573 	(void)sig;	/* UNUSED */
574 	D("received control-C on thread %p", (void *)pthread_self());
575 	for (i = 0; i < global_nthreads; i++) {
/* Count active CPUs per platform: sysctl / sysconf / GetSystemInfo. */
580 /* sysctl wrapper to return the number of active CPUs */
585 #if defined (__FreeBSD__)
586 	int mib[2] = { CTL_HW, HW_NCPU };
587 	size_t len = sizeof(mib);
588 	sysctl(mib, 2, &ncpus, &len, NULL, 0);
590 	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
591 #elif defined(_WIN32)
594 	GetSystemInfo(&sysinfo);
595 	ncpus = sysinfo.dwNumberOfProcessors;
/* Per-OS TAP device naming: Linux maps BSD sockaddr_dl onto sockaddr_ll. */
604 #define sockaddr_dl    sockaddr_ll
605 #define sdl_family     sll_family
606 #define AF_LINK        AF_PACKET
607 #define LLADDR(s)      s->sll_addr;
608 #include <linux/if_tun.h>
609 #define TAP_CLONEDEV "/dev/net/tun"
610 #endif /* __linux__ */
613 #include <net/if_tun.h>
614 #define TAP_CLONEDEV "/dev/tap"
615 #endif /* __FreeBSD */
618 // #warning TAP not supported on apple ?
619 #include <net/if_utun.h>
620 #define TAP_CLONEDEV "/dev/tap"
621 #endif /* __APPLE__ */
/*
 * Parse a comma-separated ring/slot configuration string into *nmr:
 * #tx-slots,#rx-slots,#tx-rings,#rx-rings, with the single-number and
 * three-number shorthands described below. Zero/missing fields keep
 * the driver defaults.
 */
625  * parse the vale configuration in conf and put it in nmr.
626  * Return the flag set if necessary.
627  * The configuration may consist of 1 to 4 numbers separated
628  * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings.
629  * Missing numbers or zeroes stand for default values.
630  * As an additional convenience, if exactly one number
631  * is specified, then this is assigned to both #tx-slots and #rx-slots.
632  * If there is no 4th number, then the 3rd is assigned to both #tx-rings
636 parse_nmr_config(const char* conf, struct nmreq_register *nmr)
641 	if (conf == NULL || ! *conf)
643 	nmr->nr_tx_rings = nmr->nr_rx_rings = 0;
644 	nmr->nr_tx_slots = nmr->nr_rx_slots = 0;
/* NOTE(review): strtok mutates w (a copy of conf, allocation elided here). */
646 	for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) {
650 			nmr->nr_tx_slots = nmr->nr_rx_slots = v;
653 			nmr->nr_rx_slots = v;
656 			nmr->nr_tx_rings = nmr->nr_rx_rings = v;
659 			nmr->nr_rx_rings = v;
662 			D("ignored config: %s", tok);
666 		D("txr %d txd %d rxr %d rxd %d",
667 			nmr->nr_tx_rings, nmr->nr_tx_slots,
668 			nmr->nr_rx_rings, nmr->nr_rx_slots);
/*
 * Look up the MAC address of ifname via getifaddrs() and format it into
 * buf as "xx:xx:xx:xx:xx:xx". Returns 0 on success, -1 on error.
 */
675  * locate the src mac address for our interface, put it
676  * into the user-supplied buffer. return 0 if ok, -1 on error.
679 source_hwaddr(const char *ifname, char *buf)
681 	struct ifaddrs *ifaphead, *ifap;
683 	if (getifaddrs(&ifaphead) != 0) {
684 		D("getifaddrs %s failed", ifname);
688 	/* remove 'netmap:' prefix before comparing interfaces */
689 	if (!strncmp(ifname, "netmap:", 7))
/* Scan all addresses; only AF_LINK entries carry the hardware address. */
692 	for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) {
693 		struct sockaddr_dl *sdl =
694 			(struct sockaddr_dl *)ifap->ifa_addr;
697 		if (!sdl || sdl->sdl_family != AF_LINK)
699 		if (strncmp(ifap->ifa_name, ifname, IFNAMSIZ) != 0)
701 		mac = (uint8_t *)LLADDR(sdl);
702 		sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x",
703 			mac[0], mac[1], mac[2],
704 			mac[3], mac[4], mac[5]);
706 			D("source hwaddr %s", buf);
709 	freeifaddrs(ifaphead);
/* Pin thread `me` to CPU i using the per-OS cpuset shims defined above. */
714 /* set the thread affinity. */
716 setaffinity(pthread_t me, int i)
723 	/* Set thread affinity affinity.*/
725 	CPU_SET(i, &cpumask);
727 	if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
728 		D("Unable to set affinity: %s", strerror(errno));
/*
 * RFC 1071-style Internet checksum over len bytes starting at data,
 * folded into the running 32-bit accumulator sum. Byte pairs are
 * summed in host order via ntohs.
 */
735 /* Compute the checksum of the given ip header. */
737 checksum(const void *data, uint16_t len, uint32_t sum)
739 	const uint8_t *addr = data;
742 	/* Checksum all the pairs of bytes first... */
743 	for (i = 0; i < (len & ~1U); i += 2) {
744 		sum += (uint16_t)ntohs(*((const uint16_t *)(addr + i)));
749 	 * If there's a single byte left over, checksum it, too.
750 	 * Network byte order is big-endian, so the remaining byte is
/* Fold a 32-bit accumulator to 16 bits and complement (body elided). */
762 wrapsum(uint32_t sum)
/*
 * Hexdump a packet: one header line with ring/slot info, then 16 bytes
 * per row as hex plus printable-ASCII column.
 */
768 /* Check the payload of the packet for errors (use it for debug).
769  * Look for consecutive ascii representations of the size of the packet.
772 dump_payload(const char *_p, int len, struct netmap_ring *ring, int cur)
776 	const unsigned char *p = (const unsigned char *)_p;
778 	/* get the length in ASCII of the length of the packet. */
780 	printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n",
781 		ring, cur, ring->slot[cur].buf_idx,
782 		ring->slot[cur].flags, len);
783 	/* hexdump routine */
784 	for (i = 0; i < len; ) {
785 		memset(buf, ' ', sizeof(buf));
786 		sprintf(buf, "%5d: ", i);
788 		for (j=0; j < 16 && i < len; i++, j++)
789 			sprintf(buf+7+j*3, "%02x ", (uint8_t)(p[i]));
/* NOTE(review): i is rewound between the hex and ASCII passes (line elided). */
791 		for (j=0; j < 16 && i < len; i++, j++)
792 			sprintf(buf+7+j + 48, "%c",
793 				isprint(p[i]) ? p[i] : '.');
/* Map BSD-style udphdr field names onto the Linux struct udphdr layout. */
799  * Fill a packet with some payload.
800  * We create a UDP packet so the payload starts at
801  * 14+20+8 = 42 bytes.
804 #define uh_sport source
805 #define uh_dport dest
/*
 * Incrementally patch a 16-bit checksum accumulator for an address
 * change oaddr -> naddr (RFC 1624 style: add complement of old halves,
 * then the new halves).
 */
811 new_ip_sum(uint16_t ip_sum, uint32_t oaddr, uint32_t naddr)
813 	ip_sum = cksum_add(ip_sum, ~oaddr >> 16);
814 	ip_sum = cksum_add(ip_sum, ~oaddr & 0xffff);
815 	ip_sum = cksum_add(ip_sum, naddr >> 16);
816 	ip_sum = cksum_add(ip_sum, naddr & 0xffff);
/* Same incremental patch for a single 16-bit port change. */
821 new_udp_sum(uint16_t udp_sum, uint16_t oport, uint16_t nport)
823 	udp_sum = cksum_add(udp_sum, ~oport);
824 	udp_sum = cksum_add(udp_sum, nport);
/*
 * Advance the template packet's IPv4 source/destination address and UDP
 * port across the configured ranges (or randomize them when
 * OPT_RANDOM_SRC/OPT_RANDOM_DST is set), patching the IP and UDP
 * checksums incrementally rather than recomputing them.
 * Headers are copied out, modified, and copied back to avoid unaligned
 * access into the packed struct pkt.
 */
830 update_ip(struct pkt *pkt, struct targ *t)
832 	struct glob_arg *g = t->g;
835 	uint32_t oaddr, naddr;
836 	uint16_t oport, nport;
837 	uint16_t ip_sum = 0, udp_sum = 0;
839 	memcpy(&ip, &pkt->ipv4.ip, sizeof(ip));
840 	memcpy(&udp, &pkt->ipv4.udp, sizeof(udp));
/* --- source address / port --- */
842 	ip_sum = udp_sum = 0;
843 	naddr = oaddr = ntohl(ip.ip_src.s_addr);
844 	nport = oport = ntohs(udp.uh_sport);
845 	if (g->options & OPT_RANDOM_SRC) {
846 		ip.ip_src.s_addr = nrand48(t->seed);
847 		udp.uh_sport = nrand48(t->seed);
848 		naddr = ntohl(ip.ip_src.s_addr);
849 		nport = ntohs(udp.uh_sport);
850 		ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
851 		udp_sum = new_udp_sum(udp_sum, oport, nport);
/* Sweep ports first; when they wrap, step the address. */
853 		if (oport < g->src_ip.port1) {
855 			udp.uh_sport = htons(nport);
856 			udp_sum = new_udp_sum(udp_sum, oport, nport);
859 		nport = g->src_ip.port0;
860 		udp.uh_sport = htons(nport);
861 		if (oaddr < g->src_ip.ipv4.end) {
863 			ip.ip_src.s_addr = htonl(naddr);
864 			ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
867 		naddr = g->src_ip.ipv4.start;
868 		ip.ip_src.s_addr = htonl(naddr);
869 		ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
/* --- destination address / port (same scheme) --- */
872 	naddr = oaddr = ntohl(ip.ip_dst.s_addr);
873 	nport = oport = ntohs(udp.uh_dport);
874 	if (g->options & OPT_RANDOM_DST) {
875 		ip.ip_dst.s_addr = nrand48(t->seed);
876 		udp.uh_dport = nrand48(t->seed);
877 		naddr = ntohl(ip.ip_dst.s_addr);
878 		nport = ntohs(udp.uh_dport);
879 		ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
880 		udp_sum = new_udp_sum(udp_sum, oport, nport);
882 		if (oport < g->dst_ip.port1) {
884 			udp.uh_dport = htons(nport);
885 			udp_sum = new_udp_sum(udp_sum, oport, nport);
888 		nport = g->dst_ip.port0;
889 		udp.uh_dport = htons(nport);
890 		if (oaddr < g->dst_ip.ipv4.end) {
892 			ip.ip_dst.s_addr = htonl(naddr);
893 			ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
896 		naddr = g->dst_ip.ipv4.start;
897 		ip.ip_dst.s_addr = htonl(naddr);
898 		ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
/* Fold accumulated deltas into the stored checksums; addresses also
 * affect the UDP pseudo-header sum, hence the second uh_sum update. */
901 	/* update checksums */
903 		udp.uh_sum = ~cksum_add(~udp.uh_sum, htons(udp_sum));
905 		ip.ip_sum = ~cksum_add(~ip.ip_sum, htons(ip_sum));
906 		udp.uh_sum = ~cksum_add(~udp.uh_sum, htons(ip_sum));
908 	memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
909 	memcpy(&pkt->ipv4.udp, &udp, sizeof(udp));
/* Expose the 16-bit view of in6_addr on platforms that hide it. */
913 #define s6_addr16	__u6_addr.__u6_addr16
/*
 * IPv6 counterpart of update_ip(): only the single 16-bit group selected
 * by sgroup/egroup is swept or randomized, and only the UDP checksum is
 * patched (IPv6 has no header checksum).
 */
916 update_ip6(struct pkt *pkt, struct targ *t)
918 	struct glob_arg *g = t->g;
922 	uint16_t oaddr, naddr;
923 	uint16_t oport, nport;
926 	memcpy(&ip6, &pkt->ipv6.ip, sizeof(ip6));
927 	memcpy(&udp, &pkt->ipv6.udp, sizeof(udp));
/* --- source group / port --- */
930 	group = g->src_ip.ipv6.sgroup;
931 	naddr = oaddr = ntohs(ip6.ip6_src.s6_addr16[group]);
932 	nport = oport = ntohs(udp.uh_sport);
933 	if (g->options & OPT_RANDOM_SRC) {
934 		ip6.ip6_src.s6_addr16[group] = nrand48(t->seed);
935 		udp.uh_sport = nrand48(t->seed);
936 		naddr = ntohs(ip6.ip6_src.s6_addr16[group]);
937 		nport = ntohs(udp.uh_sport);
940 		if (oport < g->src_ip.port1) {
942 			udp.uh_sport = htons(nport);
945 		nport = g->src_ip.port0;
946 		udp.uh_sport = htons(nport);
947 		if (oaddr < ntohs(g->src_ip.ipv6.end.s6_addr16[group])) {
949 			ip6.ip6_src.s6_addr16[group] = htons(naddr);
952 		naddr = ntohs(g->src_ip.ipv6.start.s6_addr16[group]);
953 		ip6.ip6_src.s6_addr16[group] = htons(naddr);
955 	/* update checksums if needed */
957 		udp_sum = cksum_add(~oaddr, naddr);
959 		udp_sum = cksum_add(udp_sum,
960 				cksum_add(~oport, nport));
/* --- destination group / port (same scheme) --- */
962 	group = g->dst_ip.ipv6.egroup;
963 	naddr = oaddr = ntohs(ip6.ip6_dst.s6_addr16[group]);
964 	nport = oport = ntohs(udp.uh_dport);
965 	if (g->options & OPT_RANDOM_DST) {
966 		ip6.ip6_dst.s6_addr16[group] = nrand48(t->seed);
967 		udp.uh_dport = nrand48(t->seed);
968 		naddr = ntohs(ip6.ip6_dst.s6_addr16[group]);
969 		nport = ntohs(udp.uh_dport);
972 		if (oport < g->dst_ip.port1) {
974 			udp.uh_dport = htons(nport);
977 		nport = g->dst_ip.port0;
978 		udp.uh_dport = htons(nport);
979 		if (oaddr < ntohs(g->dst_ip.ipv6.end.s6_addr16[group])) {
981 			ip6.ip6_dst.s6_addr16[group] = htons(naddr);
984 		naddr = ntohs(g->dst_ip.ipv6.start.s6_addr16[group]);
985 		ip6.ip6_dst.s6_addr16[group] = htons(naddr);
987 	/* update checksums */
989 		udp_sum = cksum_add(udp_sum,
990 				cksum_add(~oaddr, naddr));
992 		udp_sum = cksum_add(udp_sum,
993 				cksum_add(~oport, nport));
995 		udp.uh_sum = ~cksum_add(~udp.uh_sum, udp_sum);
996 	memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6));
997 	memcpy(&pkt->ipv6.udp, &udp, sizeof(udp));
/* Dispatch to the IPv4 or IPv6 address updater based on configured af. */
1001 update_addresses(struct pkt *pkt, struct targ *t)
1004 	if (t->g->af == AF_INET)
/*
 * Resize the IPv4 template to `size` total bytes: fix ip_len/uh_ulen,
 * patch the IP checksum incrementally, and recompute the UDP checksum
 * from scratch over payload + pseudo header.
 */
1011 update_ip_size(struct pkt *pkt, int size)
1015 	uint16_t oiplen, niplen;
1017 	uint16_t ip_sum = 0;
1019 	memcpy(&ip, &pkt->ipv4.ip, sizeof(ip));
1020 	memcpy(&udp, &pkt->ipv4.udp, sizeof(udp));
1022 	oiplen = ntohs(ip.ip_len);
1023 	niplen = size - sizeof(struct ether_header);
1024 	ip.ip_len = htons(niplen);
1025 	nudplen = niplen - sizeof(struct ip);
1026 	udp.uh_ulen = htons(nudplen);
/* NOTE(review): reuses new_udp_sum for the 16-bit ip_len delta. */
1027 	ip_sum = new_udp_sum(ip_sum, oiplen, niplen);
1029 	/* update checksums */
1031 	ip.ip_sum = ~cksum_add(~ip.ip_sum, htons(ip_sum));
1034 	/* Magic: taken from sbin/dhclient/packet.c */
1035 	udp.uh_sum = wrapsum(
1036 		checksum(&udp, sizeof(udp),	/* udp header */
1037 		checksum(pkt->ipv4.body,	/* udp payload */
1038 		nudplen - sizeof(udp),
1039 		checksum(&ip.ip_src, /* pseudo header */
1040 		2 * sizeof(ip.ip_src),
1041 		IPPROTO_UDP + (u_int32_t)ntohs(udp.uh_ulen)))));
1043 	memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
1044 	memcpy(&pkt->ipv4.udp, &udp, sizeof(udp));
/*
 * IPv6 counterpart: set ip6_plen/uh_ulen and recompute the UDP checksum
 * (pseudo-header length+next-header folded in via csum).
 */
1048 update_ip6_size(struct pkt *pkt, int size)
1052 	uint16_t niplen, nudplen;
1055 	memcpy(&ip6, &pkt->ipv6.ip, sizeof(ip6));
1056 	memcpy(&udp, &pkt->ipv6.udp, sizeof(udp));
1058 	nudplen = niplen = size - sizeof(struct ether_header) - sizeof(ip6);
1059 	ip6.ip6_plen = htons(niplen);
1060 	udp.uh_ulen = htons(nudplen);
1062 	/* Save part of pseudo header checksum into csum */
1064 	csum = IPPROTO_UDP << 24;
1065 	csum = checksum(&csum, sizeof(csum), nudplen);
1066 	udp.uh_sum = wrapsum(
1067 		checksum(&udp, sizeof(udp),	/* udp header */
1068 		checksum(pkt->ipv6.body,	/* udp payload */
1069 		nudplen - sizeof(udp),
1070 		checksum(&pkt->ipv6.ip.ip6_src, /* pseudo header */
1071 		2 * sizeof(pkt->ipv6.ip.ip6_src), csum))));
1073 	memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6));
1074 	memcpy(&pkt->ipv6.udp, &udp, sizeof(udp));
/* Resize only when OPT_UPDATE_CSUM asks for valid checksums. */
1078 update_size(struct pkt *pkt, struct targ *t, int size)
1080 	if (t->g->options & OPT_UPDATE_CSUM) {
1081 		if (t->g->af == AF_INET)
1082 			update_ip_size(pkt, size);
1084 			update_ip6_size(pkt, size);
/*
 * Build the template packet for a thread: either load a frame from a
 * pcap file (-P), or synthesize Ethernet + IPv4/IPv6 + UDP headers with
 * the payload string repeated to fill the body, then compute the UDP
 * checksum. The vnet header is zeroed last.
 */
1089  * initialize one packet and prepare for the next one.
1090  * The copy could be done better instead of repeating it each time.
1093 initialize_packet(struct targ *targ)
1095 	struct pkt *pkt = &targ->pkt;
1096 	struct ether_header *eh;
1103 	const char *payload = targ->g->options & OPT_INDIRECT ?
1104 		indirect_payload : default_payload;
1105 	int i, l0 = strlen(payload);
1108 	char errbuf[PCAP_ERRBUF_SIZE];
1110 	struct pcap_pkthdr *header;
1111 	const unsigned char *packet;
1113 	/* Read a packet from a PCAP file if asked. */
1114 	if (targ->g->packet_file != NULL) {
1115 		if ((file = pcap_open_offline(targ->g->packet_file,
1117 			D("failed to open pcap file %s",
1118 			    targ->g->packet_file);
1119 		if (pcap_next_ex(file, &header, &packet) < 0)
1120 			D("failed to read packet from %s",
1121 			    targ->g->packet_file);
1122 		if ((targ->frame = malloc(header->caplen)) == NULL)
1124 		bcopy(packet, (unsigned char *)targ->frame, header->caplen);
1125 		targ->g->pkt_size = header->caplen;
/* Payload length = frame minus ethernet and L3 header (UDP hdr handled below). */
1131 	paylen = targ->g->pkt_size - sizeof(*eh) -
1132 	    (targ->g->af == AF_INET ? sizeof(ip): sizeof(ip6));
1134 	/* create a nice NUL-terminated string */
1135 	for (i = 0; i < paylen; i += l0) {
1136 		if (l0 > paylen - i)
1137 			l0 = paylen - i; // last round
1138 		bcopy(payload, PKT(pkt, body, targ->g->af) + i, l0);
1140 	PKT(pkt, body, targ->g->af)[i - 1] = '\0';
1142 	/* prepare the headers */
1144 	bcopy(&targ->g->src_mac.start, eh->ether_shost, 6);
1145 	bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6);
1147 	if (targ->g->af == AF_INET) {
1148 		eh->ether_type = htons(ETHERTYPE_IP);
1149 		memcpy(&ip, &pkt->ipv4.ip, sizeof(ip));
1150 		udp_ptr = &pkt->ipv4.udp;
1151 		ip.ip_v = IPVERSION;
1152 		ip.ip_hl = sizeof(ip) >> 2;
1154 		ip.ip_tos = IPTOS_LOWDELAY;
1155 		ip.ip_len = htons(targ->g->pkt_size - sizeof(*eh));
1157 		ip.ip_off = htons(IP_DF); /* Don't fragment */
1158 		ip.ip_ttl = IPDEFTTL;
1159 		ip.ip_p = IPPROTO_UDP;
1160 		ip.ip_dst.s_addr = htonl(targ->g->dst_ip.ipv4.start);
1161 		ip.ip_src.s_addr = htonl(targ->g->src_ip.ipv4.start);
1162 		ip.ip_sum = wrapsum(checksum(&ip, sizeof(ip), 0));
1163 		memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
1165 		eh->ether_type = htons(ETHERTYPE_IPV6);
/* NOTE(review): source reads ipv4.ip into ip6 here — presumably the
 * union members overlap at the same offset; verify against upstream. */
1166 		memcpy(&ip6, &pkt->ipv4.ip, sizeof(ip6));
1167 		udp_ptr = &pkt->ipv6.udp;
1169 		ip6.ip6_plen = htons(paylen);
1170 		ip6.ip6_vfc = IPV6_VERSION;
1171 		ip6.ip6_nxt = IPPROTO_UDP;
1172 		ip6.ip6_hlim = IPV6_DEFHLIM;
1173 		ip6.ip6_src = targ->g->src_ip.ipv6.start;
1174 		ip6.ip6_dst = targ->g->dst_ip.ipv6.start;
1176 	memcpy(&udp, udp_ptr, sizeof(udp));
1178 	udp.uh_sport = htons(targ->g->src_ip.port0);
1179 	udp.uh_dport = htons(targ->g->dst_ip.port0);
1180 	udp.uh_ulen = htons(paylen);
1181 	if (targ->g->af == AF_INET) {
1182 		/* Magic: taken from sbin/dhclient/packet.c */
1183 		udp.uh_sum = wrapsum(
1184 		    checksum(&udp, sizeof(udp),	/* udp header */
1185 		    checksum(pkt->ipv4.body,	/* udp payload */
1186 		    paylen - sizeof(udp),
1187 		    checksum(&pkt->ipv4.ip.ip_src, /* pseudo header */
1188 		    2 * sizeof(pkt->ipv4.ip.ip_src),
1189 		    IPPROTO_UDP + (u_int32_t)ntohs(udp.uh_ulen)))));
1190 		memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
1192 		/* Save part of pseudo header checksum into csum */
1193 		csum = IPPROTO_UDP << 24;
1194 		csum = checksum(&csum, sizeof(csum), paylen);
1195 		udp.uh_sum = wrapsum(
1196 		    checksum(udp_ptr, sizeof(udp),	/* udp header */
1197 		    checksum(pkt->ipv6.body,	/* udp payload */
1198 		    paylen - sizeof(udp),
1199 		    checksum(&pkt->ipv6.ip.ip6_src, /* pseudo header */
1200 		    2 * sizeof(pkt->ipv6.ip.ip6_src), csum))));
1201 		memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6));
1203 	memcpy(udp_ptr, &udp, sizeof(udp));
1205 	bzero(&pkt->vh, sizeof(pkt->vh));
1206 	// dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0);
/*
 * Query the port's virtio-net header length via NETMAP_REQ_PORT_HDR_GET
 * and store it in g->virt_header.
 */
1210 get_vnet_hdr_len(struct glob_arg *g)
1212 	struct nmreq_header hdr;
1213 	struct nmreq_port_hdr ph;
1216 	hdr = g->nmd->hdr; /* copy name and version */
1217 	hdr.nr_reqtype = NETMAP_REQ_PORT_HDR_GET;
1219 	memset(&ph, 0, sizeof(ph));
1220 	hdr.nr_body = (uintptr_t)&ph;
1221 	err = ioctl(g->main_fd, NIOCCTRL, &hdr);
1223 		D("Unable to get virtio-net header length");
1227 	g->virt_header = ph.nr_hdr_len;
1228 	if (g->virt_header) {
1229 		D("Port requires virtio-net header, length = %d",
/*
 * Push g->virt_header back to the port via NETMAP_REQ_PORT_HDR_SET
 * (ph.nr_hdr_len assignment elided in this extract).
 */
1235 set_vnet_hdr_len(struct glob_arg *g)
1237 	int err, l = g->virt_header;
1238 	struct nmreq_header hdr;
1239 	struct nmreq_port_hdr ph;
1244 	hdr = g->nmd->hdr; /* copy name and version */
1245 	hdr.nr_reqtype = NETMAP_REQ_PORT_HDR_SET;
1247 	memset(&ph, 0, sizeof(ph));
1248 	hdr.nr_body = (uintptr_t)&ph;
1249 	err = ioctl(g->main_fd, NIOCCTRL, &hdr);
1251 		D("Unable to set virtio-net header length %d", l);
/*
 * Enqueue up to `count` copies of `frame` (size bytes) on a tx ring,
 * honoring OPT_COPY / OPT_INDIRECT / OPT_RUBBISH / multi-fragment mode.
 * NS_REPORT is set on the last slot; returns via ring->head/cur update.
 */
1256  * create and enqueue a batch of packets on a ring.
1257  * On the last one set NS_REPORT to tell the driver to generate
1258  * an interrupt when done.
1261 send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame,
1262 		int size, struct targ *t, u_int count, int options)
1264 	u_int n, sent, head = ring->head;
1265 	u_int frags = t->frags;
1266 	u_int frag_size = t->frag_size;
1267 	struct netmap_slot *slot = &ring->slot[head];
1269 	n = nm_ring_space(ring);
/* Warm the cache: prefetch every buffer we are about to touch. */
1271 	if (options & (OPT_COPY | OPT_PREFETCH) ) {
1272 		for (sent = 0; sent < count; sent++) {
1273 			struct netmap_slot *slot = &ring->slot[head];
1274 			char *p = NETMAP_BUF(ring, slot->buf_idx);
1276 			__builtin_prefetch(p);
1277 			head = nm_ring_next(ring, head);
/* Main fill loop: stop when count reached or not enough slots for frags. */
1282 	for (sent = 0; sent < count && n >= frags; sent++, n--) {
1285 		u_int tosend = size;
1287 		slot = &ring->slot[head];
1288 		p = NETMAP_BUF(ring, slot->buf_idx);
1289 		buf_changed = slot->flags & NS_BUF_CHANGED;
1292 		if (options & OPT_RUBBISH) {
1294 		} else if (options & OPT_INDIRECT) {
1295 			slot->flags |= NS_INDIRECT;
1296 			slot->ptr = (uint64_t)((uintptr_t)frame);
1297 		} else if (frags > 1) {
1299 			const char *f = frame;
/* Split the frame across frags slots, all but the last NS_MOREFRAG. */
1301 			for (i = 0; i < frags - 1; i++) {
1302 				memcpy(fp, f, frag_size);
1303 				slot->len = frag_size;
1304 				slot->flags = NS_MOREFRAG;
1305 				if (options & OPT_DUMP)
1306 					dump_payload(fp, frag_size, ring, head);
1307 				tosend -= frag_size;
1309 				head = nm_ring_next(ring, head);
1310 				slot = &ring->slot[head];
1311 				fp = NETMAP_BUF(ring, slot->buf_idx);
1316 			memcpy(p, f, tosend);
1317 			update_addresses(pkt, t);
1318 		} else if ((options & (OPT_COPY | OPT_MEMCPY)) || buf_changed) {
1319 			if (options & OPT_COPY)
1320 				nm_pkt_copy(frame, p, size);
1322 				memcpy(p, frame, size);
1323 			update_addresses(pkt, t);
1324 		} else if (options & OPT_PREFETCH) {
1325 			__builtin_prefetch(p);
1328 		if (options & OPT_DUMP)
1329 			dump_payload(p, tosend, ring, head);
1330 		head = nm_ring_next(ring, head);
1333 	slot->flags |= NS_REPORT;
1334 	ring->head = ring->cur = head;
1337 		/* tell netmap that we need more slots */
1338 		ring->cur = ring->tail;
/* Index of the highest bit set in a 64-bit value (loop body elided). */
1345  * Index of the highest bit set
1350 	uint64_t m = 1ULL << 63;
1353 	for (i = 63; i >= 0; i--, m >>=1)
/*
 * Sleep/busy-wait until absolute time ts: sleep when more than ~1ms
 * remains, otherwise spin. Returns the actual wakeup time.
 */
1360  * wait until ts, either busy or sleeping if more than 1ms.
1361  * Return wakeup time.
1363 static struct timespec
1364 wait_time(struct timespec ts)
1367 	struct timespec w, cur;
1368 	clock_gettime(CLOCK_REALTIME_PRECISE, &cur);
1369 	w = timespec_sub(ts, cur);
1372 	else if (w.tv_sec > 0 || w.tv_nsec > 1000000)
/*
 * Ping thread: transmit timestamped packets (4-byte sequence at payload
 * offset 42, compact tstamp at 46), read echoes back, and accumulate
 * RTT min/average plus a log2 histogram printed roughly once a second.
 * Only works single-threaded and (warning only) with IPv4.
 */
1378  * Send a packet, and wait for a response.
1379  * The payload (after UDP header, ofs 42) has a 4-byte sequence
1380  * followed by a struct timeval (or bintime?)
1384 ping_body(void *data)
1386 	struct targ *targ = (struct targ *) data;
1387 	struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
1388 	struct netmap_if *nifp = targ->nmd->nifp;
1392 	struct timespec ts, now, last_print;
1393 	struct timespec nexttime = {0, 0}; /* silence compiler */
1394 	uint64_t sent = 0, n = targ->g->npackets;
1395 	uint64_t count = 0, t_cur, t_min = ~0, av = 0;
1396 	uint64_t g_min = ~0, g_av = 0;
1397 	uint64_t buckets[64];	/* bins for delays, ns */
1398 	int rate_limit = targ->g->tx_rate, tosend = 0;
/* Frame starts after the unused part of the vnet header. */
1400 	frame = (char*)&targ->pkt + sizeof(targ->pkt.vh) - targ->g->virt_header;
1401 	size = targ->g->pkt_size + targ->g->virt_header;
1404 	if (targ->g->nthreads > 1) {
1405 		D("can only ping with 1 thread");
1409 	if (targ->g->af == AF_INET6) {
1410 		D("Warning: ping-pong with IPv6 not supported");
1413 	bzero(&buckets, sizeof(buckets));
1414 	clock_gettime(CLOCK_REALTIME_PRECISE, &last_print);
/* Start ~2s in the future on a whole-second boundary. */
1417 	targ->tic = timespec_add(now, (struct timespec){2,0});
1418 	targ->tic.tv_nsec = 0;
1419 	wait_time(targ->tic);
1420 	nexttime = targ->tic;
/* Main loop: send a burst, then drain replies and update stats. */
1422 	while (!targ->cancel && (n == 0 || sent < n)) {
1423 		struct netmap_ring *ring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring);
1424 		struct netmap_slot *slot;
1427 		uint64_t limit, event = 0;
1429 		if (rate_limit && tosend <= 0) {
1430 			tosend = targ->g->burst;
1431 			nexttime = timespec_add(nexttime, targ->g->tx_period);
1432 			wait_time(nexttime);
1435 		limit = rate_limit ? tosend : targ->g->burst;
1436 		if (n > 0 && n - sent < limit)
1438 		for (m = 0; (unsigned)m < limit; m++) {
1439 			slot = &ring->slot[ring->head];
1441 			p = NETMAP_BUF(ring, slot->buf_idx);
1443 			if (nm_ring_empty(ring)) {
1444 				D("-- ouch, cannot send");
/* Stamp sequence number and tx time into the outgoing copy. */
1448 				nm_pkt_copy(frame, p, size);
1449 				clock_gettime(CLOCK_REALTIME_PRECISE, &ts);
1450 				bcopy(&sent, p+42, sizeof(sent));
1451 				tp = (struct tstamp *)(p+46);
1452 				tp->sec = (uint32_t)ts.tv_sec;
1453 				tp->nsec = (uint32_t)ts.tv_nsec;
1455 				ring->head = ring->cur = nm_ring_next(ring, ring->head);
1460 		targ->ctr.pkts = sent;
1461 		targ->ctr.bytes = sent*size;
1462 		targ->ctr.events = event;
/* Kick tx, then either busy-wait (BUSYWAIT) or poll for replies. */
1466 		rv = ioctl(pfd.fd, NIOCTXSYNC, NULL);
1468 			D("TXSYNC error on queue %d: %s", targ->me,
1472 		ioctl(pfd.fd, NIOCRXSYNC, NULL);
1474 		/* should use a parameter to decide how often to send */
1475 		if ( (rv = poll(&pfd, 1, 3000)) <= 0) {
1476 			D("poll error on queue %d: %s", targ->me,
1477 				(rv ? strerror(errno) : "timeout"));
1480 #endif /* BUSYWAIT */
1481 		/* see what we got back */
1485 		for (i = targ->nmd->first_rx_ring;
1486 			i <= targ->nmd->last_rx_ring; i++) {
1487 			ring = NETMAP_RXRING(nifp, i);
1488 			while (!nm_ring_empty(ring)) {
1493 				slot = &ring->slot[ring->head];
1494 				p = NETMAP_BUF(ring, slot->buf_idx);
/* Compute RTT = now - embedded tx timestamp, normalizing tv_nsec. */
1496 				clock_gettime(CLOCK_REALTIME_PRECISE, &now);
1497 				bcopy(p+42, &seq, sizeof(seq));
1498 				tp = (struct tstamp *)(p+46);
1499 				ts.tv_sec = (time_t)tp->sec;
1500 				ts.tv_nsec = (long)tp->nsec;
1501 				ts.tv_sec = now.tv_sec - ts.tv_sec;
1502 				ts.tv_nsec = now.tv_nsec - ts.tv_nsec;
1503 				if (ts.tv_nsec < 0) {
1504 					ts.tv_nsec += 1000000000;
1507 				if (0) D("seq %d/%llu delta %d.%09d", seq,
1508 					(unsigned long long)sent,
1509 					(int)ts.tv_sec, (int)ts.tv_nsec);
1510 				t_cur = ts.tv_sec * 1000000000UL + ts.tv_nsec;
1517 				/* now store it in a bucket */
1518 				ring->head = ring->cur = nm_ring_next(ring, ring->head);
1524 		//D("tx %d rx %d", sent, rx);
/* Once per second: print min/avg RTT and the bucketed histogram. */
1526 		ts.tv_sec = now.tv_sec - last_print.tv_sec;
1527 		ts.tv_nsec = now.tv_nsec - last_print.tv_nsec;
1528 		if (ts.tv_nsec < 0) {
1529 			ts.tv_nsec += 1000000000;
1532 		if (ts.tv_sec >= 1) {
1533 			D("count %d RTT: min %d av %d ns",
1534 				(int)count, (int)t_min, (int)(av/count));
1535 			int k, j, kmin, off;
1538 			for (kmin = 0; kmin < 64; kmin ++)
1541 			for (k = 63; k >= kmin; k--)
1546 			for (j = kmin; j <= k; j++) {
1547 				off += sprintf(buf + off, " %5d", (int)buckets[j]);
1549 			D("k: %d .. %d\n\t%s", 1<<kmin, 1<<k, buf);
1550 			bzero(&buckets, sizeof(buckets));
1560 		if (rx < m && ts.tv_sec <= 3 && !targ->cancel)
1562 #endif /* BUSYWAIT */
/* Final summary over the whole run. */
1566 	D("RTT over %llu packets: min %d av %d ns",
1567 		(long long unsigned)sent, (int)g_min,
1568 		(int)((double)g_av/sent));
1570 	targ->completed = 1;
1572 	/* reset the ``used`` flag. */
1580 * reply to ping requests
1583 pong_body(void *data)
1585 struct targ *targ = (struct targ *) data;
1586 struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
1587 struct netmap_if *nifp = targ->nmd->nifp;
1588 struct netmap_ring *txring, *rxring;
/* 'sent' counts echoed packets; 'n' is the requested count (0 = run
 * until cancelled). */
1590 uint64_t sent = 0, n = targ->g->npackets;
/* pong mode only supports a single thread */
1592 if (targ->g->nthreads > 1) {
1593 D("can only reply ping with 1 thread");
1597 D("understood ponger %llu but don't know how to do it",
1598 (unsigned long long)n);
1600 if (targ->g->af == AF_INET6) {
1601 D("Warning: ping-pong with IPv6 not supported");
/* main loop: drain the RX rings and bounce each packet back on the
 * first TX ring with MAC and IPv4 addresses swapped */
1604 while (!targ->cancel && (n == 0 || sent < n)) {
1605 uint32_t txhead, txavail;
1608 ioctl(pfd.fd, NIOCRXSYNC, NULL);
1611 if ( (rv = poll(&pfd, 1, 1000)) <= 0) {
1612 D("poll error on queue %d: %s", targ->me,
1613 rv ? strerror(errno) : "timeout");
/* all replies are queued on the first TX ring */
1617 txring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring);
1618 txhead = txring->head;
1619 txavail = nm_ring_space(txring);
1620 /* see what we got back */
1621 for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
1622 rxring = NETMAP_RXRING(nifp, i);
1623 while (!nm_ring_empty(rxring)) {
1624 uint16_t *spkt, *dpkt;
1625 uint32_t head = rxring->head;
1626 struct netmap_slot *slot = &rxring->slot[head];
1628 src = NETMAP_BUF(rxring, slot->buf_idx);
1629 //D("got pkt %p of size %d", src, slot->len);
/* consume the RX slot before building the reply */
1630 rxring->head = rxring->cur = nm_ring_next(rxring, head);
1633 dst = NETMAP_BUF(txring,
1634 txring->slot[txhead].buf_idx);
/* view both frames as arrays of 16-bit words so the address
 * swaps below can be done word by word */
1636 dpkt = (uint16_t *)dst;
1637 spkt = (uint16_t *)src;
1638 nm_pkt_copy(src, dst, slot->len);
1639 /* swap source and destination MAC */
1646 /* swap source and destination IPv4 */
/* word 6 is the Ethernet ethertype; words 13..16 (byte
 * offsets 26..33) cover the IPv4 src/dst address fields */
1647 if (spkt[6] == htons(ETHERTYPE_IP)) {
1648 dpkt[13] = spkt[15];
1649 dpkt[14] = spkt[16];
1650 dpkt[15] = spkt[13];
1651 dpkt[16] = spkt[14];
1653 txring->slot[txhead].len = slot->len;
1654 //dump_payload(dst, slot->len, txring, txhead);
1655 txhead = nm_ring_next(txring, txhead);
/* publish the queued replies and push them to the NIC */
1660 txring->head = txring->cur = txhead;
1661 targ->ctr.pkts = sent;
1663 ioctl(pfd.fd, NIOCTXSYNC, NULL);
1667 targ->completed = 1;
1669 /* reset the ``used`` flag. */
/* sender_body(): main loop of a transmitter thread. Depending on
 * g->dev_type it writes frames to a tap fd, injects them via libpcap,
 * or sends through the netmap TX rings, honoring the per-thread packet
 * budget and the optional rate limit. */
1677 sender_body(void *data)
1679 struct targ *targ = (struct targ *) data;
1680 struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT };
1681 struct netmap_if *nifp;
1682 struct netmap_ring *txring = NULL;
/* per-thread share of the total packet budget (0 = unlimited) */
1684 uint64_t n = targ->g->npackets / targ->g->nthreads;
1687 int options = targ->g->options;
1688 struct timespec nexttime = { 0, 0}; // XXX silence compiler
1689 int rate_limit = targ->g->tx_rate;
1690 struct pkt *pkt = &targ->pkt;
/* choose the template frame: either the locally built packet
 * (adjusted for the virtio-net header size) or one supplied in
 * targ->frame (e.g. loaded from a pcap file) */
1694 if (targ->frame == NULL) {
1695 frame = (char *)pkt + sizeof(pkt->vh) - targ->g->virt_header;
1696 size = targ->g->pkt_size + targ->g->virt_header;
1698 frame = targ->frame;
1699 size = targ->g->pkt_size;
1702 D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd);
1703 if (setaffinity(targ->thread, targ->affinity))
/* synchronize threads: start on a whole-second boundary roughly
 * two seconds from now */
1707 clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
1709 targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
1710 targ->tic.tv_nsec = 0;
1711 wait_time(targ->tic);
1712 nexttime = targ->tic;
1714 if (targ->g->dev_type == DEV_TAP) {
1715 D("writing to file desc %d", targ->g->main_fd);
1717 for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
1718 if (write(targ->g->main_fd, frame, size) != -1)
1720 update_addresses(pkt, targ);
1722 targ->ctr.pkts = sent;
1723 targ->ctr.bytes = sent*size;
1724 targ->ctr.events = sent;
1729 } else if (targ->g->dev_type == DEV_PCAP) {
1730 pcap_t *p = targ->g->p;
1732 for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
1733 if (pcap_inject(p, frame, size) != -1)
1735 update_addresses(pkt, targ);
1737 targ->ctr.pkts = sent;
1738 targ->ctr.bytes = sent*size;
1739 targ->ctr.events = sent;
1743 #endif /* NO_PCAP */
/* DEV_NETMAP path: first compute how many fragments (slots) each
 * packet needs given the netmap buffer size and -F/-M settings */
1746 u_int bufsz, frag_size = targ->g->frag_size;
1748 nifp = targ->nmd->nifp;
1749 txring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring);
1750 bufsz = txring->nr_buf_size;
1751 if (bufsz < frag_size)
1753 targ->frag_size = targ->g->pkt_size / targ->frags;
1754 if (targ->frag_size > frag_size) {
1755 targ->frags = targ->g->pkt_size / frag_size;
1756 targ->frag_size = frag_size;
1757 if (targ->g->pkt_size % frag_size != 0)
1760 D("frags %u frag_size %u", targ->frags, targ->frag_size);
1762 /* mark all slots of all rings as changed so initial copy will be done */
1763 for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
1765 struct netmap_slot *slot;
1767 txring = NETMAP_TXRING(nifp, i);
1768 for (j = 0; j < txring->num_slots; j++) {
1769 slot = &txring->slot[j];
1770 slot->flags = NS_BUF_CHANGED;
/* main transmit loop */
1774 while (!targ->cancel && (n == 0 || sent < n)) {
/* when rate limiting, replenish the burst budget and sleep
 * until the next transmission slot */
1777 if (rate_limit && tosend <= 0) {
1778 tosend = targ->g->burst;
1779 nexttime = timespec_add(nexttime, targ->g->tx_period);
1780 wait_time(nexttime);
1784 * wait for available room in the send queue(s)
1788 if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) {
1789 D("ioctl error on queue %d: %s", targ->me,
1793 #else /* !BUSYWAIT */
1794 if ( (rv = poll(&pfd, 1, 2000)) <= 0) {
1797 D("poll error on queue %d: %s", targ->me,
1798 rv ? strerror(errno) : "timeout");
1801 if (pfd.revents & POLLERR) {
1802 D("poll error on %d ring %d-%d", pfd.fd,
1803 targ->nmd->first_tx_ring, targ->nmd->last_tx_ring);
1806 #endif /* !BUSYWAIT */
1808 * scan our queues and send on those with room
1810 for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
1812 uint64_t limit = rate_limit ? tosend : targ->g->burst;
1814 if (n > 0 && n == sent)
/* never send more than the remaining packet budget */
1817 if (n > 0 && n - sent < limit)
1819 txring = NETMAP_TXRING(nifp, i);
1820 if (nm_ring_empty(txring))
/* when a minimum size is configured, pick a random size in
 * [pkt_min_size, pkt_size) for this burst.
 * NOTE(review): if pkt_min_size == pkt_size the modulus below
 * is zero (undefined behavior) — confirm callers forbid that. */
1823 if (targ->g->pkt_min_size > 0) {
1824 size = nrand48(targ->seed) %
1825 (targ->g->pkt_size - targ->g->pkt_min_size) +
1826 targ->g->pkt_min_size;
1827 update_size(pkt, targ, size);
1829 m = send_packets(txring, pkt, frame, size, targ,
1831 ND("limit %lu tail %d m %d",
1832 limit, txring->tail, m);
1834 if (m > 0) //XXX-ste: can m be 0?
1836 targ->ctr.pkts = sent;
1837 targ->ctr.bytes += m*size;
1838 targ->ctr.events = event;
1846 /* flush any remaining packets */
1847 if (txring != NULL) {
1848 D("flush tail %d head %d on thread %p",
1849 txring->tail, txring->head,
1850 (void *)pthread_self());
1851 ioctl(pfd.fd, NIOCTXSYNC, NULL);
1854 /* final part: wait all the TX queues to be empty. */
1855 for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
1856 txring = NETMAP_TXRING(nifp, i);
1857 while (!targ->cancel && nm_tx_pending(txring)) {
1858 RD(5, "pending tx tail %d head %d on ring %d",
1859 txring->tail, txring->head, i);
1860 ioctl(pfd.fd, NIOCTXSYNC, NULL);
1861 usleep(1); /* wait 1 tick */
1864 } /* end DEV_NETMAP */
/* publish final statistics for the main thread to collect */
1866 clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1867 targ->completed = 1;
1868 targ->ctr.pkts = sent;
1869 targ->ctr.bytes = sent*size;
1870 targ->ctr.events = event;
1872 /* reset the ``used`` flag. */
/* receive_pcap(): pcap_dispatch() callback used in DEV_PCAP receive
 * mode. 'user' is really a struct my_ctrs*; accumulates the captured
 * length of each packet into the counters. */
1881 receive_pcap(u_char *user, const struct pcap_pkthdr * h,
1882 const u_char * bytes)
1884 struct my_ctrs *ctr = (struct my_ctrs *)user;
1885 (void)bytes; /* UNUSED */
1886 ctr->bytes += h->len;
1889 #endif /* !NO_PCAP */
/* receive_packets(): drain up to 'limit' slots from one RX ring,
 * adding each slot's length to *bytes and optionally dumping the
 * payload. Slots carrying NS_MOREFRAG belong to multi-slot packets.
 * Advances ring->head/cur past everything consumed. */
1893 receive_packets(struct netmap_ring *ring, u_int limit, int dump, uint64_t *bytes)
1903 n = nm_ring_space(ring);
1906 for (rx = 0; rx < limit; rx++) {
1907 struct netmap_slot *slot = &ring->slot[head];
1908 char *p = NETMAP_BUF(ring, slot->buf_idx);
1910 *bytes += slot->len;
1912 dump_payload(p, slot->len, ring, head);
/* only the last fragment of a packet lacks NS_MOREFRAG */
1913 if (!(slot->flags & NS_MOREFRAG))
1916 head = nm_ring_next(ring, head);
/* release consumed slots back to the kernel */
1918 ring->head = ring->cur = head;
/* receiver_body(): main loop of a receiver thread. Reads from a tap
 * fd, a pcap handle, or the netmap RX rings, accumulating per-thread
 * statistics in targ->ctr until cancelled, the packet budget is met,
 * or (netmap, non-forever) one second passes with no traffic. */
1924 receiver_body(void *data)
1926 struct targ *targ = (struct targ *) data;
1927 struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
1928 struct netmap_if *nifp;
1929 struct netmap_ring *rxring;
/* per-thread share of the total packet budget (0 = unlimited) */
1932 uint64_t n = targ->g->npackets / targ->g->nthreads;
1934 memset(&cur, 0, sizeof(cur));
1936 if (setaffinity(targ->thread, targ->affinity))
1939 D("reading from %s fd %d main_fd %d",
1940 targ->g->ifname, targ->fd, targ->g->main_fd);
1941 /* unbounded wait for the first packet. */
1942 for (;!targ->cancel;) {
1943 i = poll(&pfd, 1, 1000);
1944 if (i > 0 && !(pfd.revents & POLLERR))
1947 D("poll() error: %s", strerror(errno));
1950 if (pfd.revents & POLLERR) {
1954 RD(1, "waiting for initial packets, poll returns %d %d",
1957 /* main loop, exit after 1s silence */
1958 clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
1959 if (targ->g->dev_type == DEV_TAP) {
1960 while (!targ->cancel && (n == 0 || targ->ctr.pkts < n)) {
1961 char buf[MAX_BODYSIZE];
1962 /* XXX should we poll ? */
1963 i = read(targ->g->main_fd, buf, sizeof(buf));
1966 targ->ctr.bytes += i;
1971 } else if (targ->g->dev_type == DEV_PCAP) {
1972 while (!targ->cancel && (n == 0 || targ->ctr.pkts < n)) {
1973 /* XXX should we poll ? */
/* counters are updated by the receive_pcap() callback */
1974 pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap,
1975 (u_char *)&targ->ctr);
1978 #endif /* !NO_PCAP */
1980 int dump = targ->g->options & OPT_DUMP;
1982 nifp = targ->nmd->nifp;
1983 while (!targ->cancel && (n == 0 || targ->ctr.pkts < n)) {
1984 /* Once we started to receive packets, wait at most 1 seconds
1987 if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) {
1988 D("ioctl error on queue %d: %s", targ->me,
1992 #else /* !BUSYWAIT */
/* 1s of silence ends the run unless -P/forever is set */
1993 if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) {
1994 clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
1995 targ->toc.tv_sec -= 1; /* Subtract timeout time. */
1999 if (pfd.revents & POLLERR) {
2003 #endif /* !BUSYWAIT */
2004 uint64_t cur_space = 0;
2005 for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
2008 rxring = NETMAP_RXRING(nifp, i);
2009 /* compute free space in the ring */
2010 m = rxring->head + rxring->num_slots - rxring->tail;
2011 if (m >= (int) rxring->num_slots)
2012 m -= rxring->num_slots;
2014 if (nm_ring_empty(rxring))
2017 m = receive_packets(rxring, targ->g->burst, dump, &cur.bytes);
/* track the smallest free space seen, for the stats line */
2022 cur.min_space = targ->ctr.min_space;
2023 if (cur_space < cur.min_space)
2024 cur.min_space = cur_space;
2029 clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
2031 #if !defined(BUSYWAIT)
2034 targ->completed = 1;
2038 /* reset the ``used`` flag. */
/* txseq_body(): transmitter that embeds a 32-bit big-endian sequence
 * number in the first four payload bytes of every packet, patching the
 * UDP checksum incrementally, so that an rxseq peer can detect drops
 * and reordering. Single-threaded; uses only the first TX ring. */
2045 txseq_body(void *data)
2047 struct targ *targ = (struct targ *) data;
2048 struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT };
2049 struct netmap_ring *ring;
/* OPT_COPY is forced because the template is modified per packet */
2052 int options = targ->g->options | OPT_COPY;
2053 struct timespec nexttime = {0, 0};
2054 int rate_limit = targ->g->tx_rate;
2055 struct pkt *pkt = &targ->pkt;
2056 int frags = targ->g->frags;
2057 uint32_t sequence = 0;
2062 if (targ->g->nthreads > 1) {
2063 D("can only txseq ping with 1 thread");
2067 if (targ->g->npackets > 0) {
2068 D("Ignoring -n argument");
/* template frame, adjusted for the virtio-net header */
2071 frame = (char *)pkt + sizeof(pkt->vh) - targ->g->virt_header;
2072 size = targ->g->pkt_size + targ->g->virt_header;
2074 D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd);
2075 if (setaffinity(targ->thread, targ->affinity))
/* start on a whole-second boundary about two seconds from now */
2078 clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
2080 targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
2081 targ->tic.tv_nsec = 0;
2082 wait_time(targ->tic);
2083 nexttime = targ->tic;
2086 /* Only use the first queue. */
2087 ring = NETMAP_TXRING(targ->nmd->nifp, targ->nmd->first_tx_ring);
2089 while (!targ->cancel) {
/* replenish the burst budget; when rate limiting, also sleep
 * until the next transmission slot */
2098 budget = targ->g->burst;
2100 } else if (budget <= 0) {
2101 budget = targ->g->burst;
2102 nexttime = timespec_add(nexttime, targ->g->tx_period);
2103 wait_time(nexttime);
2106 /* wait for available room in the send queue */
2109 if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) {
2110 D("ioctl error on queue %d: %s", targ->me,
2114 #else /* !BUSYWAIT */
2115 if ( (rv = poll(&pfd, 1, 2000)) <= 0) {
2118 D("poll error on queue %d: %s", targ->me,
2119 rv ? strerror(errno) : "timeout");
2122 if (pfd.revents & POLLERR) {
2123 D("poll error on %d ring %d-%d", pfd.fd,
2124 targ->nmd->first_tx_ring, targ->nmd->last_tx_ring);
2127 #endif /* !BUSYWAIT */
2129 /* If no room poll() again. */
2130 space = nm_ring_space(ring);
2137 if (space < limit) {
2141 /* Cut off ``limit`` to make sure is multiple of ``frags``. */
2143 limit = (limit / frags) * frags;
2146 limit = sent + limit; /* Convert to absolute. */
2148 for (fcnt = frags, head = ring->head;
2149 sent < limit; sent++, sequence++) {
2150 struct netmap_slot *slot = &ring->slot[head];
2151 char *p = NETMAP_BUF(ring, slot->buf_idx);
/* 'w' walks the payload as 16-bit words so the checksum can
 * be fixed incrementally as bytes are overwritten */
2152 uint16_t *w = (uint16_t *)PKT(pkt, body, targ->g->af), t;
/* fetch the current UDP checksum from the template */
2154 memcpy(&sum, targ->g->af == AF_INET ? &pkt->ipv4.udp.uh_sum : &pkt->ipv6.udp.uh_sum, sizeof(sum));
/* store the sequence number big-endian in payload bytes 0..3,
 * updating the checksum word by word (RFC 1624 style) */
2158 PKT(pkt, body, targ->g->af)[0] = sequence >> 24;
2159 PKT(pkt, body, targ->g->af)[1] = (sequence >> 16) & 0xff;
2160 sum = ~cksum_add(~sum, cksum_add(~t, *w));
2162 PKT(pkt, body, targ->g->af)[2] = (sequence >> 8) & 0xff;
2163 PKT(pkt, body, targ->g->af)[3] = sequence & 0xff;
2164 sum = ~cksum_add(~sum, cksum_add(~t, *w));
/* write the corrected checksum back into the template */
2165 memcpy(targ->g->af == AF_INET ? &pkt->ipv4.udp.uh_sum : &pkt->ipv6.udp.uh_sum, &sum, sizeof(sum));
2166 nm_pkt_copy(frame, p, size);
/* rotate addresses once per packet (first fragment only) */
2167 if (fcnt == frags) {
2168 update_addresses(pkt, targ);
2171 if (options & OPT_DUMP) {
2172 dump_payload(p, size, ring, head);
2178 slot->flags |= NS_MOREFRAG;
2183 if (sent == limit - 1) {
2184 /* Make sure we don't push an incomplete
2186 assert(!(slot->flags & NS_MOREFRAG));
2187 slot->flags |= NS_REPORT;
2190 head = nm_ring_next(ring, head);
/* publish the burst to the kernel */
2196 ring->cur = ring->head = head;
2199 targ->ctr.pkts = sent;
2200 targ->ctr.bytes = sent * size;
2201 targ->ctr.events = event;
2204 /* flush any remaining packets */
2205 D("flush tail %d head %d on thread %p",
2206 ring->tail, ring->head,
2207 (void *)pthread_self());
2208 ioctl(pfd.fd, NIOCTXSYNC, NULL);
2210 /* final part: wait the TX queues to become empty. */
2211 while (!targ->cancel && nm_tx_pending(ring)) {
2212 RD(5, "pending tx tail %d head %d on ring %d",
2213 ring->tail, ring->head, targ->nmd->first_tx_ring);
2214 ioctl(pfd.fd, NIOCTXSYNC, NULL);
2215 usleep(1); /* wait 1 tick */
2218 clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
2219 targ->completed = 1;
2220 targ->ctr.pkts = sent;
2221 targ->ctr.bytes = sent * size;
2222 targ->ctr.events = event;
2224 /* reset the ``used`` flag. */
/* multi_slot_to_string(): render the (len, flags) of 'nfrags'
 * consecutive slots starting at 'head' as "|len,flags|..." into
 * strbuf, for diagnostic messages about multi-slot packets.
 * Stops early if the buffer would overflow. */
2232 multi_slot_to_string(struct netmap_ring *ring, unsigned int head,
2233 unsigned int nfrags, char *strbuf, size_t strbuflen)
2238 for (f = 0; f < nfrags; f++) {
2239 struct netmap_slot *slot = &ring->slot[head];
2240 int m = snprintf(strbuf, strbuflen, "|%u,%x|", slot->len,
/* snprintf return >= remaining space means truncation */
2242 if (m >= (int)strbuflen) {
2248 head = nm_ring_next(ring, head);
/* rxseq_body(): receiver that checks the 32-bit sequence numbers
 * embedded by a txseq peer, reporting gaps (likely drops) and
 * out-of-order packets per RX ring. Also validates the fragment
 * count of multi-slot packets against the first packet seen. */
2255 rxseq_body(void *data)
2257 struct targ *targ = (struct targ *) data;
2258 struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
2259 int dump = targ->g->options & OPT_DUMP;
2260 struct netmap_ring *ring;
/* fragments-per-packet learned from the first complete packet */
2261 unsigned int frags_exp = 1;
2263 unsigned int frags = 0;
2264 int first_packet = 1;
2266 int i, j, af, nrings;
/* one expected-sequence counter per RX ring */
2267 uint32_t seq, *seq_exp = NULL;
2269 memset(&cur, 0, sizeof(cur));
2271 if (setaffinity(targ->thread, targ->affinity))
2274 nrings = targ->nmd->last_rx_ring - targ->nmd->first_rx_ring + 1;
2275 seq_exp = calloc(nrings, sizeof(uint32_t));
2276 if (seq_exp == NULL) {
2277 D("failed to allocate seq array");
2281 D("reading from %s fd %d main_fd %d",
2282 targ->g->ifname, targ->fd, targ->g->main_fd);
2283 /* unbounded wait for the first packet. */
2284 for (;!targ->cancel;) {
2285 i = poll(&pfd, 1, 1000);
2286 if (i > 0 && !(pfd.revents & POLLERR))
2288 RD(1, "waiting for initial packets, poll returns %d %d",
2292 clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
2295 while (!targ->cancel) {
2300 if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) {
2301 D("ioctl error on queue %d: %s", targ->me,
2305 #else /* !BUSYWAIT */
/* 1s of silence ends the run unless forever mode is set */
2306 if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) {
2307 clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
2308 targ->toc.tv_sec -= 1; /* Subtract timeout time. */
2312 if (pfd.revents & POLLERR) {
2316 #endif /* !BUSYWAIT */
2318 for (j = targ->nmd->first_rx_ring; j <= targ->nmd->last_rx_ring; j++) {
2319 ring = NETMAP_RXRING(targ->nmd->nifp, j);
2320 if (nm_ring_empty(ring))
2323 limit = nm_ring_space(ring);
2324 if (limit > targ->g->burst)
2325 limit = targ->g->burst;
2329 * 1) we remove the early-return optimization from
2330 * the netmap poll implementation, or
2331 * 2) pipes get NS_MOREFRAG support.
2332 * With the current netmap implementation, an experiment like
2333 * pkt-gen -i vale:1{1 -f txseq -F 9
2334 * pkt-gen -i vale:1}1 -f rxseq
2335 * would get stuck as soon as we find nm_ring_space(ring) < 9,
2336 * since here limit is rounded to 0 and
2337 * pipe rxsync is not called anymore by the poll() of this loop.
2339 if (frags_exp > 1) {
2341 /* Cut off to the closest smaller multiple. */
2342 limit = (limit / frags_exp) * frags_exp;
2343 RD(2, "LIMIT %d --> %d", o, limit);
2347 for (head = ring->head, i = 0; i < limit; i++) {
2348 struct netmap_slot *slot = &ring->slot[head];
2349 char *p = NETMAP_BUF(ring, slot->buf_idx);
2350 int len = slot->len;
2354 dump_payload(p, slot->len, ring, head);
/* a slot without NS_MOREFRAG completes a packet */
2358 if (!(slot->flags & NS_MOREFRAG)) {
2361 } else if (frags != frags_exp) {
2363 RD(1, "Received packets with %u frags, "
2364 "expected %u, '%s'", frags, frags_exp,
2365 multi_slot_to_string(ring, head-frags+1,
2367 prbuf, sizeof(prbuf)));
/* step back over the virtio-net header to overlay struct pkt */
2374 p -= sizeof(pkt->vh) - targ->g->virt_header;
2375 len += sizeof(pkt->vh) - targ->g->virt_header;
2376 pkt = (struct pkt *)p;
2377 if (ntohs(pkt->eh.ether_type) == ETHERTYPE_IP)
/* reject packets too short to carry the 4-byte sequence */
2382 if ((char *)pkt + len < ((char *)PKT(pkt, body, af)) +
2384 RD(1, "%s: packet too small (len=%u)", __func__,
/* decode the big-endian 32-bit sequence number written by
 * the txseq sender into the first 4 payload bytes */
2387 seq = (PKT(pkt, body, af)[0] << 24) |
2388 (PKT(pkt, body, af)[1] << 16) |
2389 (PKT(pkt, body, af)[2] << 8) |
2390 PKT(pkt, body, af)[3];
2392 /* Grab the first one, whatever it
2396 } else if (seq != seq_exp[j]) {
/* modular distance distinguishes a forward gap (drops)
 * from an out-of-order (old) sequence number */
2397 uint32_t delta = seq - seq_exp[j];
2399 if (delta < (0xFFFFFFFF >> 1)) {
2400 RD(2, "Sequence GAP: exp %u found %u",
2403 RD(2, "Sequence OUT OF ORDER: "
2404 "exp %u found %u", seq_exp[j], seq);
2411 cur.bytes += slot->len;
2412 head = nm_ring_next(ring, head);
2416 ring->cur = ring->head = head;
2422 clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
2426 #endif /* !BUSYWAIT */
2427 targ->completed = 1;
2431 if (seq_exp != NULL)
2433 /* reset the ``used`` flag. */
/* tx_output(): print the final summary for a run — packet/byte/event
 * totals, then pps, bandwidth (payload and raw, i.e. including the
 * per-packet framing overhead from -B), and average batch size.
 * 'delta' is the elapsed time in seconds; 'msg' is "Sent"/"Received". */
2441 tx_output(struct glob_arg *g, struct my_ctrs *cur, double delta, const char *msg)
2443 double bw, raw_bw, pps, abs;
2444 char b1[40], b2[80], b3[80];
/* nothing transferred: avoid dividing by cur->pkts below */
2447 if (cur->pkts == 0) {
2448 printf("%s nothing.\n", msg);
/* average packet size over the whole run */
2452 size = (int)(cur->bytes / cur->pkts);
2454 printf("%s %llu packets %llu bytes %llu events %d bytes each in %.2f seconds.\n",
2456 (unsigned long long)cur->pkts,
2457 (unsigned long long)cur->bytes,
2458 (unsigned long long)cur->events, size, delta);
2461 if (size < 60) /* correct for min packet size */
2463 pps = cur->pkts / delta;
2464 bw = (8.0 * cur->bytes) / delta;
/* raw bandwidth adds g->framing bits of overhead per packet */
2465 raw_bw = (8.0 * cur->bytes + cur->pkts * g->framing) / delta;
/* average packets per I/O event (batch size) */
2466 abs = cur->pkts / (double)(cur->events);
2468 printf("Speed: %spps Bandwidth: %sbps (raw %sbps). Average batch: %.2f pkts\n",
2469 norm(b1, pps, normalize), norm(b2, bw, normalize), norm(b3, raw_bw, normalize), abs);
2475 /* This usage is generated from the pkt-gen man page:
2477 * and pasted here adding the string terminators and endlines with simple
2478 * regular expressions. */
2479 const char *cmd = "pkt-gen";
2483 " -h Show program usage and exit.\n"
2486 " Name of the network interface that pkt-gen operates on. It can be a system network interface\n"
2487 " (e.g., em0), the name of a vale(4) port (e.g., valeSSS:PPP), the name of a netmap pipe or\n"
2488 " monitor, or any valid netmap port name accepted by the nm_open library function, as docu-\n"
2489 " mented in netmap(4) (NIOCREGIF section).\n"
2492 " The function to be executed by pkt-gen. Specify tx for transmission, rx for reception, ping\n"
2493 " for client-side ping-pong operation, and pong for server-side ping-pong operation.\n"
2496 " Number of iterations of the pkt-gen function (with 0 meaning infinite). In case of tx or rx,\n"
2497 " count is the number of packets to receive or transmit. In case of ping or pong, count is the\n"
2498 " number of ping-pong transactions.\n"
2501 " Packet size in bytes excluding CRC. If passed a second time, use random sizes larger or\n"
2502 " equal than the second one and lower than the first one.\n"
2505 " Transmit or receive up to burst_size packets at a time.\n"
2507 " -4 Use IPv4 addresses.\n"
2509 " -6 Use IPv6 addresses.\n"
2511 " -d dst_ip[:port[-dst_ip:port]]\n"
2512 " Destination IPv4/IPv6 address and port, single or range.\n"
2514 " -s src_ip[:port[-src_ip:port]]\n"
2515 " Source IPv4/IPv6 address and port, single or range.\n"
2518 " Destination MAC address in colon notation (e.g., aa:bb:cc:dd:ee:00).\n"
2521 " Source MAC address in colon notation.\n"
2524 " Pin the first thread of pkt-gen to a particular CPU using pthread_setaffinity_np(3). If more\n"
2525 " threads are used, they are pinned to the subsequent CPUs, one per thread.\n"
2528 " Maximum number of CPUs to use (0 means to use all the available ones).\n"
2531 " Number of threads to use. By default, only a single thread is used to handle all the netmap\n"
2532 " rings. If threads is larger than one, each thread handles a single TX ring (in tx mode), a\n"
2533 " single RX ring (in rx mode), or a TX/RX ring pair. The number of threads must be less than or\n"
2534 " equal to the number of TX (or RX) rings available in the device specified by interface.\n"
2537 " Number of milliseconds between reports.\n"
2539 " -w wait_for_link_time\n"
2540 " Number of seconds to wait before starting the pkt-gen function, useful to make sure that the\n"
2541 " network link is up. A network device driver may take some time to enter netmap mode, or to\n"
2542 " create a new transmit/receive ring pair when netmap(4) requests one.\n"
2545 " Packet transmission rate. Not setting the packet transmission rate tells pkt-gen to transmit\n"
2546 " packets as quickly as possible. On servers from 2010 onward netmap(4) is able to com-\n"
2547 " pletely use all of the bandwidth of a 10 or 40Gbps link, so this option should be used unless\n"
2548 " your intention is to saturate the link.\n"
2550 " -X Dump payload of each packet transmitted or received.\n"
2552 " -H len Add empty virtio-net-header with size 'len'. Valid sizes are 0, 10 and 12. This option is\n"
2553 " only used with Virtual Machine technologies that use virtio as a network interface.\n"
2556 " Load the packet to be transmitted from a pcap file rather than constructing it within\n"
2559 " -z Use random IPv4/IPv6 src address/port.\n"
2561 " -Z Use random IPv4/IPv6 dst address/port.\n"
2563 " -N Do not normalize units (i.e., use bps, pps instead of Mbps, Kpps, etc.).\n"
2566 " Send multi-slot packets, each one with num_frags fragments. A multi-slot packet is repre-\n"
2567 " sented by two or more consecutive netmap slots with the NS_MOREFRAG flag set (except for the\n"
2568 " last slot). This is useful to transmit or receive packets larger than the netmap buffer\n"
2572 " In multi-slot mode, frag_size specifies the size of each fragment, if smaller than the packet\n"
2573 " length divided by num_frags.\n"
2575 " -I Use indirect buffers. It is only valid for transmitting on VALE ports, and it is implemented\n"
2576 " by setting the NS_INDIRECT flag in the netmap slots.\n"
2578 " -W Exit immediately if all the RX rings are empty the first time they are examined.\n"
2580 " -v Increase the verbosity level.\n"
2582 " -r In tx mode, do not initialize packets, but send whatever the content of the uninitialized\n"
2583 " netmap buffers is (rubbish mode).\n"
2585 " -A Compute mean and standard deviation (over a sliding window) for the transmit or receive rate.\n"
2587 " -B Take Ethernet framing and CRC into account when computing the average bps. This adds 4 bytes\n"
2588 " of CRC and 20 bytes of framing to each packet.\n"
2590 " -C tx_slots[,rx_slots[,tx_rings[,rx_rings]]]\n"
2591 " Configuration in terms of number of rings and slots to be used when opening the netmap port.\n"
2592 " Such configuration has an effect on software ports created on the fly, such as VALE ports and\n"
2593 " netmap pipes. The configuration may consist of 1 to 4 numbers separated by commas: tx_slots,\n"
2594 " rx_slots, tx_rings, rx_rings. Missing numbers or zeroes stand for default values. As an\n"
2595 " additional convenience, if exactly one number is specified, then this is assigned to both\n"
2596 " tx_slots and rx_slots. If there is no fourth number, then the third one is assigned to both\n"
2597 " tx_rings and rx_rings.\n"
2599 " -o options data generation options (parsed using atoi)\n"
2604 " OPT_TS 16 (add a timestamp)\n"
2605 " OPT_INDIRECT 32 (use indirect buffers)\n"
2606 " OPT_DUMP 64 (dump rx/tx traffic)\n"
2607 " OPT_RUBBISH 256\n"
2608 " (send whatever the buffers contain)\n"
2609 " OPT_RANDOM_SRC 512\n"
2610 " OPT_RANDOM_DST 1024\n"
2611 " OPT_PPS_STATS 2048\n"
2612 " OPT_UPDATE_CSUM 4096\n"
/* start_threads(): allocate the per-thread targ array, open one netmap
 * descriptor per thread (cloning the main one and binding each to a
 * single ring), initialize packets, then spawn the worker threads
 * running g->td_body. */
2619 start_threads(struct glob_arg *g) {
2622 targs = calloc(g->nthreads, sizeof(*targs));
2625 * Now create the desired number of threads, each one
2626 * using a single descriptor.
2628 for (i = 0; i < g->nthreads; i++) {
/* NOTE(review): both time(0) calls return the same value, so the
 * OR adds no entropy beyond a single call — likely intended to
 * combine two different sources; confirm and consider fixing. */
2629 uint64_t seed = (uint64_t)time(0) | ((uint64_t)time(0) << 32);
2632 bzero(t, sizeof(*t));
2633 t->fd = -1; /* default, with pcap */
2635 memcpy(t->seed, &seed, sizeof(t->seed));
2637 if (g->dev_type == DEV_NETMAP) {
2641 * if the user wants both HW and SW rings, we need to
2642 * know when to switch from NR_REG_ONE_NIC to NR_REG_ONE_SW
2644 if (g->orig_mode == NR_REG_NIC_SW) {
/* m = number of HW rings in the direction we use */
2645 m = (g->td_type == TD_TYPE_RECEIVER ?
2646 g->nmd->reg.nr_rx_rings :
2647 g->nmd->reg.nr_tx_rings);
2652 /* the first thread uses the fd opened by the main
2653 * thread, the other threads re-open /dev/netmap
2655 t->nmd = nmport_clone(g->nmd);
2660 if (m > 0 && j >= m) {
2661 /* switch to the software rings */
2662 t->nmd->reg.nr_mode = NR_REG_ONE_SW;
/* bind this thread's descriptor to ring j only */
2665 t->nmd->reg.nr_ringid = j & NETMAP_RING_MASK;
2666 /* Only touch one of the rings (rx is already ok) */
2667 if (g->td_type == TD_TYPE_RECEIVER)
2668 t->nmd->reg.nr_flags |= NETMAP_NO_TX_POLL;
2670 /* register interface. Override ifname and ringid etc. */
2671 if (nmport_open_desc(t->nmd) < 0) {
2672 nmport_undo_prepare(t->nmd);
2680 t->frags = g->frags;
/* non-netmap devices share the main fd */
2682 targs[i].fd = g->main_fd;
2686 if (g->affinity >= 0) {
/* pin threads to consecutive CPUs starting at -a affinity */
2687 t->affinity = (g->affinity + i) % g->cpus;
2691 /* default, init packets */
2692 initialize_packet(t);
2694 /* Wait for PHY reset. */
2695 D("Wait %d secs for phy reset", g->wait_link);
2696 sleep(g->wait_link);
/* all descriptors ready: launch the worker threads */
2699 for (i = 0; i < g->nthreads; i++) {
2701 if (pthread_create(&t->thread, NULL, g->td_body, t) == -1) {
2702 D("Unable to create thread %d: %s", i, strerror(errno));
/* main_thread(): periodic reporting loop. Every report_interval it
 * sums the per-thread counters, prints the instantaneous rate
 * (optionally with sliding-window avg/stddev when OPT_PPS_STATS),
 * then joins the workers and prints the final totals via tx_output(). */
2710 main_thread(struct glob_arg *g)
2714 struct my_ctrs prev, cur;
2716 struct timeval tic, toc;
2718 prev.pkts = prev.bytes = prev.events = 0;
2719 gettimeofday(&prev.t, NULL);
2721 char b1[40], b2[40], b3[40], b4[100];
2727 usec = wait_for_next_report(&prev.t, &cur.t,
2728 g->report_interval);
2730 cur.pkts = cur.bytes = cur.events = 0;
2732 if (usec < 10000) /* too short to be meaningful */
2734 /* accumulate counts for all threads */
2735 for (i = 0; i < g->nthreads; i++) {
2736 cur.pkts += targs[i].ctr.pkts;
2737 cur.bytes += targs[i].ctr.bytes;
2738 cur.events += targs[i].ctr.events;
2739 cur.min_space += targs[i].ctr.min_space;
/* reset so the next interval records a fresh minimum */
2740 targs[i].ctr.min_space = 99999;
2741 if (targs[i].used == 0)
/* per-interval deltas; pps is rounded to nearest */
2744 x.pkts = cur.pkts - prev.pkts;
2745 x.bytes = cur.bytes - prev.bytes;
2746 x.events = cur.events - prev.events;
2747 pps = (x.pkts*1000000 + usec/2) / usec;
2748 abs = (x.events > 0) ? (x.pkts / (double) x.events) : 0;
2750 if (!(g->options & OPT_PPS_STATS)) {
2753 /* Compute some pps stats using a sliding window. */
2754 double ppsavg = 0.0, ppsdev = 0.0;
/* circular buffer of the last STATS_WIN pps samples */
2757 g->win[g->win_idx] = pps;
2758 g->win_idx = (g->win_idx + 1) % STATS_WIN;
2760 for (i = 0; i < STATS_WIN; i++) {
2761 ppsavg += g->win[i];
2768 for (i = 0; i < STATS_WIN; i++) {
/* skip empty slots while the window is still filling */
2769 if (g->win[i] == 0) {
2772 ppsdev += (g->win[i] - ppsavg) * (g->win[i] - ppsavg);
2775 ppsdev = sqrt(ppsdev);
2777 snprintf(b4, sizeof(b4), "[avg/std %s/%s pps]",
2778 norm(b1, ppsavg, normalize), norm(b2, ppsdev, normalize));
2781 D("%spps %s(%spkts %sbps in %llu usec) %.2f avg_batch %d min_space",
2782 norm(b1, pps, normalize), b4,
2783 norm(b2, (double)x.pkts, normalize),
2784 norm(b3, 1000000*((double)x.bytes*8+(double)x.pkts*g->framing)/usec, normalize),
2785 (unsigned long long)usec,
2786 abs, (int)cur.min_space);
/* stop reporting once every worker thread has finished */
2789 if (done == g->nthreads)
2795 cur.pkts = cur.bytes = cur.events = 0;
2797 for (i = 0; i < g->nthreads; i++) {
2798 struct timespec t_tic, t_toc;
2800 * Join active threads, unregister interfaces and close
2804 pthread_join(targs[i].thread, NULL); /* blocking */
2805 if (g->dev_type == DEV_NETMAP) {
2806 nmport_close(targs[i].nmd);
2807 targs[i].nmd = NULL;
2812 if (targs[i].completed == 0)
2813 D("ouch, thread %d exited with error", i);
2816 * Collect threads output and extract information about
2817 * how long it took to send all the packets.
2819 cur.pkts += targs[i].ctr.pkts;
2820 cur.bytes += targs[i].ctr.bytes;
2821 cur.events += targs[i].ctr.events;
2822 /* collect the largest start (tic) and end (toc) times,
2823 * XXX maybe we should do the earliest tic, or do a weighted
2826 t_tic = timeval2spec(&tic);
2827 t_toc = timeval2spec(&toc);
2828 if (!timerisset(&tic) || timespec_ge(&targs[i].tic, &t_tic))
2829 tic = timespec2val(&targs[i].tic);
2830 if (!timerisset(&toc) || timespec_ge(&targs[i].toc, &t_toc))
2831 toc = timespec2val(&targs[i].toc);
/* elapsed wall-clock time for the whole run, in seconds */
2835 timersub(&toc, &tic, &toc);
2836 delta_t = toc.tv_sec + 1e-6* toc.tv_usec;
2837 if (g->td_type == TD_TYPE_SENDER)
2838 tx_output(g, &cur, delta_t, "Sent");
2839 else if (g->td_type == TD_TYPE_RECEIVER)
2840 tx_output(g, &cur, delta_t, "Received");
/* Dispatch table mapping the -f function name to the thread body that
 * implements it, plus its thread type and a per-mode default value
 * (512 for bulk tx/rx modes, 1 for ping/pong — presumably the default
 * burst size; confirm against struct td_desc). Terminated by a
 * NULL-name sentinel; "rx" (first entry) is the default. */
2850 static struct td_desc func[] = {
2851 { TD_TYPE_RECEIVER, "rx", receiver_body, 512}, /* default */
2852 { TD_TYPE_SENDER, "tx", sender_body, 512 },
2853 { TD_TYPE_OTHER, "ping", ping_body, 1 },
2854 { TD_TYPE_OTHER, "pong", pong_body, 1 },
2855 { TD_TYPE_SENDER, "txseq", txseq_body, 512 },
2856 { TD_TYPE_RECEIVER, "rxseq", rxseq_body, 512 },
2857 { 0, NULL, NULL, 0 }
/* tap_alloc(): open (and on Linux, create via TUNSETIFF) a tap
 * interface named in 'dev', writing the actual interface name back
 * into 'dev'. Returns the file descriptor used to exchange frames
 * with the virtual interface. */
2861 tap_alloc(char *dev)
2865 const char *clonedev = TAP_CLONEDEV;
2869 /* Arguments taken by the function:
2871 * char *dev: the name of an interface (or '\0'). MUST have enough
2872 * space to hold the interface name if '\0' is passed
2873 * int flags: interface flags (eg, IFF_TUN etc.)
/* a specific device like "tapN" is opened directly as /dev/tapN
 * rather than through the clone device */
2877 if (dev[3]) { /* tapSomething */
2878 static char buf[128];
2879 snprintf(buf, sizeof(buf), "/dev/%s", dev);
2883 /* open the device */
2884 if( (fd = open(clonedev, O_RDWR)) < 0 ) {
2887 D("%s open successful", clonedev);
2889 /* preparation of the struct ifr, of type "struct ifreq" */
2890 memset(&ifr, 0, sizeof(ifr));
/* IFF_NO_PI: raw frames without the packet-info header */
2893 ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
2896 /* if a device name was specified, put it in the structure; otherwise,
2897 * the kernel will try to allocate the "next" device of the
2899 size_t len = strlen(dev);
/* NOTE(review): 'len > IFNAMSIZ' allows len == IFNAMSIZ, which
 * leaves no room for the NUL in ifr_name — verify whether the
 * check should be '>='. */
2900 if (len > IFNAMSIZ) {
2901 D("%s too long", dev);
2904 memcpy(ifr.ifr_name, dev, len);
2907 /* try to create the device */
2908 if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) {
2909 D("failed to do a TUNSETIFF: %s", strerror(errno));
2914 /* if the operation was successful, write back the name of the
2915 * interface to the variable "dev", so the caller can know
2916 * it. Note that the caller MUST reserve space in *dev (see calling
2918 strcpy(dev, ifr.ifr_name);
2919 D("new name is %s", dev);
2922 /* this is the special file descriptor that the caller will use to talk
2923 * with the virtual interface */
2928 main(int arc, char **argv)
2931 struct sigaction sa;
2937 int devqueues = 1; /* how many device queues */
2938 int wait_link_arg = 0;
2940 int pkt_size_done = 0;
2942 struct td_desc *fn = func;
2944 bzero(&g, sizeof(g));
2949 g.report_interval = 1000; /* report interval */
2951 /* ip addresses can also be a range x.x.x.x-x.x.x.y */
2952 g.af = AF_INET; /* default */
2953 g.src_ip.name = "10.0.0.1";
2954 g.dst_ip.name = "10.1.0.1";
2955 g.dst_mac.name = "ff:ff:ff:ff:ff:ff";
2956 g.src_mac.name = NULL;
2960 g.cpus = 1; /* default */
2964 g.frag_size = (u_int)-1; /* use the netmap buffer size by default */
2967 g.wait_link = 2; /* wait 2 seconds for physical ports */
2969 while ((ch = getopt(arc, argv, "46a:f:F:Nn:i:Il:d:s:D:S:b:c:o:p:"
2970 "T:w:WvR:XC:H:rP:zZAhBM:")) != -1) {
2974 D("bad option %c %s", ch, optarg);
2995 g.npackets = strtoull(optarg, NULL, 10);
3000 if (i < 1 || i > 63) {
3001 D("invalid frags %d [1..63], ignore", i);
3008 g.frag_size = atoi(optarg);
3012 for (fn = func; fn->key; fn++) {
3013 if (!strcmp(fn->key, optarg))
3020 D("unrecognised function %s", optarg);
3024 case 'o': /* data generation options */
3025 g.options |= atoi(optarg);
3028 case 'a': /* force affinity */
3029 g.affinity = atoi(optarg);
3032 case 'i': /* interface */
3033 /* a prefix of tap: netmap: or pcap: forces the mode.
3034 * otherwise we guess
3036 D("interface is %s", optarg);
3037 if (strlen(optarg) > MAX_IFNAMELEN - 8) {
3038 D("ifname too long %s", optarg);
3041 strcpy(g.ifname, optarg);
3042 if (!strcmp(optarg, "null")) {
3043 g.dev_type = DEV_NETMAP;
3045 } else if (!strncmp(optarg, "tap:", 4)) {
3046 g.dev_type = DEV_TAP;
3047 strcpy(g.ifname, optarg + 4);
3048 } else if (!strncmp(optarg, "pcap:", 5)) {
3049 g.dev_type = DEV_PCAP;
3050 strcpy(g.ifname, optarg + 5);
3051 } else if (!strncmp(optarg, "netmap:", 7) ||
3052 !strncmp(optarg, "vale", 4)) {
3053 g.dev_type = DEV_NETMAP;
3054 } else if (!strncmp(optarg, "tap", 3)) {
3055 g.dev_type = DEV_TAP;
3056 } else { /* prepend netmap: */
3057 g.dev_type = DEV_NETMAP;
3058 sprintf(g.ifname, "netmap:%s", optarg);
3063 g.options |= OPT_INDIRECT; /* use indirect buffers */
3066 case 'l': /* pkt_size */
3067 if (pkt_size_done) {
3068 g.pkt_min_size = atoi(optarg);
3070 g.pkt_size = atoi(optarg);
3076 g.dst_ip.name = optarg;
3080 g.src_ip.name = optarg;
3083 case 'T': /* report interval */
3084 g.report_interval = atoi(optarg);
3088 g.wait_link = atoi(optarg);
3093 g.forever = 0; /* exit RX with no traffic */
3096 case 'b': /* burst */
3097 g.burst = atoi(optarg);
3100 g.cpus = atoi(optarg);
3103 g.nthreads = atoi(optarg);
3106 case 'D': /* destination mac */
3107 g.dst_mac.name = optarg;
3110 case 'S': /* source mac */
3111 g.src_mac.name = optarg;
3117 g.tx_rate = atoi(optarg);
3120 g.options |= OPT_DUMP;
3123 D("WARNING: the 'C' option is deprecated, use the '+conf:' libnetmap option instead");
3124 g.nmr_config = strdup(optarg);
3127 g.virt_header = atoi(optarg);
3130 g.packet_file = strdup(optarg);
3133 g.options |= OPT_RUBBISH;
3136 g.options |= OPT_RANDOM_SRC;
3139 g.options |= OPT_RANDOM_DST;
3142 g.options |= OPT_PPS_STATS;
3145 /* raw packets have 4 bytes crc + 20 bytes framing */
3146 // XXX maybe add an option to pass the IFG
3152 if (strlen(g.ifname) <=0 ) {
3153 D("missing ifname");
3158 g.burst = fn->default_burst;
3159 D("using default burst size: %d", g.burst);
3162 g.system_cpus = i = system_ncpus();
3163 if (g.cpus < 0 || g.cpus > i) {
3164 D("%d cpus is too high, have only %d cpus", g.cpus, i);
3167 D("running on %d cpus (have %d)", g.cpus, i);
3171 if (!wait_link_arg && !strncmp(g.ifname, "vale", 4)) {
3175 if (g.pkt_size < 16 || g.pkt_size > MAX_PKTSIZE) {
3176 D("bad pktsize %d [16..%d]\n", g.pkt_size, MAX_PKTSIZE);
3180 if (g.pkt_min_size > 0 && (g.pkt_min_size < 16 || g.pkt_min_size > g.pkt_size)) {
3181 D("bad pktminsize %d [16..%d]\n", g.pkt_min_size, g.pkt_size);
3185 if (g.src_mac.name == NULL) {
3186 static char mybuf[20] = "00:00:00:00:00:00";
3187 /* retrieve source mac address. */
3188 if (source_hwaddr(g.ifname, mybuf) == -1) {
3189 D("Unable to retrieve source mac");
3190 // continue, fail later
3192 g.src_mac.name = mybuf;
3194 /* extract address ranges */
3195 if (extract_mac_range(&g.src_mac) || extract_mac_range(&g.dst_mac))
3197 g.options |= extract_ip_range(&g.src_ip, g.af);
3198 g.options |= extract_ip_range(&g.dst_ip, g.af);
3200 if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1
3201 && g.virt_header != VIRT_HDR_2) {
3202 D("bad virtio-net-header length");
3206 if (g.dev_type == DEV_TAP) {
3207 D("want to use tap %s", g.ifname);
3208 g.main_fd = tap_alloc(g.ifname);
3209 if (g.main_fd < 0) {
3210 D("cannot open tap %s", g.ifname);
3214 } else if (g.dev_type == DEV_PCAP) {
3215 char pcap_errbuf[PCAP_ERRBUF_SIZE];
3217 pcap_errbuf[0] = '\0'; // init the buffer
3218 g.p = pcap_open_live(g.ifname, 256 /* XXX */, 1, 100, pcap_errbuf);
3220 D("cannot open pcap on %s", g.ifname);
3223 g.main_fd = pcap_fileno(g.p);
3224 D("using pcap on %s fileno %d", g.ifname, g.main_fd);
3225 #endif /* !NO_PCAP */
3226 } else if (g.dummy_send) { /* but DEV_NETMAP */
3227 D("using a dummy send routine");
3229 g.nmd = nmport_prepare(g.ifname);
3233 parse_nmr_config(g.nmr_config, &g.nmd->reg);
3235 g.nmd->reg.nr_flags |= NR_ACCEPT_VNET_HDR;
3238 * Open the netmap device using nm_open().
3240 * protocol stack and may cause a reset of the card,
3241 * which in turn may take some time for the PHY to
3242 * reconfigure. We do the open here to have time to reset.
3244 g.orig_mode = g.nmd->reg.nr_mode;
3245 if (g.nthreads > 1) {
3246 switch (g.orig_mode) {
3247 case NR_REG_ALL_NIC:
3249 g.nmd->reg.nr_mode = NR_REG_ONE_NIC;
3252 g.nmd->reg.nr_mode = NR_REG_ONE_SW;
3257 g.nmd->reg.nr_ringid = 0;
3259 if (nmport_open_desc(g.nmd) < 0)
3261 g.main_fd = g.nmd->fd;
3262 ND("mapped %luKB at %p", (unsigned long)(g.nmd->req.nr_memsize>>10),
3265 if (g.virt_header) {
3266 /* Set the virtio-net header length, since the user asked
3267 * for it explicitly. */
3268 set_vnet_hdr_len(&g);
3270 /* Check whether the netmap port we opened requires us to send
3271 * and receive frames with virtio-net header. */
3272 get_vnet_hdr_len(&g);
3275 /* get num of queues in tx or rx */
3276 if (g.td_type == TD_TYPE_SENDER)
3277 devqueues = g.nmd->reg.nr_tx_rings + g.nmd->reg.nr_host_tx_rings;
3279 devqueues = g.nmd->reg.nr_rx_rings + g.nmd->reg.nr_host_rx_rings;
3281 /* validate provided nthreads. */
3282 if (g.nthreads < 1 || g.nthreads > devqueues) {
3283 D("bad nthreads %d, have %d queues", g.nthreads, devqueues);
3284 // continue, fail later
3287 if (g.td_type == TD_TYPE_SENDER) {
3288 int mtu = get_if_mtu(&g);
3290 if (mtu > 0 && g.pkt_size > mtu) {
3291 D("pkt_size (%d) must be <= mtu (%d)",
3298 struct netmap_if *nifp = g.nmd->nifp;
3299 struct nmreq_register *req = &g.nmd->reg;
3301 D("nifp at offset %"PRIu64" ntxqs %d nrxqs %d memid %d",
3302 req->nr_offset, req->nr_tx_rings, req->nr_rx_rings,
3304 for (i = 0; i < req->nr_tx_rings + req->nr_host_tx_rings; i++) {
3305 struct netmap_ring *ring = NETMAP_TXRING(nifp, i);
3306 D(" TX%d at offset %p slots %d", i,
3307 (void *)((char *)ring - (char *)nifp), ring->num_slots);
3309 for (i = 0; i < req->nr_rx_rings + req->nr_host_rx_rings; i++) {
3310 struct netmap_ring *ring = NETMAP_RXRING(nifp, i);
3311 D(" RX%d at offset %p slots %d", i,
3312 (void *)((char *)ring - (char *)nifp), ring->num_slots);
3316 /* Print some debug information. */
3318 "%s %s: %d queues, %d threads and %d cpus.\n",
3319 (g.td_type == TD_TYPE_SENDER) ? "Sending on" :
3320 ((g.td_type == TD_TYPE_RECEIVER) ? "Receiving from" :
3326 if (g.td_type == TD_TYPE_SENDER) {
3327 fprintf(stdout, "%s -> %s (%s -> %s)\n",
3328 g.src_ip.name, g.dst_ip.name,
3329 g.src_mac.name, g.dst_mac.name);
3333 /* Exit if something went wrong. */
3334 if (g.main_fd < 0) {
3342 D("--- SPECIAL OPTIONS:%s%s%s%s%s%s\n",
3343 g.options & OPT_PREFETCH ? " prefetch" : "",
3344 g.options & OPT_ACCESS ? " access" : "",
3345 g.options & OPT_MEMCPY ? " memcpy" : "",
3346 g.options & OPT_INDIRECT ? " indirect" : "",
3347 g.options & OPT_COPY ? " copy" : "",
3348 g.options & OPT_RUBBISH ? " rubbish " : "");
3351 g.tx_period.tv_sec = g.tx_period.tv_nsec = 0;
3352 if (g.tx_rate > 0) {
3353 /* try to have at least something every second,
3354 * reducing the burst size to some 0.01s worth of data
3355 * (but no less than one full set of fragments)
3358 int lim = (g.tx_rate)/300;
3363 x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate;
3364 g.tx_period.tv_nsec = x;
3365 g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000;
3366 g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000;
3368 if (g.td_type == TD_TYPE_SENDER)
3369 D("Sending %d packets every %jd.%09ld s",
3370 g.burst, (intmax_t)g.tx_period.tv_sec, g.tx_period.tv_nsec);
3371 /* Install ^C handler. */
3372 global_nthreads = g.nthreads;
3374 sigaddset(&ss, SIGINT);
3375 /* block SIGINT now, so that all created threads will inherit the mask */
3376 if (pthread_sigmask(SIG_BLOCK, &ss, NULL) < 0) {
3377 D("failed to block SIGINT: %s", strerror(errno));
3379 if (start_threads(&g) < 0)
3381 /* Install the handler and re-enable SIGINT for the main thread */
3382 memset(&sa, 0, sizeof(sa));
3383 sa.sa_handler = sigint_h;
3384 if (sigaction(SIGINT, &sa, NULL) < 0) {
3385 D("failed to install ^C handler: %s", strerror(errno));
3388 if (pthread_sigmask(SIG_UNBLOCK, &ss, NULL) < 0) {
3389 D("failed to re-enable SIGINT: %s", strerror(errno));