2 * Copyright (c) 2005 Robert N. M. Watson
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 #include <sys/types.h>
30 #include <sys/socket.h>
31 #include <sys/stdint.h>
33 #include <sys/utsname.h>
36 #include <netinet/in.h>
48 * juggle is a simple IPC/context switch performance test, which works on
49 * pairs of file descriptors of various types. In various runs, it considers
50 * the cost of bouncing a message synchronously across the descriptor pair,
51 * either in the same thread, two different threads, or two different
52 * processes. Timing measurements for each series of I/O's are reported, but
53 * the first measurement in each series is discarded as "warmup" on the IPC
54 * primitive. Variations on the test permit for pipelining, or the insertion
55 * of more than one packet into the stream at a time, intended to permit
56 * greater parallelism, hopefully allowing performance numbers to reflect
57 * use of available parallelism, and/or intelligence in context switching to
58 * avoid premature switching when multiple messages are queued.
62 * The UDP test uses UDP over the loopback interface. Two arbitrary but
 * hopefully unused port numbers are used for the two endpoints.
65 #define UDP_PORT1 2020
66 #define UDP_PORT2 2021
69 * Size of each message. Must be smaller than the socket buffer or pipe
70 * buffer maximum size, as we want to send it atomically without blocking.
71 * If pipelining is in use, must be able to fit PIPELINE_MAX of these
72 * messages into the send queue.
74 #define MESSAGELEN 128
77 * Number of message cycles -- into fd1, out of fd2, into fd2, and out of
78 * fd1. By counting in cycles, we allow the master thread or process to
79 * perform timing without explicitly synchronizing with the secondary thread
82 #define NUMCYCLES 1024
85 * Number of times to run each test.
90 * Number of in-flight messages per cycle. In adjusting this value, be
91 * careful not to exceed the socket/etc buffer depth, or messages may be lost
92 * or result in blocking.
94 #define PIPELINE_MAX 4
97 udp_create(int *fd1p, int *fd2p)
99 struct sockaddr_in sin1, sin2;
102 sock1 = socket(PF_INET, SOCK_DGRAM, 0);
106 sock2 = socket(PF_INET, SOCK_DGRAM, 0);
112 bzero(&sin1, sizeof(sin1));
113 sin1.sin_len = sizeof(sin1);
114 sin1.sin_family = AF_INET;
115 sin1.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
116 sin1.sin_port = htons(UDP_PORT1);
118 bzero(&sin2, sizeof(sin2));
119 sin2.sin_len = sizeof(sin2);
120 sin2.sin_family = AF_INET;
121 sin2.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
122 sin2.sin_port = htons(UDP_PORT2);
124 if (bind(sock1, (struct sockaddr *) &sin1, sizeof(sin1)) < 0) {
130 if (bind(sock2, (struct sockaddr *) &sin2, sizeof(sin2)) < 0) {
136 if (connect(sock1, (struct sockaddr *) &sin2, sizeof(sin2)) < 0) {
142 if (connect(sock2, (struct sockaddr *) &sin1, sizeof(sin1)) < 0) {
/*
 * Produce a descriptor pair for the "pipe" IPC type -- presumably via
 * pipe(2); the body is not visible in this excerpt, so verify against the
 * full source.
 */
155 pipe_create(int *fd1p, int *fd2p)
/*
 * Produce a connected pair of local-domain datagram sockets via
 * socketpair(2).  NOTE(review): the return type, error return, and the
 * assignment of fds[0]/fds[1] to *fd1p/*fd2p fall on lines not shown in
 * this excerpt.
 */
169 socketpairdgram_create(int *fd1p, int *fd2p)
173 	if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, fds) < 0)
/*
 * Produce a connected pair of local-domain stream sockets via
 * socketpair(2).  NOTE(review): the return type, error return, and the
 * assignment of fds[0]/fds[1] to *fd1p/*fd2p fall on lines not shown in
 * this excerpt.
 */
183 socketpairstream_create(int *fd1p, int *fd2p)
187 	if (socketpair(PF_LOCAL, SOCK_STREAM, 0, fds) < 0)
/*
 * message_send() body fragment: write one zero-filled MESSAGELEN-byte
 * message to a descriptor.  NOTE(review): the function signature, the
 * declaration of 'len', and the error/return paths are on lines missing
 * from this excerpt.
 */
199 	u_char buffer[MESSAGELEN];
202 	bzero(buffer, sizeof(buffer));
204 	len = write(s, buffer, sizeof(buffer));
	/* A short write would desynchronize the message-bouncing protocol. */
207 	if (len != sizeof(buffer)) {
/*
 * message_recv() body fragment: read one MESSAGELEN-byte message from a
 * descriptor.  NOTE(review): the function signature, the declaration of
 * 'len', and the error/return paths are on lines missing from this
 * excerpt.
 */
217 	u_char buffer[MESSAGELEN];
220 	len = read(s, buffer, sizeof(buffer));
	/* A short read indicates a protocol or buffer-sizing problem. */
223 	if (len != sizeof(buffer)) {
231 * Juggle messages between two file descriptors in a single thread/process,
232 * so simply a measure of IPC performance.
/*
 * Single-threaded juggle: bounce 'pipeline' messages per cycle through the
 * descriptor pair (fd1 -> fd2 -> fd1) for NUMCYCLES cycles, returning the
 * wall-clock time the loop took.  NOTE(review): opening/closing braces,
 * the declarations of i and j, and the final return of tfinish fall on
 * lines not shown in this excerpt.
 */
234 static struct timespec
235 juggle(int fd1, int fd2, int pipeline)
237 	struct timespec tstart, tfinish;
	/* Timestamp before the I/O loop; elapsed time is tfinish - tstart. */
240 	if (clock_gettime(CLOCK_REALTIME, &tstart) < 0)
241 		err(-1, "juggle: clock_gettime");
243 	for (i = 0; i < NUMCYCLES; i++) {
		/* Inject 'pipeline' messages into fd1. */
245 		for (j = 0; j < pipeline; j++) {
246 			if (message_send(fd1) < 0)
247 				err(-1, "message_send fd1");
		/* Drain them from fd2 and immediately bounce each one back. */
250 		for (j = 0; j < pipeline; j++) {
251 			if (message_recv(fd2) < 0)
252 				err(-1, "message_recv fd2");
254 			if (message_send(fd2) < 0)
255 				err(-1, "message_send fd2");
		/* Collect the returned messages from fd1, completing the cycle. */
258 		for (j = 0; j < pipeline; j++) {
259 			if (message_recv(fd1) < 0)
260 				err(-1, "message_recv fd1");
264 	if (clock_gettime(CLOCK_REALTIME, &tfinish) < 0)
265 		err(-1, "juggle: clock_gettime");
	/* In-place subtraction: tfinish now holds the elapsed interval. */
267 	timespecsub(&tfinish, &tstart, &tfinish);
273 * Juggle messages between two file descriptors in two threads, so measure
274 * the cost of IPC and the cost of a thread context switch.
276 * In order to avoid measuring thread creation time, we make use of a
277 * condition variable to decide when both threads are ready to begin
/* Set by the child thread once it is ready to run; read by the parent
 * under threaded_mtx / threaded_cond (see thread_juggle). */
280 static int threaded_child_ready;
281 static pthread_mutex_t threaded_mtx;
282 static pthread_cond_t threaded_cond;
/* Pipeline depth for the current threaded run, published by thread_juggle
 * before thread creation and read by juggling_thread. */
283 static int threaded_pipeline;
/*
 * Child-thread side of the threaded test: announce readiness under
 * threaded_mtx via threaded_cond (so the parent excludes thread-creation
 * time from its measurement), then echo NUMCYCLES * threaded_pipeline
 * messages on its descriptor.  NOTE(review): the return type, the
 * derivation of fd2 from 'arg', the declarations of i and j, and the
 * final return fall on lines not shown in this excerpt.
 */
286 juggling_thread(void *arg)
	/* Signal readiness: set the flag and wake the waiting parent. */
292 	if (pthread_mutex_lock(&threaded_mtx) != 0)
293 		err(-1, "juggling_thread: pthread_mutex_lock");
295 	threaded_child_ready = 1;
297 	if (pthread_cond_signal(&threaded_cond) != 0)
298 		err(-1, "juggling_thread: pthread_cond_signal");
300 	if (pthread_mutex_unlock(&threaded_mtx) != 0)
301 		err(-1, "juggling_thread: pthread_mutex_unlock");
	/* Echo loop: receive each message and immediately send it back. */
303 	for (i = 0; i < NUMCYCLES; i++) {
304 		for (j = 0; j < threaded_pipeline; j++) {
305 			if (message_recv(fd2) < 0)
306 				err(-1, "message_recv fd2");
308 			if (message_send(fd2) < 0)
309 				err(-1, "message_send fd2");
/*
 * Two-thread juggle: spawn juggling_thread as the echo side, wait on the
 * condition variable until it reports ready (so thread startup is not
 * timed), then time NUMCYCLES pipelined send/recv round trips on fd1.
 * NOTE(review): braces, the declarations of thread/i/j, any condition-
 * variable initialization, and the final return of tfinish fall on lines
 * not shown in this excerpt.
 */
316 static struct timespec
317 thread_juggle(int fd1, int fd2, int pipeline)
319 	struct timespec tstart, tfinish;
	/* Publish the pipeline depth before the child thread can read it. */
323 	threaded_pipeline = pipeline;
325 	if (pthread_mutex_init(&threaded_mtx, NULL) != 0)
326 		err(-1, "thread_juggle: pthread_mutex_init");
	/* The child receives a pointer to our fd2 parameter as its argument. */
328 	if (pthread_create(&thread, NULL, juggling_thread, &fd2) != 0)
329 		err(-1, "thread_juggle: pthread_create");
	/* Block until the child sets threaded_child_ready under the mutex. */
331 	if (pthread_mutex_lock(&threaded_mtx) != 0)
332 		err(-1, "thread_juggle: pthread_mutex_lock");
334 	while (!threaded_child_ready) {
335 		if (pthread_cond_wait(&threaded_cond, &threaded_mtx) != 0)
336 			err(-1, "thread_juggle: pthread_cond_wait");
339 	if (pthread_mutex_unlock(&threaded_mtx) != 0)
340 		err(-1, "thread_juggle: pthread_mutex_unlock");
	/* Timed region starts only after the child is known to be running. */
342 	if (clock_gettime(CLOCK_REALTIME, &tstart) < 0)
343 		err(-1, "thread_juggle: clock_gettime");
345 	for (i = 0; i < NUMCYCLES; i++) {
346 		for (j = 0; j < pipeline; j++) {
347 			if (message_send(fd1) < 0)
348 				err(-1, "message_send fd1");
351 		for (j = 0; j < pipeline; j++) {
352 			if (message_recv(fd1) < 0)
353 				err(-1, "message_recv fd1");
357 	if (clock_gettime(CLOCK_REALTIME, &tfinish) < 0)
358 		err(-1, "thread_juggle: clock_gettime");
	/* Reap the echo thread before reporting the elapsed time. */
360 	if (pthread_join(thread, NULL) != 0)
361 		err(-1, "thread_juggle: pthread_join");
363 	timespecsub(&tfinish, &tstart, &tfinish);
369 * Juggle messages between two file descriptors in two processes, so measure
370 * the cost of IPC and the cost of a process context switch.
372 * Since we can't use a mutex between the processes, we simply do an extra
373 * write on the child to let the parent know that it's ready to start.
/*
 * Two-process juggle: fork a child that echoes messages on fd2; the child
 * first sends one extra message so the parent knows it is running (no
 * shared mutex is available across the fork -- see comment above), and the
 * parent absorbs that message before starting its timed loop.
 * NOTE(review): the fork() call, the pid==0/parent branch structure, the
 * declarations of i and j, intermediate error-path statements, and the
 * final return of tfinish fall on lines not shown in this excerpt.
 */
375 static struct timespec
376 process_juggle(int fd1, int fd2, int pipeline)
378 	struct timespec tstart, tfinish;
379 	pid_t pid, ppid, wpid;
386 		err(-1, "process_juggle: fork");
	/* Child: announce readiness with one extra message on fd2... */
389 		if (message_send(fd2) < 0) {
393 			err(-1, "process_juggle: child: message_send");
	/* ...then echo 'pipeline' messages per cycle for NUMCYCLES cycles. */
396 		for (i = 0; i < NUMCYCLES; i++) {
397 			for (j = 0; j < pipeline; j++) {
398 				if (message_send(fd2) < 0)
399 					err(-1, "message_send fd2");
401 				if (message_recv(fd2) < 0)
402 					err(-1, "message_recv fd2");
	/* Parent: absorb the child's readiness message before timing. */
408 		if (message_recv(fd1) < 0) {
412 			err(-1, "process_juggle: parent: message_recv");
415 		if (clock_gettime(CLOCK_REALTIME, &tstart) < 0)
416 			err(-1, "process_juggle: clock_gettime");
418 		for (i = 0; i < NUMCYCLES; i++) {
419 			for (j = 0; j < pipeline; j++) {
420 				if (message_send(fd1) < 0) {
424 					err(-1, "message_send fd1");
428 			for (j = 0; j < pipeline; j++) {
429 				if (message_recv(fd1) < 0) {
433 					err(-1, "message_recv fd1");
438 		if (clock_gettime(CLOCK_REALTIME, &tfinish) < 0)
439 			err(-1, "process_juggle: clock_gettime");
	/* Reap the child and sanity-check the reaped pid before reporting. */
442 	wpid = waitpid(pid, NULL, 0);
444 		err(-1, "process_juggle: waitpid");
446 		errx(-1, "process_juggle: waitpid: pid != wpid");
448 	timespecsub(&tfinish, &tstart, &tfinish);
454 * When we print out results for larger pipeline sizes, we scale back by the
455 * depth of the pipeline. This generally means dividing by the pipeline
456 * depth. Except when it means dividing by zero.
/*
 * Normalize a measured interval by pipeline depth p -- per the comment
 * above, this divides by p except in the p == 0 case, where division is
 * presumably skipped; the body is not visible in this excerpt, so verify
 * against the full source.
 */
459 scale_timespec(struct timespec *ts, int p)
/*
 * Table of IPC mechanisms under test: each entry pairs a constructor that
 * yields a connected descriptor pair with the label printed in the
 * CSV-style output.  NOTE(review): the it_name member declaration and the
 * array header line fall between the lines shown in this excerpt.
 */
469 static const struct ipctype {
470 	int (*it_create)(int *fd1p, int *fd2p);
473 	{ pipe_create, "pipe" },
474 	{ udp_create, "udp" },
475 	{ socketpairdgram_create, "socketpairdgram" },
476 	{ socketpairstream_create, "socketpairstream" },
/* Number of entries in ipctypes[]. */
478 static const int ipctypes_len = (sizeof(ipctypes) / sizeof(struct ipctype));
/*
 * Driver: print a header describing the host and benchmark parameters,
 * then for every pipeline depth 0..PIPELINE_MAX and every IPC type run one
 * warmup plus LOOPS timed runs of each of the three juggle variants, and
 * emit the per-loop timings as comma-separated rows.  NOTE(review): the
 * uts declaration and uname() call, loop braces, the warmup juggle() call,
 * fputs/printf result formatting lines, descriptor cleanup, and the
 * function's tail all fall on lines not shown in this excerpt.
 */
481 main(int argc, char *argv[])
483 	struct timespec juggle_results[LOOPS], process_results[LOOPS];
484 	struct timespec thread_results[LOOPS];
485 	int fd1, fd2, i, j, p;
486 	printf("version, juggle.c %s\n", "$FreeBSD$");
	/* Host identification for the benchmark log (uts filled by uname(3),
	 * on a line not shown here). */
492 	printf("sysname, %s\n", uts.sysname);
493 	printf("nodename, %s\n", uts.nodename);
494 	printf("release, %s\n", uts.release);
495 	printf("version, %s\n", uts.version);
496 	printf("machine, %s\n", uts.machine);
	/* Record the compile-time benchmark parameters alongside the data. */
499 	printf("MESSAGELEN, %d\n", MESSAGELEN);
500 	printf("NUMCYCLES, %d\n", NUMCYCLES);
501 	printf("LOOPS, %d\n", LOOPS);
502 	printf("PIPELINE_MAX, %d\n", PIPELINE_MAX);
	/* Column headings: one data column per timed loop. */
505 	printf("ipctype, test, pipeline_depth");
506 	for (j = 0; j < LOOPS; j++)
507 		printf(", data%d", j);
	/* p ranges 0..PIPELINE_MAX inclusive; p == 0 is the unpipelined case. */
510 	for (p = 0; p < PIPELINE_MAX + 1; p++) {
511 		for (i = 0; i < ipctypes_len; i++) {
512 			if (ipctypes[i].it_create(&fd1, &fd2) < 0)
513 				err(-1, "main: %s", ipctypes[i].it_name);
516 			 * For each test, do one uncounted warmup, then LOOPS
517 			 * runs of the actual test.
520 			for (j = 0; j < LOOPS; j++)
521 				juggle_results[j] = juggle(fd1, fd2, p);
	/* Bare call = the uncounted warmup run for the process test. */
522 			process_juggle(fd1, fd2, p);
523 			for (j = 0; j < LOOPS; j++)
524 				process_results[j] = process_juggle(fd1, fd2,
526 			thread_juggle(fd1, fd2, p);
527 			for (j = 0; j < LOOPS; j++)
528 				thread_results[j] = thread_juggle(fd1, fd2,
	/*
	 * NOTE(review): thread_results is zeroed immediately after being
	 * collected, so the thread_juggle rows printed below are always
	 * 0.0 as shown -- lines missing from this excerpt may wrap the
	 * thread test in a disabling conditional; confirm intent against
	 * the full source.
	 */
530 			for (j = 0; j < LOOPS; j++) {
531 				thread_results[j].tv_sec = 0;
532 				thread_results[j].tv_nsec = 0;
538 		 * When printing results for the round, normalize the results
539 		 * with respect to the pipeline depth.  We're doing p times
540 		 * as much work, and are we taking p times as long?
542 		for (i = 0; i < ipctypes_len; i++) {
543 			printf("%s, juggle, %d, ", ipctypes[i].it_name, p);
544 			for (j = 0; j < LOOPS; j++) {
547 				scale_timespec(&juggle_results[j], p);
549 				    (intmax_t)juggle_results[j].tv_sec,
550 				    juggle_results[j].tv_nsec);
553 			printf("%s, process_juggle, %d, ",
554 			    ipctypes[i].it_name, p);
555 			for (j = 0; j < LOOPS; j++) {
558 				scale_timespec(&process_results[j], p);
560 				    (intmax_t)process_results[j].tv_sec,
561 				    process_results[j].tv_nsec);
564 			printf("%s, thread_juggle, %d, ",
565 			    ipctypes[i].it_name, p);
566 			for (j = 0; j < LOOPS; j++) {
569 				scale_timespec(&thread_results[j], p);
571 				    (intmax_t)thread_results[j].tv_sec,
572 				    thread_results[j].tv_nsec);