2 * Copyright (c) 2005 Robert N. M. Watson
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 #include <sys/types.h>
30 #include <sys/socket.h>
32 #include <sys/utsname.h>
35 #include <netinet/in.h>
47 * juggle is a simple IPC/context switch performance test, which works on
48 * pairs of file descriptors of various types. In various runs, it considers
49 * the cost of bouncing a message synchronously across the descriptor pair,
50 * either in the same thread, two different threads, or two different
51 * processes. Timing measurements for each series of I/O's are reported, but
52 * the first measurement in each series is discarded as "warmup" on the IPC
53 * primitive. Variations on the test permit for pipelining, or the insertion
54 * of more than one packet into the stream at a time, intended to permit
55 * greater parallelism, hopefully allowing performance numbers to reflect
56 * use of available parallelism, and/or intelligence in context switching to
57 * avoid premature switching when multiple messages are queued.
61 * The UDP test uses UDP over the loopback interface. Two arbitrary but
/*
 * Fixed, hard-coded UDP port numbers; a run will fail if either port is
 * already in use on the host.
 */
64 #define UDP_PORT1 2020
65 #define UDP_PORT2 2021
68 * Size of each message. Must be smaller than the socket buffer or pipe
69 * buffer maximum size, as we want to send it atomically without blocking.
70 * If pipelining is in use, must be able to fit PIPELINE_MAX of these
71 * messages into the send queue.
73 #define MESSAGELEN 128
76 * Number of message cycles -- into fd1, out of fd2, into fd2, and out of
77 * fd1. By counting in cycles, we allow the master thread or process to
78 * perform timing without explicitly synchronizing with the secondary thread
81 #define NUMCYCLES 1024
84 * Number of times to run each test.
/* NOTE(review): the LOOPS definition itself is elided in this view. */
89 * Number of in-flight messages per cycle. When adjusting this value, be
90 * careful not to exceed the socket/etc buffer depth, or messages may be lost
91 * or result in blocking.
93 #define PIPELINE_MAX 4
96 * As in all programs, steal timespecsub() from time.h.
/*
 * timespecsub(vvp, uvp): *vvp -= *uvp in place, borrowing from tv_sec to
 * keep tv_nsec in [0, 1e9). Multi-line macro: do not insert anything
 * between the backslash-continued lines below (its tail is elided here).
 */
98 #define timespecsub(vvp, uvp) \
100 	(vvp)->tv_sec -= (uvp)->tv_sec; \
101 	(vvp)->tv_nsec -= (uvp)->tv_nsec; \
102 	if ((vvp)->tv_nsec < 0) { \
104 		(vvp)->tv_nsec += 1000000000; \
/*
 * udp_create(): build a cross-connected pair of UDP sockets over loopback.
 * sock1 binds UDP_PORT1 and connect(2)s to UDP_PORT2; sock2 does the
 * reverse, so each write on one descriptor is readable on the other.
 * NOTE(review): the socket/bind/connect error-handling bodies, the stores
 * through fd1p/fd2p, and the return are elided in this view.
 */
109 udp_create(int *fd1p, int *fd2p)
111 	struct sockaddr_in sin1, sin2;
114 	sock1 = socket(PF_INET, SOCK_DGRAM, 0);
118 	sock2 = socket(PF_INET, SOCK_DGRAM, 0);
	/* Endpoint for the first socket: 127.0.0.1:UDP_PORT1 (sin_len is BSD). */
124 	bzero(&sin1, sizeof(sin1));
125 	sin1.sin_len = sizeof(sin1);
126 	sin1.sin_family = AF_INET;
127 	sin1.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
128 	sin1.sin_port = htons(UDP_PORT1);
	/* Endpoint for the second socket: 127.0.0.1:UDP_PORT2. */
130 	bzero(&sin2, sizeof(sin2));
131 	sin2.sin_len = sizeof(sin2);
132 	sin2.sin_family = AF_INET;
133 	sin2.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
134 	sin2.sin_port = htons(UDP_PORT2);
136 	if (bind(sock1, (struct sockaddr *) &sin1, sizeof(sin1)) < 0) {
142 	if (bind(sock2, (struct sockaddr *) &sin2, sizeof(sin2)) < 0) {
	/* Connect each socket to the other's address so plain write(2) works. */
148 	if (connect(sock1, (struct sockaddr *) &sin2, sizeof(sin2)) < 0) {
154 	if (connect(sock2, (struct sockaddr *) &sin1, sizeof(sin1)) < 0) {
/*
 * pipe_create(): produce a descriptor pair for the "pipe" IPC type —
 * presumably via pipe(2); the entire body is elided in this view.
 */
167 pipe_create(int *fd1p, int *fd2p)
/*
 * socketpairdgram_create(): connected local-domain datagram socket pair.
 * NOTE(review): the fds declaration, error path, stores through
 * fd1p/fd2p, and return are elided in this view.
 */
181 socketpairdgram_create(int *fd1p, int *fd2p)
185 	if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, fds) < 0)
/*
 * socketpairstream_create(): connected local-domain stream socket pair;
 * parallel to socketpairdgram_create() but SOCK_STREAM. Body largely
 * elided in this view.
 */
195 socketpairstream_create(int *fd1p, int *fd2p)
199 	if (socketpair(PF_LOCAL, SOCK_STREAM, 0, fds) < 0)
/*
 * message_send(s) body fragment: write one zero-filled MESSAGELEN-byte
 * message to descriptor s. MESSAGELEN is sized so the write is atomic and
 * non-blocking (see the MESSAGELEN comment above its definition).
 * NOTE(review): the function header, len declaration, the write(2)
 * error check, and the short-write handling are elided in this view;
 * the comparison below mixes a signed length with unsigned sizeof.
 */
211 	u_char buffer[MESSAGELEN];
214 	bzero(buffer, sizeof(buffer));
216 	len = write(s, buffer, sizeof(buffer));
219 	if (len != sizeof(buffer)) {
/*
 * message_recv(s) body fragment: read one MESSAGELEN-byte message from
 * descriptor s and check that a full message arrived.
 * NOTE(review): the function header, len declaration, the read(2) error
 * check, and the short-read handling are elided in this view.
 */
229 	u_char buffer[MESSAGELEN];
232 	len = read(s, buffer, sizeof(buffer));
235 	if (len != sizeof(buffer)) {
243 * Juggle messages between two file descriptors in a single thread/process,
244 * so simply a measure of IPC performance.
/*
 * juggle(fd1, fd2, pipeline): NUMCYCLES times, push `pipeline` messages
 * into fd1, bounce them through fd2, and drain them back out of fd1,
 * returning the elapsed time as a struct timespec.
 * NOTE(review): CLOCK_REALTIME is subject to wall-clock adjustment;
 * CLOCK_MONOTONIC would be more robust for interval timing.
 * NOTE(review): loop-closing braces and the return are elided in this view.
 */
246 static struct timespec
247 juggle(int fd1, int fd2, int pipeline)
249 	struct timespec tstart, tfinish;
252 	if (clock_gettime(CLOCK_REALTIME, &tstart) < 0)
253 		err(-1, "juggle: clock_gettime");
255 	for (i = 0; i < NUMCYCLES; i++) {
		/* Inject `pipeline` messages into fd1. */
257 		for (j = 0; j < pipeline; j++) {
258 			if (message_send(fd1) < 0)
259 				err(-1, "message_send fd1");
		/* Bounce each message: out of fd2, back into fd2. */
262 		for (j = 0; j < pipeline; j++) {
263 			if (message_recv(fd2) < 0)
264 				err(-1, "message_recv fd2");
266 			if (message_send(fd2) < 0)
267 				err(-1, "message_send fd2");
		/* Drain the returning messages out of fd1. */
270 		for (j = 0; j < pipeline; j++) {
271 			if (message_recv(fd1) < 0)
272 				err(-1, "message_recv fd1");
276 	if (clock_gettime(CLOCK_REALTIME, &tfinish) < 0)
277 		err(-1, "juggle: clock_gettime");
	/* In-place: tfinish -= tstart; tfinish becomes the elapsed interval. */
279 	timespecsub(&tfinish, &tstart);
285 * Juggle messages between two file descriptors in two threads, so measure
286 * the cost of IPC and the cost of a thread context switch.
288 * In order to avoid measuring thread creation time, we make use of a
289 * condition variable to decide when both threads are ready to begin
/* Set by the child thread under threaded_mtx once it is ready to juggle. */
292 static int threaded_child_ready;
/* Protects threaded_child_ready; pairs with threaded_cond. */
293 static pthread_mutex_t threaded_mtx;
294 static pthread_cond_t threaded_cond;
/* Pipeline depth for the current run, published before thread creation. */
295 static int threaded_pipeline;
/*
 * juggling_thread(): secondary thread for thread_juggle(). Signals
 * readiness via threaded_cond, then echoes NUMCYCLES * threaded_pipeline
 * messages back over fd2.
 * NOTE(review): pthread_mutex_lock/pthread_cond_signal/pthread_mutex_unlock
 * return 0 or a positive error number and never set errno, so these `< 0`
 * checks can never fire, and err(3) would report a stale errno anyway;
 * the idiomatic fix is `if ((error = pthread_...()) != 0)` with
 * strerror(error). Left as-is because surrounding lines (declarations,
 * fd2 = *(int *)arg presumably, and the return) are elided in this view.
 */
298 juggling_thread(void *arg)
304 	if (pthread_mutex_lock(&threaded_mtx) < 0)
305 		err(-1, "juggling_thread: pthread_mutex_lock");
	/* Tell the parent we are ready; it waits on threaded_cond. */
307 	threaded_child_ready = 1;
309 	if (pthread_cond_signal(&threaded_cond) < 0)
310 		err(-1, "juggling_thread: pthread_cond_signal");
312 	if (pthread_mutex_unlock(&threaded_mtx) < 0)
313 		err(-1, "juggling_thread: pthread_mutex_unlock");
	/* Echo loop: receive a message on fd2, send it straight back. */
315 	for (i = 0; i < NUMCYCLES; i++) {
316 		for (j = 0; j < threaded_pipeline; j++) {
317 			if (message_recv(fd2) < 0)
318 				err(-1, "message_recv fd2");
320 			if (message_send(fd2) < 0)
321 				err(-1, "message_send fd2");
/*
 * thread_juggle(fd1, fd2, pipeline): time NUMCYCLES message cycles with a
 * secondary thread echoing on fd2 (see juggling_thread()), so the result
 * includes IPC cost plus thread context-switch cost. The condition-variable
 * handshake keeps thread creation time out of the measured interval.
 * NOTE(review): as in juggling_thread(), pthread_* functions return 0 or a
 * positive error number (they do not return -1 or set errno), so every
 * `< 0` check here — including pthread_join — is dead code.
 * NOTE(review): threaded_child_ready is never reset to 0 between runs;
 * whether that matters depends on elided code. Return statement elided.
 */
328 static struct timespec
329 thread_juggle(int fd1, int fd2, int pipeline)
331 	struct timespec tstart, tfinish;
	/* Publish the depth before the child thread can read it. */
335 	threaded_pipeline = pipeline;
337 	if (pthread_mutex_init(&threaded_mtx, NULL) < 0)
338 		err(-1, "thread_juggle: pthread_mutex_init");
340 	if (pthread_create(&thread, NULL, juggling_thread, &fd2) < 0)
341 		err(-1, "thread_juggle: pthread_create");
	/* Wait (standard mutex/condvar pattern) until the child is ready. */
343 	if (pthread_mutex_lock(&threaded_mtx) < 0)
344 		err(-1, "thread_juggle: pthread_mutex_lock");
346 	while (!threaded_child_ready) {
347 		if (pthread_cond_wait(&threaded_cond, &threaded_mtx) < 0)
348 			err(-1, "thread_juggle: pthread_cond_wait");
351 	if (pthread_mutex_unlock(&threaded_mtx) < 0)
352 		err(-1, "thread_juggle: pthread_mutex_unlock");
354 	if (clock_gettime(CLOCK_REALTIME, &tstart) < 0)
355 		err(-1, "thread_juggle: clock_gettime");
	/* Send on fd1; the child receives and echoes on fd2; receive on fd1. */
357 	for (i = 0; i < NUMCYCLES; i++) {
358 		for (j = 0; j < pipeline; j++) {
359 			if (message_send(fd1) < 0)
360 				err(-1, "message_send fd1");
363 		for (j = 0; j < pipeline; j++) {
364 			if (message_recv(fd1) < 0)
365 				err(-1, "message_recv fd1");
369 	if (clock_gettime(CLOCK_REALTIME, &tfinish) < 0)
370 		err(-1, "thread_juggle: clock_gettime");
372 	if (pthread_join(thread, NULL) < 0)
373 		err(-1, "thread_juggle: pthread_join");
	/* In-place: tfinish -= tstart. */
375 	timespecsub(&tfinish, &tstart);
381 * Juggle messages between two file descriptors in two processes, so measure
382 * the cost of IPC and the cost of a process context switch.
384 * Since we can't use a mutex between the processes, we simply do an extra
385 * write on the child to let the parent know that it's ready to start.
/*
 * process_juggle(fd1, fd2, pipeline): fork a child that echoes on fd2 and
 * time NUMCYCLES message cycles from the parent on fd1. The child's extra
 * initial message_send(fd2) is the cross-process "ready" handshake the
 * comment above describes; the parent consumes it before starting the clock.
 * NOTE(review): fork()'s return-value branches, several error-path bodies,
 * the waitpid result checks, and the return are elided in this view.
 */
387 static struct timespec
388 process_juggle(int fd1, int fd2, int pipeline)
390 	struct timespec tstart, tfinish;
391 	pid_t pid, ppid, wpid;
398 		err(-1, "process_juggle: fork");
	/* Child: announce readiness with one extra message on fd2. */
401 		if (message_send(fd2) < 0) {
405 			err(-1, "process_juggle: child: message_send");
	/* Child echo loop: send then receive on fd2 each pipeline slot. */
408 		for (i = 0; i < NUMCYCLES; i++) {
409 			for (j = 0; j < pipeline; j++) {
410 				if (message_send(fd2) < 0)
411 					err(-1, "message_send fd2");
413 				if (message_recv(fd2) < 0)
414 					err(-1, "message_recv fd2");
	/* Parent: consume the child's "ready" message before timing starts. */
420 		if (message_recv(fd1) < 0) {
424 			err(-1, "process_juggle: parent: message_recv");
427 		if (clock_gettime(CLOCK_REALTIME, &tstart) < 0)
428 			err(-1, "process_juggle: clock_gettime");
430 		for (i = 0; i < NUMCYCLES; i++) {
431 			for (j = 0; j < pipeline; j++) {
432 				if (message_send(fd1) < 0) {
436 					err(-1, "message_send fd1");
440 			for (j = 0; j < pipeline; j++) {
441 				if (message_recv(fd1) < 0) {
445 					err(-1, "message_recv fd1");
450 		if (clock_gettime(CLOCK_REALTIME, &tfinish) < 0)
451 			err(-1, "process_juggle: clock_gettime");
	/* Reap the child; the checks on wpid live in elided lines. */
454 		wpid = waitpid(pid, NULL, 0);
456 			err(-1, "process_juggle: waitpid");
458 			errx(-1, "process_juggle: waitpid: pid != wpid");
460 		timespecsub(&tfinish, &tstart);
466 * When we print out results for larger pipeline sizes, we scale back by the
467 * depth of the pipeline. This generally means dividing by the pipeline
468 * depth. Except when it means dividing by zero.
/*
 * scale_timespec(ts, p): divide *ts in place by pipeline depth p; per the
 * comment above, p == 0 (the non-pipelined round) must be special-cased
 * rather than divided by. Body elided in this view.
 */
471 scale_timespec(struct timespec *ts, int p)
/*
 * Table of IPC primitives to benchmark: each entry pairs a constructor
 * returning a descriptor pair with the name used in the CSV output.
 */
481 static const struct ipctype {
482 	int (*it_create)(int *fd1p, int *fd2p);
	/* NOTE(review): the it_name member and the `} ipctypes[] = {` line
	 * are elided in this view. */
485 	{ pipe_create, "pipe" },
486 	{ udp_create, "udp" },
487 	{ socketpairdgram_create, "socketpairdgram" },
488 	{ socketpairstream_create, "socketpairstream" },
/* Element count of ipctypes[], for iteration in main(). */
490 static const int ipctypes_len = (sizeof(ipctypes) / sizeof(struct ipctype));
/*
 * main(): for every pipeline depth 0..PIPELINE_MAX and every IPC type, run
 * the single-thread, two-process, and two-thread juggle tests LOOPS times
 * each (after an uncounted warmup run) and emit the timings as CSV, with a
 * uname(3)/configuration preamble.
 * NOTE(review): the uname(3) call and its error check are elided in this
 * view; uts is used below without a visible declaration.
 */
493 main(int argc, char *argv[])
495 	struct timespec juggle_results[LOOPS], process_results[LOOPS];
496 	struct timespec thread_results[LOOPS];
497 	int fd1, fd2, i, j, p;
	/* Preamble: benchmark version, host identification, and parameters. */
500 	printf("version, juggle.c %s\n", "$FreeBSD$");
504 	printf("sysname, %s\n", uts.sysname);
505 	printf("nodename, %s\n", uts.nodename);
506 	printf("release, %s\n", uts.release);
507 	printf("version, %s\n", uts.version);
508 	printf("machine, %s\n", uts.machine);
511 	printf("MESSAGELEN, %d\n", MESSAGELEN);
512 	printf("NUMCYCLES, %d\n", NUMCYCLES);
513 	printf("LOOPS, %d\n", LOOPS);
514 	printf("PIPELINE_MAX, %d\n", PIPELINE_MAX);
	/* CSV header row: one data column per loop. */
517 	printf("ipctype, test, pipeline_depth");
518 	for (j = 0; j < LOOPS; j++)
519 		printf(", data%d", j);
	/* p == 0 is the unpipelined case; see scale_timespec()'s comment. */
522 	for (p = 0; p < PIPELINE_MAX + 1; p++) {
523 		for (i = 0; i < ipctypes_len; i++) {
524 			if (ipctypes[i].it_create(&fd1, &fd2) < 0)
525 				err(-1, "main: %s", ipctypes[i].it_name);
528 			 * For each test, do one uncounted warmup, then LOOPS
529 			 * runs of the actual test.
			/* NOTE(review): the warmup juggle() call is elided here. */
532 			for (j = 0; j < LOOPS; j++)
533 				juggle_results[j] = juggle(fd1, fd2, p);
534 			process_juggle(fd1, fd2, p);
535 			for (j = 0; j < LOOPS; j++)
536 				process_results[j] = process_juggle(fd1, fd2,
538 			thread_juggle(fd1, fd2, p);
539 			for (j = 0; j < LOOPS; j++)
540 				thread_results[j] = thread_juggle(fd1, fd2,
			/*
			 * NOTE(review): this zeroing would discard the thread
			 * results just collected; presumably it sits in the
			 * #else arm of an elided preprocessor conditional —
			 * confirm against the full file.
			 */
542 			for (j = 0; j < LOOPS; j++) {
543 				thread_results[j].tv_sec = 0;
544 				thread_results[j].tv_nsec = 0;
550 		 * When printing results for the round, normalize the results
551 		 * with respect to the pipeline depth. We're doing p times
552 		 * as much work, and are we taking p times as long?
554 		for (i = 0; i < ipctypes_len; i++) {
555 			printf("%s, juggle, %d, ", ipctypes[i].it_name, p);
556 			for (j = 0; j < LOOPS; j++) {
559 				scale_timespec(&juggle_results[j], p);
				/*
				 * NOTE(review): tv_sec is time_t, not
				 * unsigned int — "%u" is a format mismatch
				 * on LP64; "%jd" with an (intmax_t) cast
				 * would be correct. Same in both printf
				 * calls below.
				 */
560 				printf("%u.%09lu", juggle_results[j].tv_sec,
561 				    juggle_results[j].tv_nsec);
564 			printf("%s, process_juggle, %d, ",
565 			    ipctypes[i].it_name, p);
566 			for (j = 0; j < LOOPS; j++) {
569 				scale_timespec(&process_results[j], p);
570 				printf("%u.%09lu", process_results[j].tv_sec,
571 				    process_results[j].tv_nsec);
574 			printf("%s, thread_juggle, %d, ",
575 			    ipctypes[i].it_name, p);
576 			for (j = 0; j < LOOPS; j++) {
579 				scale_timespec(&thread_results[j], p);
580 				printf("%u.%09lu", thread_results[j].tv_sec,
581 				    thread_results[j].tv_nsec);