2 * Copyright (c) 2007 Roman Divacky
3 * Copyright (c) 2014 Dmitry Chagin
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
31 #include "opt_compat.h"
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/imgact.h>
36 #include <sys/kernel.h>
37 #include <sys/limits.h>
39 #include <sys/mutex.h>
40 #include <sys/callout.h>
41 #include <sys/capsicum.h>
42 #include <sys/types.h>
45 #include <sys/filedesc.h>
46 #include <sys/filio.h>
47 #include <sys/errno.h>
48 #include <sys/event.h>
51 #include <sys/selinfo.h>
53 #include <sys/syscallsubr.h>
54 #include <sys/timespec.h>
57 #include <machine/../linux32/linux.h>
58 #include <machine/../linux32/linux32_proto.h>
60 #include <machine/../linux/linux.h>
61 #include <machine/../linux/linux_proto.h>
64 #include <compat/linux/linux_emul.h>
65 #include <compat/linux/linux_event.h>
66 #include <compat/linux/linux_file.h>
67 #include <compat/linux/linux_timer.h>
68 #include <compat/linux/linux_util.h>
71 * epoll defines 'struct epoll_event' with the field 'data' as 64 bits
72 * on all architectures. But on 32 bit architectures BSD 'struct kevent' only
73 * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied
74 * data verbatim. Therefore we allocate a 64-bit memory block to pass
75 * user supplied data for every file descriptor.
/* 64-bit container for the Linux epoll 'data' field (see comment above). */
78 typedef uint64_t epoll_udata_t;
/* Per-process table mapping a file descriptor to its registered epoll udata. */
80 struct epoll_emuldata {
81 uint32_t fdc; /* epoll udata max index */
82 epoll_udata_t udata[1]; /* epoll user data vector */
/* NOTE(review): the struct's closing brace is missing from this excerpt. */
/* Default table size, and bytes needed to index udata[] up to 'fdn'. */
85 #define EPOLL_DEF_SZ 16
86 #define EPOLL_SIZE(fdn) \
87 (sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t))
93 #if defined(__amd64__)
94 __attribute__((packed))
98 #define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
100 static void epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata);
101 static int epoll_to_kevent(struct thread *td, struct file *epfp,
102 int fd, struct epoll_event *l_event, int *kev_flags,
103 struct kevent *kevent, int *nkevents);
104 static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
105 static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
106 static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
107 static int epoll_delete_event(struct thread *td, struct file *epfp,
109 static int epoll_delete_all_events(struct thread *td, struct file *epfp,
112 struct epoll_copyin_args {
113 struct kevent *changelist;
116 struct epoll_copyout_args {
117 struct epoll_event *leventlist;
124 typedef uint64_t eventfd_t;
126 static fo_rdwr_t eventfd_read;
127 static fo_rdwr_t eventfd_write;
128 static fo_ioctl_t eventfd_ioctl;
129 static fo_poll_t eventfd_poll;
130 static fo_kqfilter_t eventfd_kqfilter;
131 static fo_stat_t eventfd_stat;
132 static fo_close_t eventfd_close;
133 static fo_fill_kinfo_t eventfd_fill_kinfo;
/*
 * File operations vector for Linux eventfd descriptors (DTYPE_LINUXEFD).
 * truncate/chmod/chown/sendfile are explicitly invalid; the fd is passable
 * over unix sockets (DFLAG_PASSABLE).
 */
135 static struct fileops eventfdops = {
136 .fo_read = eventfd_read,
137 .fo_write = eventfd_write,
138 .fo_truncate = invfo_truncate,
139 .fo_ioctl = eventfd_ioctl,
140 .fo_poll = eventfd_poll,
141 .fo_kqfilter = eventfd_kqfilter,
142 .fo_stat = eventfd_stat,
143 .fo_close = eventfd_close,
144 .fo_chmod = invfo_chmod,
145 .fo_chown = invfo_chown,
146 .fo_sendfile = invfo_sendfile,
147 .fo_fill_kinfo = eventfd_fill_kinfo,
148 .fo_flags = DFLAG_PASSABLE
151 static void filt_eventfddetach(struct knote *kn);
152 static int filt_eventfdread(struct knote *kn, long hint);
153 static int filt_eventfdwrite(struct knote *kn, long hint);
/* kqueue filter ops for EVFILT_READ on an eventfd. */
155 static struct filterops eventfd_rfiltops = {
157 .f_detach = filt_eventfddetach,
158 .f_event = filt_eventfdread
/* kqueue filter ops for EVFILT_WRITE on an eventfd. */
160 static struct filterops eventfd_wfiltops = {
162 .f_detach = filt_eventfddetach,
163 .f_event = filt_eventfdwrite
167 typedef uint64_t timerfd_t;
169 static fo_rdwr_t timerfd_read;
170 static fo_poll_t timerfd_poll;
171 static fo_kqfilter_t timerfd_kqfilter;
172 static fo_stat_t timerfd_stat;
173 static fo_close_t timerfd_close;
174 static fo_fill_kinfo_t timerfd_fill_kinfo;
/*
 * File operations vector for Linux timerfd descriptors (DTYPE_LINUXTFD).
 * Write is invalid; .fo_ioctl deliberately reuses eventfd_ioctl, which
 * accepts both DTYPE_LINUXEFD and DTYPE_LINUXTFD files.
 */
176 static struct fileops timerfdops = {
177 .fo_read = timerfd_read,
178 .fo_write = invfo_rdwr,
179 .fo_truncate = invfo_truncate,
180 .fo_ioctl = eventfd_ioctl,
181 .fo_poll = timerfd_poll,
182 .fo_kqfilter = timerfd_kqfilter,
183 .fo_stat = timerfd_stat,
184 .fo_close = timerfd_close,
185 .fo_chmod = invfo_chmod,
186 .fo_chown = invfo_chown,
187 .fo_sendfile = invfo_sendfile,
188 .fo_fill_kinfo = timerfd_fill_kinfo,
189 .fo_flags = DFLAG_PASSABLE
192 static void filt_timerfddetach(struct knote *kn);
193 static int filt_timerfdread(struct knote *kn, long hint);
/* kqueue filter ops for EVFILT_READ on a timerfd. */
195 static struct filterops timerfd_rfiltops = {
197 .f_detach = filt_timerfddetach,
198 .f_event = filt_timerfdread
204 struct selinfo efd_sel;
209 clockid_t tfd_clockid;
210 struct itimerspec tfd_time;
211 struct callout tfd_callout;
214 struct selinfo tfd_sel;
218 static int eventfd_create(struct thread *td, uint32_t initval, int flags);
219 static void linux_timerfd_expire(void *);
220 static void linux_timerfd_curval(struct timerfd *, struct itimerspec *);
/*
 * Record the epoll userdata for descriptor 'fd' in the per-process
 * emulation data, allocating or growing the udata vector as needed.
 * Holds the pemuldata exclusive lock across the update.
 */
224 epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata)
226 struct linux_pemuldata *pem;
227 struct epoll_emuldata *emd;
/* NOTE(review): the pem lookup line is missing from this excerpt. */
233 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
235 LINUX_PEM_XLOCK(pem);
236 if (pem->epoll == NULL) {
/* First use by this process: allocate a vector big enough for 'fd'. */
237 emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
/* Existing vector too small: grow it; realloc preserves prior entries. */
243 emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK);
248 emd->udata[fd] = udata;
249 LINUX_PEM_XUNLOCK(pem);
/*
 * Common body of epoll_create(2)/epoll_create1(2): an epoll fd is backed
 * by a native kqueue.  The udata vector is pre-sized by passing
 * EPOLL_DEF_SZ as the fd-index argument.
 */
253 epoll_create_common(struct thread *td, int flags)
257 error = kern_kqueue(td, flags, NULL);
261 epoll_fd_install(td, EPOLL_DEF_SZ, 0);
/* epoll_create(2) entry point; the size argument carries no meaning. */
267 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
271 * args->size is unused. Linux just tests it
272 * and then forgets it as well.
277 return (epoll_create_common(td, 0));
/*
 * epoll_create1(2) entry point: validate flags and translate
 * LINUX_O_CLOEXEC to the native kqueue creation flag.
 */
281 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
285 if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
289 if ((args->flags & LINUX_O_CLOEXEC) != 0)
292 return (epoll_create_common(td, flags));
295 /* Structure converting function from epoll to kevent. */
297 epoll_to_kevent(struct thread *td, struct file *epfp,
298 int fd, struct epoll_event *l_event, int *kev_flags,
299 struct kevent *kevent, int *nkevents)
301 uint32_t levents = l_event->events;
302 struct linux_pemuldata *pem;
305 /* flags related to how event is registered */
306 if ((levents & LINUX_EPOLLONESHOT) != 0)
307 *kev_flags |= EV_ONESHOT;
308 if ((levents & LINUX_EPOLLET) != 0)
309 *kev_flags |= EV_CLEAR;
310 if ((levents & LINUX_EPOLLERR) != 0)
311 *kev_flags |= EV_ERROR;
312 if ((levents & LINUX_EPOLLRDHUP) != 0)
313 *kev_flags |= EV_EOF;
315 /* flags related to what event is registered */
/* A single epoll event may expand to two kevents (read and write). */
316 if ((levents & LINUX_EPOLL_EVRD) != 0) {
317 EV_SET(kevent++, fd, EVFILT_READ, *kev_flags, 0, 0, 0);
320 if ((levents & LINUX_EPOLL_EVWR) != 0) {
321 EV_SET(kevent++, fd, EVFILT_WRITE, *kev_flags, 0, 0, 0);
/* Unsupported epoll flags: warn once per process, then keep going. */
325 if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
329 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
330 KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n"));
332 LINUX_PEM_XLOCK(pem);
333 if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
334 pem->flags |= LINUX_XUNSUP_EPOLL;
335 LINUX_PEM_XUNLOCK(pem);
336 linux_msg(td, "epoll_ctl unsupported flags: 0x%x\n",
339 LINUX_PEM_XUNLOCK(pem);
347 * Structure converting function from kevent to epoll. In a case
348 * this is called on error in registration we store the error in
349 * event->data and pick it up later in linux_epoll_ctl().
352 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
/* Registration failure: report EPOLLERR (error code handling lines
 * appear to be missing from this excerpt). */
355 if ((kevent->flags & EV_ERROR) != 0) {
356 l_event->events = LINUX_EPOLLERR;
360 /* XXX EPOLLPRI, EPOLLHUP */
361 switch (kevent->filter) {
363 l_event->events = LINUX_EPOLLIN;
/* EV_EOF on a read filter maps to the peer-hangup indication. */
364 if ((kevent->flags & EV_EOF) != 0)
365 l_event->events |= LINUX_EPOLLRDHUP;
368 l_event->events = LINUX_EPOLLOUT;
374 * Copyout callback used by kevent. This converts kevent
375 * events to epoll events and copies them back to the
376 * userspace. This is also called on error on registering
380 epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
382 struct epoll_copyout_args *args;
383 struct linux_pemuldata *pem;
384 struct epoll_emuldata *emd;
385 struct epoll_event *eep;
388 args = (struct epoll_copyout_args*) arg;
389 eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);
391 pem = pem_find(args->p);
392 KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
/* Shared lock: only reading the per-process udata vector here. */
393 LINUX_PEM_SLOCK(pem);
395 KASSERT(emd != NULL, ("epoll proc epolldata not found.\n"));
397 for (i = 0; i < count; i++) {
398 kevent_to_epoll(&kevp[i], &eep[i]);
401 KASSERT(fd <= emd->fdc, ("epoll user data vector"
402 " is too small.\n"));
/* Restore the 64-bit userdata saved at epoll_ctl() time. */
403 eep[i].data = emd->udata[fd];
405 LINUX_PEM_SUNLOCK(pem);
407 error = copyout(eep, args->leventlist, count * sizeof(*eep));
409 args->leventlist += count;
410 args->count += count;
/* Only the first copyout error is preserved in args->error. */
411 } else if (args->error == 0)
419 * Copyin callback used by kevent. This copies already
420 * converted filters from kernel memory to the kevent
421 * internal kernel memory. Hence the memcpy instead of
425 epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
427 struct epoll_copyin_args *args;
429 args = (struct epoll_copyin_args*) arg;
/* Source is kernel memory (pre-converted kevents), so plain memcpy. */
431 memcpy(kevp, args->changelist, count * sizeof(*kevp));
432 args->changelist += count;
438 * Load epoll filter, convert it to kevent filter
439 * and load it into kevent subsystem.
442 linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
444 struct file *epfp, *fp;
445 struct epoll_copyin_args ciargs;
446 struct kevent kev[2];
447 struct kevent_copyops k_ops = { &ciargs,
450 struct epoll_event le;
/* DEL needs no event structure; ADD/MOD copy it in from userspace. */
456 if (args->op != LINUX_EPOLL_CTL_DEL) {
457 error = copyin(args->event, &le, sizeof(le));
462 error = fget(td, args->epfd,
463 cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp);
466 if (epfp->f_type != DTYPE_KQUEUE) {
471 /* Protect user data vector from incorrectly supplied fd. */
472 error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp);
476 /* Linux disallows an epoll fd monitoring itself. */
482 ciargs.changelist = kev;
484 if (args->op != LINUX_EPOLL_CTL_DEL) {
485 kev_flags = EV_ADD | EV_ENABLE;
486 error = epoll_to_kevent(td, epfp, args->fd, &le,
487 &kev_flags, kev, &nchanges);
493 case LINUX_EPOLL_CTL_MOD:
/* MOD is implemented as delete-then-re-add. */
494 error = epoll_delete_all_events(td, epfp, args->fd);
499 case LINUX_EPOLL_CTL_ADD:
501 * kqueue_register() returns ENOENT if the event does not exist
502 * and the EV_ADD flag is not set.
/* Probe without EV_ADD so an already-registered fd is detected. */
504 kev[0].flags &= ~EV_ADD;
505 error = kqfd_register(args->epfd, &kev[0], td, 1);
506 if (error != ENOENT) {
511 kev[0].flags |= EV_ADD;
514 case LINUX_EPOLL_CTL_DEL:
515 /* CTL_DEL means unregister this fd with this epoll */
516 error = epoll_delete_all_events(td, epfp, args->fd);
/* Save the user's 64-bit data before registering the kevents. */
524 epoll_fd_install(td, args->fd, le.data);
526 error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);
537 * Wait for a filter to be triggered on the epoll file descriptor.
540 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
541 int maxevents, int timeout, sigset_t *uset)
543 struct epoll_copyout_args coargs;
544 struct kevent_copyops k_ops = { &coargs,
547 struct timespec ts, *tsp;
553 if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
556 error = fget(td, epfd,
557 cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp);
560 if (epfp->f_type != DTYPE_KQUEUE) {
/* epoll_pwait(2): install the caller's signal mask for the wait. */
565 error = kern_sigprocmask(td, SIG_SETMASK, uset,
569 td->td_pflags |= TDP_OLDMASK;
571 * Make sure that ast() is called on return to
572 * usermode and TDP_OLDMASK is cleared, restoring old
576 td->td_flags |= TDF_ASTPENDING;
581 coargs.leventlist = events;
582 coargs.p = td->td_proc;
/* NOTE(review): the timeout<0 (infinite wait) branch appears to be
 * missing from this excerpt. */
591 /* Convert from milliseconds to timespec. */
592 ts.tv_sec = timeout / 1000;
593 ts.tv_nsec = (timeout % 1000) * 1000000;
599 error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
600 if (error == 0 && coargs.error != 0)
601 error = coargs.error;
604 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
605 * Maybe we should translate that but I don't think it matters at all.
608 td->td_retval[0] = coargs.count;
/* Restore the signal mask saved above before returning. */
612 error = kern_sigprocmask(td, SIG_SETMASK, &omask,
/* epoll_wait(2): epoll_pwait without a signal mask. */
620 linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
623 return (linux_epoll_wait_common(td, args->epfd, args->events,
624 args->maxevents, args->timeout, NULL));
/*
 * epoll_pwait(2): copy in the optional Linux signal mask, convert it to
 * the native representation and pass it to the common wait routine.
 */
628 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
630 sigset_t mask, *pmask;
634 if (args->mask != NULL) {
635 if (args->sigsetsize != sizeof(l_sigset_t))
637 error = copyin(args->mask, &lmask, sizeof(l_sigset_t));
640 linux_to_bsd_sigset(&lmask, &mask);
644 return (linux_epoll_wait_common(td, args->epfd, args->events,
645 args->maxevents, args->timeout, pmask));
/* Remove one kevent filter (read or write) for 'fd' from the kqueue. */
649 epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter)
651 struct epoll_copyin_args ciargs;
653 struct kevent_copyops k_ops = { &ciargs,
657 ciargs.changelist = &kev;
658 EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0);
660 return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL));
/*
 * Drop both the read and the write filter for 'fd'.  A missing filter is
 * not an error: success is reported if either delete succeeded.
 */
664 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
668 error1 = epoll_delete_event(td, epfp, fd, EVFILT_READ);
669 error2 = epoll_delete_event(td, epfp, fd, EVFILT_WRITE);
671 /* return 0 if at least one result positive */
672 return (error1 == 0 ? 0 : error2);
/*
 * Common body of eventfd(2)/eventfd2(2): allocate a file, the eventfd
 * softc with its counter, mutex and knote list, and return the new fd.
 */
676 eventfd_create(struct thread *td, uint32_t initval, int flags)
678 struct filedesc *fdp;
681 int fflags, fd, error;
684 if ((flags & LINUX_O_CLOEXEC) != 0)
687 fdp = td->td_proc->p_fd;
688 error = falloc(td, &fp, &fd, fflags);
692 efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO);
693 efd->efd_flags = flags;
694 efd->efd_count = initval;
695 mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF);
/* knote list shares the eventfd mutex so filters run locked. */
697 knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock);
699 fflags = FREAD | FWRITE;
700 if ((flags & LINUX_O_NONBLOCK) != 0)
703 finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops);
706 td->td_retval[0] = fd;
/* eventfd(2): no flags argument in the legacy syscall. */
711 linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
714 return (eventfd_create(td, args->initval, 0));
/* eventfd2(2): validate the flag set before creating the eventfd. */
718 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
721 if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|LINUX_EFD_SEMAPHORE)) != 0)
724 return (eventfd_create(td, args->initval, args->flags));
/* fo_close for eventfd: tear down select/kqueue state and free the softc. */
728 eventfd_close(struct file *fp, struct thread *td)
733 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
736 seldrain(&efd->efd_sel);
737 knlist_destroy(&efd->efd_sel.si_note);
/* Point the file at badfileops so late callers fail cleanly. */
739 fp->f_ops = &badfileops;
740 mtx_destroy(&efd->efd_lock);
/*
 * fo_read for eventfd: block (or EAGAIN under FNONBLOCK) until the
 * counter is nonzero, then return 8 bytes.  In EFD_SEMAPHORE mode the
 * returned value is 1 per read; otherwise the whole counter is drained.
 */
747 eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
748 int flags, struct thread *td)
755 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
758 if (uio->uio_resid < sizeof(eventfd_t))
762 mtx_lock(&efd->efd_lock);
764 if (efd->efd_count == 0) {
765 if ((fp->f_flag & FNONBLOCK) != 0) {
766 mtx_unlock(&efd->efd_lock);
769 error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0);
774 if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) {
778 count = efd->efd_count;
/* Wake writers/pollers now that the counter has room again. */
781 KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
782 selwakeup(&efd->efd_sel);
783 wakeup(&efd->efd_count);
784 mtx_unlock(&efd->efd_lock);
785 error = uiomove(&count, sizeof(eventfd_t), uio);
787 mtx_unlock(&efd->efd_lock);
/*
 * fo_write for eventfd: add the written 64-bit value to the counter,
 * blocking (or returning EAGAIN) while the addition would overflow.
 * UINT64_MAX itself is an invalid value per the eventfd contract.
 */
793 eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
794 int flags, struct thread *td)
801 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
804 if (uio->uio_resid < sizeof(eventfd_t))
807 error = uiomove(&count, sizeof(eventfd_t), uio)
810 if (count == UINT64_MAX)
813 mtx_lock(&efd->efd_lock);
815 if (UINT64_MAX - efd->efd_count <= count) {
816 if ((fp->f_flag & FNONBLOCK) != 0) {
817 mtx_unlock(&efd->efd_lock);
818 /* Do not return the number of bytes written. */
819 uio->uio_resid += sizeof(eventfd_t);
822 error = mtx_sleep(&efd->efd_count, &efd->efd_lock,
823 PCATCH, "lefdwr", 0);
828 efd->efd_count += count;
/* Wake readers/pollers: the counter is now nonzero. */
829 KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
830 selwakeup(&efd->efd_sel);
831 wakeup(&efd->efd_count);
833 mtx_unlock(&efd->efd_lock);
/*
 * fo_poll for eventfd: readable when the counter is nonzero, writable
 * while at least 1 can still be added without overflow.
 */
839 eventfd_poll(struct file *fp, int events, struct ucred *active_cred,
846 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
849 mtx_lock(&efd->efd_lock);
850 if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0)
851 revents |= events & (POLLIN|POLLRDNORM);
852 if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count)
853 revents |= events & (POLLOUT|POLLWRNORM);
/* Nothing ready: record this thread for a later selwakeup(). */
855 selrecord(td, &efd->efd_sel);
856 mtx_unlock(&efd->efd_lock);
/* fo_kqfilter for eventfd: attach read/write filters to the knote list. */
863 eventfd_kqfilter(struct file *fp, struct knote *kn)
868 if (fp->f_type != DTYPE_LINUXEFD || efd == NULL)
871 mtx_lock(&efd->efd_lock);
872 switch (kn->kn_filter) {
874 kn->kn_fop = &eventfd_rfiltops;
877 kn->kn_fop = &eventfd_wfiltops;
/* Unsupported filter type: reject (error path lines not visible here). */
880 mtx_unlock(&efd->efd_lock);
885 knlist_add(&efd->efd_sel.si_note, kn, 1);
886 mtx_unlock(&efd->efd_lock);
/* kqueue detach: remove the knote under the eventfd lock. */
892 filt_eventfddetach(struct knote *kn)
894 struct eventfd *efd = kn->kn_hook;
896 mtx_lock(&efd->efd_lock);
897 knlist_remove(&efd->efd_sel.si_note, kn, 1);
898 mtx_unlock(&efd->efd_lock);
/* EVFILT_READ event test: ready when the counter is nonzero. */
903 filt_eventfdread(struct knote *kn, long hint)
905 struct eventfd *efd = kn->kn_hook;
908 mtx_assert(&efd->efd_lock, MA_OWNED);
909 ret = (efd->efd_count > 0);
/* EVFILT_WRITE event test: ready while 1 can be added without overflow. */
916 filt_eventfdwrite(struct knote *kn, long hint)
918 struct eventfd *efd = kn->kn_hook;
921 mtx_assert(&efd->efd_lock, MA_OWNED);
922 ret = (UINT64_MAX - 1 > efd->efd_count);
/*
 * fo_ioctl shared by eventfd and timerfd files (see timerfdops):
 * supports toggling FNONBLOCK (FIONBIO-style requests).
 */
929 eventfd_ioctl(struct file *fp, u_long cmd, void *data,
930 struct ucred *active_cred, struct thread *td)
933 if (fp->f_data == NULL || (fp->f_type != DTYPE_LINUXEFD &&
934 fp->f_type != DTYPE_LINUXTFD))
941 atomic_set_int(&fp->f_flag, FNONBLOCK);
943 atomic_clear_int(&fp->f_flag, FNONBLOCK);
953 eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
/* fo_fill_kinfo: eventfd files are reported as an unknown file type. */
962 eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
965 kif->kf_type = KF_TYPE_UNKNOWN;
/*
 * timerfd_create(2): validate flags and clock, allocate the timerfd
 * softc with its mutex, callout and knote list, and return the new fd.
 * Only CLOCK_REALTIME and CLOCK_MONOTONIC are supported.
 */
970 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
972 struct filedesc *fdp;
976 int fflags, fd, error;
978 if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
981 error = linux_to_native_clockid(&clockid, args->clockid);
984 if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
988 if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
991 fdp = td->td_proc->p_fd;
992 error = falloc(td, &fp, &fd, fflags);
996 tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
997 tfd->tfd_clockid = clockid;
998 mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
/* The expiry callout and knote list both run under the timerfd mutex. */
1000 callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
1001 knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
1004 if ((args->flags & LINUX_O_NONBLOCK) != 0)
1005 fflags |= FNONBLOCK;
1007 finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
1010 td->td_retval[0] = fd;
/*
 * fo_close for timerfd: disarm the timer, drain the callout, and tear
 * down select/kqueue state before freeing the softc.
 */
1015 timerfd_close(struct file *fp, struct thread *td)
1017 struct timerfd *tfd;
1020 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
1023 timespecclear(&tfd->tfd_time.it_value);
1024 timespecclear(&tfd->tfd_time.it_interval);
1026 mtx_lock(&tfd->tfd_lock);
1027 callout_drain(&tfd->tfd_callout);
1028 mtx_unlock(&tfd->tfd_lock);
1030 seldrain(&tfd->tfd_sel);
1031 knlist_destroy(&tfd->tfd_sel.si_note);
/* Point the file at badfileops so late callers fail cleanly. */
1033 fp->f_ops = &badfileops;
1034 mtx_destroy(&tfd->tfd_lock);
/*
 * fo_read for timerfd: block (or EAGAIN under FNONBLOCK) until at least
 * one expiration has occurred, then return the 8-byte expiration count.
 * A canceled timer reports an error instead (code not visible here).
 */
1041 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
1042 int flags, struct thread *td)
1044 struct timerfd *tfd;
1049 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
1052 if (uio->uio_resid < sizeof(timerfd_t))
1056 mtx_lock(&tfd->tfd_lock);
1058 if (tfd->tfd_canceled) {
1060 mtx_unlock(&tfd->tfd_lock);
1063 if (tfd->tfd_count == 0) {
1064 if ((fp->f_flag & FNONBLOCK) != 0) {
1065 mtx_unlock(&tfd->tfd_lock);
1068 error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
1073 count = tfd->tfd_count;
/* NOTE(review): the counter reset line is not visible in this excerpt. */
1075 mtx_unlock(&tfd->tfd_lock);
1076 error = uiomove(&count, sizeof(timerfd_t), uio);
1078 mtx_unlock(&tfd->tfd_lock);
/* fo_poll for timerfd: readable once at least one expiration is pending. */
1084 timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
1087 struct timerfd *tfd;
1091 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
1094 mtx_lock(&tfd->tfd_lock);
1095 if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
1096 revents |= events & (POLLIN|POLLRDNORM);
/* Not ready: record this thread for a later selwakeup(). */
1098 selrecord(td, &tfd->tfd_sel);
1099 mtx_unlock(&tfd->tfd_lock);
/* fo_kqfilter for timerfd: only EVFILT_READ is supported. */
1106 timerfd_kqfilter(struct file *fp, struct knote *kn)
1108 struct timerfd *tfd;
1111 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
1114 if (kn->kn_filter == EVFILT_READ)
1115 kn->kn_fop = &timerfd_rfiltops;
1120 knlist_add(&tfd->tfd_sel.si_note, kn, 0);
/* kqueue detach: remove the knote under the timerfd lock. */
1126 filt_timerfddetach(struct knote *kn)
1128 struct timerfd *tfd = kn->kn_hook;
1130 mtx_lock(&tfd->tfd_lock);
1131 knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
1132 mtx_unlock(&tfd->tfd_lock);
/* EVFILT_READ event test: ready when an expiration is pending. */
1137 filt_timerfdread(struct knote *kn, long hint)
1139 struct timerfd *tfd = kn->kn_hook;
1141 return (tfd->tfd_count > 0);
1146 timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
/* fo_fill_kinfo: timerfd files are reported as an unknown file type. */
1155 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
1158 kif->kf_type = KF_TYPE_UNKNOWN;
/* Read the current time on the clock this timerfd was created with. */
1163 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
1166 if (tfd->tfd_clockid == CLOCK_REALTIME)
1168 else /* CLOCK_MONOTONIC */
/*
 * Compute the timer's current value (time remaining until expiry) from
 * its stored absolute expiry time.  An already-expired but still-armed
 * timer is reported as 1 nanosecond remaining rather than disarmed.
 */
1173 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
1175 struct timespec cts;
1177 linux_timerfd_clocktime(tfd, &cts);
1178 *ots = tfd->tfd_time;
1179 if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
1180 timespecsub(&ots->it_value, &cts);
1181 if (ots->it_value.tv_sec < 0 ||
1182 (ots->it_value.tv_sec == 0 &&
1183 ots->it_value.tv_nsec == 0)) {
1184 ots->it_value.tv_sec  = 0;
1185 ots->it_value.tv_nsec = 1;
/*
 * timerfd_gettime(2): snapshot the remaining time under the timerfd
 * lock, convert to the Linux itimerspec layout and copy it out.
 */
1191 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
1193 cap_rights_t rights;
1194 struct l_itimerspec lots;
1195 struct itimerspec ots;
1196 struct timerfd *tfd;
1200 error = fget(td, args->fd, cap_rights_init(&rights, CAP_READ), &fp);
1204 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
1209 mtx_lock(&tfd->tfd_lock);
1210 linux_timerfd_curval(tfd, &ots);
1211 mtx_unlock(&tfd->tfd_lock);
1213 error = native_to_linux_itimerspec(&lots, &ots);
1215 error = copyout(&lots, args->old_value, sizeof(lots));
/*
 * timerfd_settime(2): install a new timer value (relative values are
 * converted to absolute on the timerfd's clock), arm or stop the expiry
 * callout accordingly, and optionally return the previous value.
 */
1223 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
1225 struct l_itimerspec lots;
1226 struct itimerspec nts, ots;
1227 struct timespec cts, ts;
1228 cap_rights_t rights;
1229 struct timerfd *tfd;
1234 if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
1237 error = copyin(args->new_value, &lots, sizeof(lots));
1240 error = linux_to_native_itimerspec(&nts, &lots);
1244 error = fget(td, args->fd, cap_rights_init(&rights, CAP_WRITE), &fp);
1248 if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
1253 mtx_lock(&tfd->tfd_lock);
/* A zero it_value disarms the timer; the interval is then meaningless. */
1254 if (!timespecisset(&nts.it_value))
1255 timespecclear(&nts.it_interval);
/* Capture the old value while still locked, copy it out after unlock. */
1256 if (args->old_value != NULL)
1257 linux_timerfd_curval(tfd, &ots);
1259 tfd->tfd_time = nts;
1260 if (timespecisset(&nts.it_value)) {
1261 linux_timerfd_clocktime(tfd, &cts);
/* Without TFD_TIMER_ABSTIME the new value is relative to "now". */
1263 if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
1264 timespecadd(&tfd->tfd_time.it_value, &cts);
1266 timespecsub(&ts, &cts);
1268 TIMESPEC_TO_TIMEVAL(&tv, &ts);
1269 callout_reset(&tfd->tfd_callout, tvtohz(&tv),
1270 linux_timerfd_expire, tfd);
1271 tfd->tfd_canceled = false;
1273 tfd->tfd_canceled = true;
1274 callout_stop(&tfd->tfd_callout);
1276 mtx_unlock(&tfd->tfd_lock);
1278 if (args->old_value != NULL) {
1279 error = native_to_linux_itimerspec(&lots, &ots);
1281 error = copyout(&lots, args->old_value, sizeof(lots));
1290 linux_timerfd_expire(void *arg)
1292 struct timespec cts, ts;
1294 struct timerfd *tfd;
1296 tfd = (struct timerfd *)arg;
1298 linux_timerfd_clocktime(tfd, &cts);
1299 if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
1300 if (timespecisset(&tfd->tfd_time.it_interval))
1301 timespecadd(&tfd->tfd_time.it_value,
1302 &tfd->tfd_time.it_interval);
1304 /* single shot timer */
1305 timespecclear(&tfd->tfd_time.it_value);
1306 if (timespecisset(&tfd->tfd_time.it_value)) {
1307 ts = tfd->tfd_time.it_value;
1308 timespecsub(&ts, &cts);
1309 TIMESPEC_TO_TIMEVAL(&tv, &ts);
1310 callout_reset(&tfd->tfd_callout, tvtohz(&tv),
1311 linux_timerfd_expire, tfd);
1314 KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
1315 selwakeup(&tfd->tfd_sel);
1316 wakeup(&tfd->tfd_count);
1317 } else if (timespecisset(&tfd->tfd_time.it_value)) {
1318 ts = tfd->tfd_time.it_value;
1319 timespecsub(&ts, &cts);
1320 TIMESPEC_TO_TIMEVAL(&tv, &ts);
1321 callout_reset(&tfd->tfd_callout, tvtohz(&tv),
1322 linux_timerfd_expire, tfd);