sys/kern/kern_event.c

   1 /*-
   2  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
   3  * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  25  * SUCH DAMAGE.
  26  */
  27
  28 #include <sys/cdefs.h>
  29 __FBSDID("$FreeBSD$");
  30
  31 #include <sys/param.h>
  32 #include <sys/systm.h>
  33 #include <sys/kernel.h>
  34 #include <sys/lock.h>
  35 #include <sys/mutex.h>
  36 #include <sys/proc.h>
  37 #include <sys/malloc.h>
  38 #include <sys/unistd.h>
  39 #include <sys/file.h>
  40 #include <sys/filedesc.h>
  41 #include <sys/filio.h>
  42 #include <sys/fcntl.h>
  43 #include <sys/kthread.h>
  44 #include <sys/selinfo.h>
  45 #include <sys/queue.h>
  46 #include <sys/event.h>
  47 #include <sys/eventvar.h>
  48 #include <sys/poll.h>
  49 #include <sys/protosw.h>
  50 #include <sys/sigio.h>
  51 #include <sys/signalvar.h>
  52 #include <sys/socket.h>
  53 #include <sys/socketvar.h>
  54 #include <sys/stat.h>
  55 #include <sys/sysctl.h>
  56 #include <sys/sysproto.h>
  57 #include <sys/taskqueue.h>
  58 #include <sys/uio.h>
  59
  60 #include <vm/uma.h>
  61
  62 static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
  63
  64 /*
  65  * This lock is used if multiple kq locks are required.  This possibly
  66  * should be made into a per proc lock.
  67  */
  68 static struct mtx       kq_global;
  69 MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
  70 #define KQ_GLOBAL_LOCK(lck, haslck)     do {    \
  71         if (!haslck)                            \
  72                 mtx_lock(lck);                  \
  73         haslck = 1;                             \
  74 } while (0)
  75 #define KQ_GLOBAL_UNLOCK(lck, haslck)   do {    \
  76         if (haslck)                             \
  77                 mtx_unlock(lck);                        \
  78         haslck = 0;                             \
  79 } while (0)
  80
  81 TASKQUEUE_DEFINE_THREAD(kqueue);
  82
  83 static int      kqueue_aquire(struct file *fp, struct kqueue **kqp);
  84 static void     kqueue_release(struct kqueue *kq, int locked);
  85 static int      kqueue_expand(struct kqueue *kq, struct filterops *fops,
  86                     uintptr_t ident, int waitok);
  87 static void     kqueue_task(void *arg, int pending);
  88 static int      kqueue_scan(struct kqueue *kq, int maxevents,
  89                     struct kevent *ulistp, const struct timespec *timeout,
  90                     struct kevent *keva, struct thread *td);
  91 static void     kqueue_wakeup(struct kqueue *kq);
  92 static struct filterops *kqueue_fo_find(int filt);
  93 static void     kqueue_fo_release(int filt);
  94
  95 static fo_rdwr_t        kqueue_read;
  96 static fo_rdwr_t        kqueue_write;
  97 static fo_ioctl_t       kqueue_ioctl;
  98 static fo_poll_t        kqueue_poll;
  99 static fo_kqfilter_t    kqueue_kqfilter;
 100 static fo_stat_t        kqueue_stat;
 101 static fo_close_t       kqueue_close;
 102
 103 static struct fileops kqueueops = {
 104         .fo_read = kqueue_read,
 105         .fo_write = kqueue_write,
 106         .fo_ioctl = kqueue_ioctl,
 107         .fo_poll = kqueue_poll,
 108         .fo_kqfilter = kqueue_kqfilter,
 109         .fo_stat = kqueue_stat,
 110         .fo_close = kqueue_close,
 111 };
 112
 113 static int      knote_attach(struct knote *kn, struct kqueue *kq);
 114 static void     knote_drop(struct knote *kn, struct thread *td);
 115 static void     knote_enqueue(struct knote *kn);
 116 static void     knote_dequeue(struct knote *kn);
 117 static void     knote_init(void);
 118 static struct   knote *knote_alloc(int waitok);
 119 static void     knote_free(struct knote *kn);
 120
 121 static void     filt_kqdetach(struct knote *kn);
 122 static int      filt_kqueue(struct knote *kn, long hint);
 123 static int      filt_procattach(struct knote *kn);
 124 static void     filt_procdetach(struct knote *kn);
 125 static int      filt_proc(struct knote *kn, long hint);
 126 static int      filt_fileattach(struct knote *kn);
 127 static void     filt_timerexpire(void *knx);
 128 static int      filt_timerattach(struct knote *kn);
 129 static void     filt_timerdetach(struct knote *kn);
 130 static int      filt_timer(struct knote *kn, long hint);
 131
 132 static struct filterops file_filtops =
 133         { 1, filt_fileattach, NULL, NULL };
 134 static struct filterops kqread_filtops =
 135         { 1, NULL, filt_kqdetach, filt_kqueue };
 136 /* XXX - move to kern_proc.c?  */
 137 static struct filterops proc_filtops =
 138         { 0, filt_procattach, filt_procdetach, filt_proc };
 139 static struct filterops timer_filtops =
 140         { 0, filt_timerattach, filt_timerdetach, filt_timer };
 141
 142 static uma_zone_t       knote_zone;
 143 static int              kq_ncallouts = 0;
 144 static int              kq_calloutmax = (4 * 1024);
 145 SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
 146     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
 147
 148 /* XXX - ensure not KN_INFLUX?? */
 149 #define KNOTE_ACTIVATE(kn, islock) do {                                 \
 150         if ((islock))                                                   \
 151                 mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);            \
 152         else                                                            \
 153                 KQ_LOCK((kn)->kn_kq);                                   \
 154         (kn)->kn_status |= KN_ACTIVE;                                   \
 155         if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)         \
 156                 knote_enqueue((kn));                                    \
 157         if (!(islock))                                                  \
 158                 KQ_UNLOCK((kn)->kn_kq);                                 \
 159 } while(0)
 160 #define KQ_LOCK(kq) do {                                                \
 161         mtx_lock(&(kq)->kq_lock);                                       \
 162 } while (0)
 163 #define KQ_FLUX_WAKEUP(kq) do {                                         \
 164         if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {            \
 165                 (kq)->kq_state &= ~KQ_FLUXWAIT;                         \
 166                 wakeup((kq));                                           \
 167         }                                                               \
 168 } while (0)
 169 #define KQ_UNLOCK_FLUX(kq) do {                                         \
 170         KQ_FLUX_WAKEUP(kq);                                             \
 171         mtx_unlock(&(kq)->kq_lock);                                     \
 172 } while (0)
 173 #define KQ_UNLOCK(kq) do {                                              \
 174         mtx_unlock(&(kq)->kq_lock);                                     \
 175 } while (0)
 176 #define KQ_OWNED(kq) do {                                               \
 177         mtx_assert(&(kq)->kq_lock, MA_OWNED);                           \
 178 } while (0)
 179 #define KQ_NOTOWNED(kq) do {                                            \
 180         mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);                        \
 181 } while (0)
 182 #define KN_LIST_LOCK(kn) do {                                           \
 183         if (kn->kn_knlist != NULL)                                      \
 184                 mtx_lock(kn->kn_knlist->kl_lock);                       \
 185 } while (0)
 186 #define KN_LIST_UNLOCK(kn) do {                                         \
 187         if (kn->kn_knlist != NULL)                                      \
 188                 mtx_unlock(kn->kn_knlist->kl_lock);                     \
 189 } while (0)
 190
 191 #define KN_HASHSIZE             64              /* XXX should be tunable */
 192 #define KN_HASH(val, mask)      (((val) ^ (val >> 8)) & (mask))
 193
 194 static int
 195 filt_nullattach(struct knote *kn)
 196 {
 197
 198         return (ENXIO);
 199 };
 200
 201 struct filterops null_filtops =
 202         { 0, filt_nullattach, NULL, NULL };
 203
 204 /* XXX - make SYSINIT to add these, and move into respective modules. */
 205 extern struct filterops sig_filtops;
 206 extern struct filterops fs_filtops;
 207
 208 /*
 209  * Table for for all system-defined filters.
 210  */
 211 static struct mtx       filterops_lock;
 212 MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
 213         MTX_DEF);
 214 static struct {
 215         struct filterops *for_fop;
 216         int for_refcnt;
 217 } sysfilt_ops[EVFILT_SYSCOUNT] = {
 218         { &file_filtops },                      /* EVFILT_READ */
 219         { &file_filtops },                      /* EVFILT_WRITE */
 220         { &null_filtops },                      /* EVFILT_AIO */
 221         { &file_filtops },                      /* EVFILT_VNODE */
 222         { &proc_filtops },                      /* EVFILT_PROC */
 223         { &sig_filtops },                       /* EVFILT_SIGNAL */
 224         { &timer_filtops },                     /* EVFILT_TIMER */
 225         { &file_filtops },                      /* EVFILT_NETDEV */
 226         { &fs_filtops },                        /* EVFILT_FS */
 227 };
 228
 229 /*
 230  * Simple redirection for all cdevsw style objects to call their fo_kqfilter
 231  * method.
 232  */
 233 static int
 234 filt_fileattach(struct knote *kn)
 235 {
 236
 237         return (fo_kqfilter(kn->kn_fp, kn));
 238 }
 239
 240 /*ARGSUSED*/
 241 static int
 242 kqueue_kqfilter(struct file *fp, struct knote *kn)
 243 {
 244         struct kqueue *kq = kn->kn_fp->f_data;
 245
 246         if (kn->kn_filter != EVFILT_READ)
 247                 return (EINVAL);
 248
 249         kn->kn_status |= KN_KQUEUE;
 250         kn->kn_fop = &kqread_filtops;
 251         knlist_add(&kq->kq_sel.si_note, kn, 0);
 252
 253         return (0);
 254 }
 255
 256 static void
 257 filt_kqdetach(struct knote *kn)
 258 {
 259         struct kqueue *kq = kn->kn_fp->f_data;
 260
 261         knlist_remove(&kq->kq_sel.si_note, kn, 0);
 262 }
 263
 264 /*ARGSUSED*/
 265 static int
 266 filt_kqueue(struct knote *kn, long hint)
 267 {
 268         struct kqueue *kq = kn->kn_fp->f_data;
 269
 270         kn->kn_data = kq->kq_count;
 271         return (kn->kn_data > 0);
 272 }
 273
 274 /* XXX - move to kern_proc.c?  */
 275 static int
 276 filt_procattach(struct knote *kn)
 277 {
 278         struct proc *p;
 279         int immediate;
 280         int error;
 281
 282         immediate = 0;
 283         p = pfind(kn->kn_id);
 284         if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
 285                 p = zpfind(kn->kn_id);
 286                 immediate = 1;
 287         } else if (p != NULL && (p->p_flag & P_WEXIT)) {
 288                 immediate = 1;
 289         }
 290
 291         if (p == NULL)
 292                 return (ESRCH);
 293         if ((error = p_cansee(curthread, p)))
 294                 return (error);
 295
 296         kn->kn_ptr.p_proc = p;
 297         kn->kn_flags |= EV_CLEAR;               /* automatically set */
 298
 299         /*
 300          * internal flag indicating registration done by kernel
 301          */
 302         if (kn->kn_flags & EV_FLAG1) {
 303                 kn->kn_data = kn->kn_sdata;             /* ppid */
 304                 kn->kn_fflags = NOTE_CHILD;
 305                 kn->kn_flags &= ~EV_FLAG1;
 306         }
 307
 308         if (immediate == 0)
 309                 knlist_add(&p->p_klist, kn, 1);
 310
 311         /*
 312          * Immediately activate any exit notes if the target process is a
 313          * zombie.  This is necessary to handle the case where the target
 314          * process, e.g. a child, dies before the kevent is registered.
 315          */
 316         if (immediate && filt_proc(kn, NOTE_EXIT))
 317                 KNOTE_ACTIVATE(kn, 0);
 318
 319         PROC_UNLOCK(p);
 320
 321         return (0);
 322 }
 323
 324 /*
 325  * The knote may be attached to a different process, which may exit,
 326  * leaving nothing for the knote to be attached to.  So when the process
 327  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
 328  * it will be deleted when read out.  However, as part of the knote deletion,
 329  * this routine is called, so a check is needed to avoid actually performing
 330  * a detach, because the original process does not exist any more.
 331  */
 332 /* XXX - move to kern_proc.c?  */
 333 static void
 334 filt_procdetach(struct knote *kn)
 335 {
 336         struct proc *p;
 337
 338         p = kn->kn_ptr.p_proc;
 339         knlist_remove(&p->p_klist, kn, 0);
 340         kn->kn_ptr.p_proc = NULL;
 341 }
 342
 343 /* XXX - move to kern_proc.c?  */
 344 static int
 345 filt_proc(struct knote *kn, long hint)
 346 {
 347         struct proc *p = kn->kn_ptr.p_proc;
 348         u_int event;
 349
 350         /*
 351          * mask off extra data
 352          */
 353         event = (u_int)hint & NOTE_PCTRLMASK;
 354
 355         /*
 356          * if the user is interested in this event, record it.
 357          */
 358         if (kn->kn_sfflags & event)
 359                 kn->kn_fflags |= event;
 360
 361         /*
 362          * process is gone, so flag the event as finished.
 363          */
 364         if (event == NOTE_EXIT) {
 365                 if (!(kn->kn_status & KN_DETACHED))
 366                         knlist_remove_inevent(&p->p_klist, kn);
 367                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
 368                 kn->kn_ptr.p_proc = NULL;
 369                 return (1);
 370         }
 371
 372         /*
 373          * process forked, and user wants to track the new process,
 374          * so attach a new knote to it, and immediately report an
 375          * event with the parent's pid.
 376          */
 377         if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
 378                 struct kevent kev;
 379                 int error;
 380
 381                 /*
 382                  * register knote with new process.
 383                  */
 384                 kev.ident = hint & NOTE_PDATAMASK;      /* pid */
 385                 kev.filter = kn->kn_filter;
 386                 kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
 387                 kev.fflags = kn->kn_sfflags;
 388                 kev.data = kn->kn_id;                   /* parent */
 389                 kev.udata = kn->kn_kevent.udata;        /* preserve udata */
 390                 error = kqueue_register(kn->kn_kq, &kev, NULL, 0);
 391                 if (error)
 392                         kn->kn_fflags |= NOTE_TRACKERR;
 393         }
 394
 395         return (kn->kn_fflags != 0);
 396 }
 397
 398 static int
 399 timertoticks(intptr_t data)
 400 {
 401         struct timeval tv;
 402         int tticks;
 403
 404         tv.tv_sec = data / 1000;
 405         tv.tv_usec = (data % 1000) * 1000;
 406         tticks = tvtohz(&tv);
 407
 408         return tticks;
 409 }
 410
 411 /* XXX - move to kern_timeout.c? */
 412 static void
 413 filt_timerexpire(void *knx)
 414 {
 415         struct knote *kn = knx;
 416         struct callout *calloutp;
 417
 418         kn->kn_data++;
 419         KNOTE_ACTIVATE(kn, 0);  /* XXX - handle locking */
 420
 421         if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
 422                 calloutp = (struct callout *)kn->kn_hook;
 423                 callout_reset(calloutp, timertoticks(kn->kn_sdata),
 424                     filt_timerexpire, kn);
 425         }
 426 }
 427
 428 /*
 429  * data contains amount of time to sleep, in milliseconds
 430  */
 431 /* XXX - move to kern_timeout.c? */
 432 static int
 433 filt_timerattach(struct knote *kn)
 434 {
 435         struct callout *calloutp;
 436
 437         atomic_add_int(&kq_ncallouts, 1);
 438
 439         if (kq_ncallouts >= kq_calloutmax) {
 440                 atomic_add_int(&kq_ncallouts, -1);
 441                 return (ENOMEM);
 442         }
 443
 444         kn->kn_flags |= EV_CLEAR;               /* automatically set */
 445         kn->kn_status &= ~KN_DETACHED;          /* knlist_add usually sets it */
 446         MALLOC(calloutp, struct callout *, sizeof(*calloutp),
 447             M_KQUEUE, M_WAITOK);
 448         callout_init(calloutp, CALLOUT_MPSAFE);
 449         kn->kn_hook = calloutp;
 450         callout_reset(calloutp, timertoticks(kn->kn_sdata), filt_timerexpire,
 451             kn);
 452
 453         return (0);
 454 }
 455
 456 /* XXX - move to kern_timeout.c? */
 457 static void
 458 filt_timerdetach(struct knote *kn)
 459 {
 460         struct callout *calloutp;
 461
 462         calloutp = (struct callout *)kn->kn_hook;
 463         callout_drain(calloutp);
 464         FREE(calloutp, M_KQUEUE);
 465         atomic_add_int(&kq_ncallouts, -1);
 466         kn->kn_status |= KN_DETACHED;   /* knlist_remove usually clears it */
 467 }
 468
 469 /* XXX - move to kern_timeout.c? */
 470 static int
 471 filt_timer(struct knote *kn, long hint)
 472 {
 473
 474         return (kn->kn_data != 0);
 475 }
 476
 477 /*
 478  * MPSAFE
 479  */
 480 int
 481 kqueue(struct thread *td, struct kqueue_args *uap)
 482 {
 483         struct filedesc *fdp;
 484         struct kqueue *kq;
 485         struct file *fp;
 486         int fd, error;
 487
 488         fdp = td->td_proc->p_fd;
 489         error = falloc(td, &fp, &fd);
 490         if (error)
 491                 goto done2;
 492
 493         /* An extra reference on `nfp' has been held for us by falloc(). */
 494         kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
 495         mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
 496         TAILQ_INIT(&kq->kq_head);
 497         kq->kq_fdp = fdp;
 498         knlist_init(&kq->kq_sel.si_note, &kq->kq_lock);
 499         TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
 500
 501         FILEDESC_LOCK_FAST(fdp);
 502         SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
 503         FILEDESC_UNLOCK_FAST(fdp);
 504
 505         FILE_LOCK(fp);
 506         fp->f_flag = FREAD | FWRITE;
 507         fp->f_type = DTYPE_KQUEUE;
 508         fp->f_ops = &kqueueops;
 509         fp->f_data = kq;
 510         FILE_UNLOCK(fp);
 511         fdrop(fp, td);
 512
 513         td->td_retval[0] = fd;
 514 done2:
 515         return (error);
 516 }
 517
 518 #ifndef _SYS_SYSPROTO_H_
 519 struct kevent_args {
 520         int     fd;
 521         const struct kevent *changelist;
 522         int     nchanges;
 523         struct  kevent *eventlist;
 524         int     nevents;
 525         const struct timespec *timeout;
 526 };
 527 #endif
 528 /*
 529  * MPSAFE
 530  */
 531 int
 532 kevent(struct thread *td, struct kevent_args *uap)
 533 {
 534         struct kevent keva[KQ_NEVENTS];
 535         struct kevent *kevp;
 536         struct kqueue *kq;
 537         struct file *fp;
 538         struct timespec ts;
 539         int i, n, nerrors, error;
 540
 541         if ((error = fget(td, uap->fd, &fp)) != 0)
 542                 return (error);
 543         if ((error = kqueue_aquire(fp, &kq)) != 0)
 544                 goto done_norel;
 545
 546         if (uap->timeout != NULL) {
 547                 error = copyin(uap->timeout, &ts, sizeof(ts));
 548                 if (error)
 549                         goto done;
 550                 uap->timeout = &ts;
 551         }
 552
 553         nerrors = 0;
 554
 555         while (uap->nchanges > 0) {
 556                 n = uap->nchanges > KQ_NEVENTS ? KQ_NEVENTS : uap->nchanges;
 557                 error = copyin(uap->changelist, keva,
 558                     n * sizeof *keva);
 559                 if (error)
 560                         goto done;
 561                 for (i = 0; i < n; i++) {
 562                         kevp = &keva[i];
 563                         kevp->flags &= ~EV_SYSFLAGS;
 564                         error = kqueue_register(kq, kevp, td, 1);
 565                         if (error) {
 566                                 if (uap->nevents != 0) {
 567                                         kevp->flags = EV_ERROR;
 568                                         kevp->data = error;
 569                                         (void) copyout(kevp,
 570                                             uap->eventlist,
 571                                             sizeof(*kevp));
 572                                         uap->eventlist++;
 573                                         uap->nevents--;
 574                                         nerrors++;
 575                                 } else {
 576                                         goto done;
 577                                 }
 578                         }
 579                 }
 580                 uap->nchanges -= n;
 581                 uap->changelist += n;
 582         }
 583         if (nerrors) {
 584                 td->td_retval[0] = nerrors;
 585                 error = 0;
 586                 goto done;
 587         }
 588
 589         error = kqueue_scan(kq, uap->nevents, uap->eventlist, uap->timeout,
 590             keva, td);
 591 done:
 592         kqueue_release(kq, 0);
 593 done_norel:
 594         if (fp != NULL)
 595                 fdrop(fp, td);
 596         return (error);
 597 }
 598
 599 int
 600 kqueue_add_filteropts(int filt, struct filterops *filtops)
 601 {
 602         int error;
 603
 604         if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
 605                 printf(
 606 "trying to add a filterop that is out of range: %d is beyond %d\n",
 607                     ~filt, EVFILT_SYSCOUNT);
 608                 return EINVAL;
 609         }
 610         mtx_lock(&filterops_lock);
 611         if (sysfilt_ops[~filt].for_fop != &null_filtops &&
 612             sysfilt_ops[~filt].for_fop != NULL)
 613                 error = EEXIST;
 614         else {
 615                 sysfilt_ops[~filt].for_fop = filtops;
 616                 sysfilt_ops[~filt].for_refcnt = 0;
 617         }
 618         mtx_unlock(&filterops_lock);
 619
 620         return (0);
 621 }
 622
 623 int
 624 kqueue_del_filteropts(int filt)
 625 {
 626         int error;
 627
 628         error = 0;
 629         if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
 630                 return EINVAL;
 631
 632         mtx_lock(&filterops_lock);
 633         if (sysfilt_ops[~filt].for_fop == &null_filtops ||
 634             sysfilt_ops[~filt].for_fop == NULL)
 635                 error = EINVAL;
 636         else if (sysfilt_ops[~filt].for_refcnt != 0)
 637                 error = EBUSY;
 638         else {
 639                 sysfilt_ops[~filt].for_fop = &null_filtops;
 640                 sysfilt_ops[~filt].for_refcnt = 0;
 641         }
 642         mtx_unlock(&filterops_lock);
 643
 644         return error;
 645 }
 646
 647 static struct filterops *
 648 kqueue_fo_find(int filt)
 649 {
 650
 651         if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
 652                 return NULL;
 653
 654         mtx_lock(&filterops_lock);
 655         sysfilt_ops[~filt].for_refcnt++;
 656         if (sysfilt_ops[~filt].for_fop == NULL)
 657                 sysfilt_ops[~filt].for_fop = &null_filtops;
 658         mtx_unlock(&filterops_lock);
 659
 660         return sysfilt_ops[~filt].for_fop;
 661 }
 662
 663 static void
 664 kqueue_fo_release(int filt)
 665 {
 666
 667         if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
 668                 return;
 669
 670         mtx_lock(&filterops_lock);
 671         KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
 672             ("filter object refcount not valid on release"));
 673         sysfilt_ops[~filt].for_refcnt--;
 674         mtx_unlock(&filterops_lock);
 675 }
 676
 677 /*
 678  * A ref to kq (obtained via kqueue_aquire) should be held.  waitok will
 679  * influence if memory allocation should wait.  Make sure it is 0 if you
 680  * hold any mutexes.
 681  */
 682 int
 683 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
 684 {
 685         struct filedesc *fdp;
 686         struct filterops *fops;
 687         struct file *fp;
 688         struct knote *kn, *tkn;
 689         int error, filt, event;
 690         int haskqglobal;
 691         int fd;
 692
 693         fdp = NULL;
 694         fp = NULL;
 695         kn = NULL;
 696         error = 0;
 697         haskqglobal = 0;
 698
 699         filt = kev->filter;
 700         fops = kqueue_fo_find(filt);
 701         if (fops == NULL)
 702                 return EINVAL;
 703
 704         tkn = knote_alloc(waitok);              /* prevent waiting with locks */
 705
 706 findkn:
 707         if (fops->f_isfd) {
 708                 KASSERT(td != NULL, ("td is NULL"));
 709                 fdp = td->td_proc->p_fd;
 710                 FILEDESC_LOCK(fdp);
 711                 /* validate descriptor */
 712                 fd = kev->ident;
 713                 if (fd < 0 || fd >= fdp->fd_nfiles ||
 714                     (fp = fdp->fd_ofiles[fd]) == NULL) {
 715                         FILEDESC_UNLOCK(fdp);
 716                         error = EBADF;
 717                         goto done;
 718                 }
 719                 fhold(fp);
 720
 721                 if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
 722                     kev->ident, 0) != 0) {
 723                         /* unlock and try again */
 724                         FILEDESC_UNLOCK(fdp);
 725                         fdrop(fp, td);
 726                         fp = NULL;
 727                         error = kqueue_expand(kq, fops, kev->ident, waitok);
 728                         if (error)
 729                                 goto done;
 730                         goto findkn;
 731                 }
 732
 733                 if (fp->f_type == DTYPE_KQUEUE) {
 734                         /*
 735                          * if we add some inteligence about what we are doing,
 736                          * we should be able to support events on ourselves.
 737                          * We need to know when we are doing this to prevent
 738                          * getting both the knlist lock and the kq lock since
 739                          * they are the same thing.
 740                          */
 741                         if (fp->f_data == kq) {
 742                                 FILEDESC_UNLOCK(fdp);
 743                                 error = EINVAL;
 744                                 goto done_noglobal;
 745                         }
 746
 747                         KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
 748                 }
 749
 750                 FILEDESC_UNLOCK(fdp);
 751                 KQ_LOCK(kq);
 752                 if (kev->ident < kq->kq_knlistsize) {
 753                         SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
 754                                 if (kev->filter == kn->kn_filter)
 755                                         break;
 756                 }
 757         } else {
 758                 if ((kev->flags & EV_ADD) == EV_ADD)
 759                         kqueue_expand(kq, fops, kev->ident, waitok);
 760
 761                 KQ_LOCK(kq);
 762                 if (kq->kq_knhashmask != 0) {
 763                         struct klist *list;
 764
 765                         list = &kq->kq_knhash[
 766                             KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
 767                         SLIST_FOREACH(kn, list, kn_link)
 768                                 if (kev->ident == kn->kn_id &&
 769                                     kev->filter == kn->kn_filter)
 770                                         break;
 771                 }
 772         }
 773
 774         /* knote is in the process of changing, wait for it to stablize. */
 775         if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
 776                 if (fp != NULL) {
 777                         fdrop(fp, td);
 778                         fp = NULL;
 779                 }
 780                 KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 781                 kq->kq_state |= KQ_FLUXWAIT;
 782                 msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
 783                 goto findkn;
 784         }
 785
 786         if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
 787                 KQ_UNLOCK(kq);
 788                 error = ENOENT;
 789                 goto done;
 790         }
 791
 792         /*
 793          * kn now contains the matching knote, or NULL if no match
 794          */
 795         if (kev->flags & EV_ADD) {
 796                 if (kn == NULL) {
 797                         kn = tkn;
 798                         tkn = NULL;
 799                         if (kn == NULL) {
 800                                 error = ENOMEM;
 801                                 goto done;
 802                         }
 803                         kn->kn_fp = fp;
 804                         kn->kn_kq = kq;
 805                         kn->kn_fop = fops;
 806                         /*
 807                          * apply reference counts to knote structure, and
 808                          * do not release it at the end of this routine.
 809                          */
 810                         fops = NULL;
 811                         fp = NULL;
 812
 813                         kn->kn_sfflags = kev->fflags;
 814                         kn->kn_sdata = kev->data;
 815                         kev->fflags = 0;
 816                         kev->data = 0;
 817                         kn->kn_kevent = *kev;
 818                         kn->kn_status = KN_INFLUX|KN_DETACHED;
 819
 820                         error = knote_attach(kn, kq);
 821                         KQ_UNLOCK(kq);
 822                         if (error != 0) {
 823                                 tkn = kn;
 824                                 goto done;
 825                         }
 826
 827                         if ((error = kn->kn_fop->f_attach(kn)) != 0) {
 828                                 knote_drop(kn, td);
 829                                 goto done;
 830                         }
 831                         KN_LIST_LOCK(kn);
 832                 } else {
 833                         /*
 834                          * The user may change some filter values after the
 835                          * initial EV_ADD, but doing so will not reset any
 836                          * filter which has already been triggered.
 837                          */
 838                         kn->kn_status |= KN_INFLUX;
 839                         KQ_UNLOCK(kq);
 840                         KN_LIST_LOCK(kn);
 841                         kn->kn_sfflags = kev->fflags;
 842                         kn->kn_sdata = kev->data;
 843                         kn->kn_kevent.udata = kev->udata;
 844                 }
 845
 846                 /*
 847                  * We can get here with kn->kn_knlist == NULL.
 848                  * This can happen when the initial attach event decides that
 849                  * the event is "completed" already.  i.e. filt_procattach
 850                  * is called on a zombie process.  It will call filt_proc
 851                  * which will remove it from the list, and NULL kn_knlist.
 852                  */
 853                 event = kn->kn_fop->f_event(kn, 0);
 854                 KN_LIST_UNLOCK(kn);
 855                 KQ_LOCK(kq);
 856                 if (event)
 857                         KNOTE_ACTIVATE(kn, 1);
 858                 kn->kn_status &= ~KN_INFLUX;
 859         } else if (kev->flags & EV_DELETE) {
 860                 kn->kn_status |= KN_INFLUX;
 861                 KQ_UNLOCK(kq);
 862                 if (!(kn->kn_status & KN_DETACHED))
 863                         kn->kn_fop->f_detach(kn);
 864                 knote_drop(kn, td);
 865                 goto done;
 866         }
 867
 868         if ((kev->flags & EV_DISABLE) &&
 869             ((kn->kn_status & KN_DISABLED) == 0)) {
 870                 kn->kn_status |= KN_DISABLED;
 871         }
 872
 873         if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
 874                 kn->kn_status &= ~KN_DISABLED;
 875                 if ((kn->kn_status & KN_ACTIVE) &&
 876                     ((kn->kn_status & KN_QUEUED) == 0))
 877                         knote_enqueue(kn);
 878         }
 879         KQ_UNLOCK_FLUX(kq);
 880
 881 done:
 882         KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 883 done_noglobal:
 884         if (fp != NULL)
 885                 fdrop(fp, td);
 886         if (tkn != NULL)
 887                 knote_free(tkn);
 888         if (fops != NULL)
 889                 kqueue_fo_release(filt);
 890         return (error);
 891 }
 892
 893 static int
 894 kqueue_aquire(struct file *fp, struct kqueue **kqp)
 895 {
 896         int error;
 897         struct kqueue *kq;
 898
 899         error = 0;
 900
 901         FILE_LOCK(fp);
 902         do {
 903                 kq = fp->f_data;
 904                 if (fp->f_type != DTYPE_KQUEUE || kq == NULL) {
 905                         error = EBADF;
 906                         break;
 907                 }
 908                 *kqp = kq;
 909                 KQ_LOCK(kq);
 910                 if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
 911                         KQ_UNLOCK(kq);
 912                         error = EBADF;
 913                         break;
 914                 }
 915                 kq->kq_refcnt++;
 916                 KQ_UNLOCK(kq);
 917         } while (0);
 918         FILE_UNLOCK(fp);
 919
 920         return error;
 921 }
 922
 923 static void
 924 kqueue_release(struct kqueue *kq, int locked)
 925 {
 926         if (locked)
 927                 KQ_OWNED(kq);
 928         else
 929                 KQ_LOCK(kq);
 930         kq->kq_refcnt--;
 931         if (kq->kq_refcnt == 1)
 932                 wakeup(&kq->kq_refcnt);
 933         if (!locked)
 934                 KQ_UNLOCK(kq);
 935 }
 936
 937 static void
 938 kqueue_schedtask(struct kqueue *kq)
 939 {
 940
 941         KQ_OWNED(kq);
 942         KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
 943             ("scheduling kqueue task while draining"));
 944
 945         if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
 946                 taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task);
 947                 kq->kq_state |= KQ_TASKSCHED;
 948         }
 949 }
 950
 951 /*
 952  * Expand the kq to make sure we have storage for fops/ident pair.
 953  *
 954  * Return 0 on success (or no work necessary), return errno on failure.
 955  *
 956  * Not calling hashinit w/ waitok (proper malloc flag) should be safe.
 957  * If kqueue_register is called from a non-fd context, there usually/should
 958  * be no locks held.
 959  */
 960 static int
 961 kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
 962         int waitok)
 963 {
 964         struct klist *list, *tmp_knhash;
 965         u_long tmp_knhashmask;
 966         int size;
 967         int fd;
 968         int mflag = waitok ? M_WAITOK : M_NOWAIT;
 969
 970         KQ_NOTOWNED(kq);
 971
 972         if (fops->f_isfd) {
 973                 fd = ident;
 974                 if (kq->kq_knlistsize <= fd) {
 975                         size = kq->kq_knlistsize;
 976                         while (size <= fd)
 977                                 size += KQEXTENT;
 978                         MALLOC(list, struct klist *,
 979                             size * sizeof list, M_KQUEUE, mflag);
 980                         if (list == NULL)
 981                                 return ENOMEM;
 982                         KQ_LOCK(kq);
 983                         if (kq->kq_knlistsize > fd) {
 984                                 FREE(list, M_KQUEUE);
 985                                 list = NULL;
 986                         } else {
 987                                 if (kq->kq_knlist != NULL) {
 988                                         bcopy(kq->kq_knlist, list,
 989                                             kq->kq_knlistsize * sizeof list);
 990                                         FREE(kq->kq_knlist, M_KQUEUE);
 991                                         kq->kq_knlist = NULL;
 992                                 }
 993                                 bzero((caddr_t)list +
 994                                     kq->kq_knlistsize * sizeof list,
 995                                     (size - kq->kq_knlistsize) * sizeof list);
 996                                 kq->kq_knlistsize = size;
 997                                 kq->kq_knlist = list;
 998                         }
 999                         KQ_UNLOCK(kq);
1000                 }
1001         } else {
1002                 if (kq->kq_knhashmask == 0) {
1003                         tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
1004                             &tmp_knhashmask);
1005                         if (tmp_knhash == NULL)
1006                                 return ENOMEM;
1007                         KQ_LOCK(kq);
1008                         if (kq->kq_knhashmask == 0) {
1009                                 kq->kq_knhash = tmp_knhash;
1010                                 kq->kq_knhashmask = tmp_knhashmask;
1011                         } else {
1012                                 free(tmp_knhash, M_KQUEUE);
1013                         }
1014                         KQ_UNLOCK(kq);
1015                 }
1016         }
1017
1018         KQ_NOTOWNED(kq);
1019         return 0;
1020 }
1021
1022 static void
1023 kqueue_task(void *arg, int pending)
1024 {
1025         struct kqueue *kq;
1026         int haskqglobal;
1027
1028         haskqglobal = 0;
1029         kq = arg;
1030
1031         KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
1032         KQ_LOCK(kq);
1033
1034         KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
1035
1036         kq->kq_state &= ~KQ_TASKSCHED;
1037         if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
1038                 wakeup(&kq->kq_state);
1039         }
1040         KQ_UNLOCK(kq);
1041         KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
1042 }
1043
/*
 * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
 * We treat KN_MARKER knotes as if they are INFLUX.
 *
 * kq        - kqueue to scan
 * maxevents - maximum number of events to deliver; 0 returns immediately
 * ulistp    - user-space destination, written with copyout()
 * tsp       - optional timeout; NULL selects the "timeout = 0" path below,
 *             an all-zero timespec means "do not sleep"
 * keva      - kernel staging buffer; it is flushed every KQ_NEVENTS
 *             entries, so it is presumably at least that large (confirm
 *             against the caller)
 * td        - calling thread; td_retval[0] receives the delivered count
 *
 * Returns 0 or an errno value (EINVAL, ENOMEM, EINTR, or whatever
 * copyout() reported).
 */
static int
kqueue_scan(struct kqueue *kq, int maxevents, struct kevent *ulistp,
        const struct timespec *tsp, struct kevent *keva, struct thread *td)
{
        struct kevent *kevp;
        struct timeval atv, rtv, ttv;
        struct knote *kn, *marker;
        int count, timeout, nkev, error;
        int haskqglobal;

        count = maxevents;
        nkev = 0;
        error = 0;
        haskqglobal = 0;

        if (maxevents == 0)
                goto done_nl;

        if (tsp != NULL) {
                TIMESPEC_TO_TIMEVAL(&atv, tsp);
                if (itimerfix(&atv)) {
                        error = EINVAL;
                        goto done_nl;
                }
                /*
                 * A zero timespec is encoded as timeout == -1, which the
                 * empty-queue path below turns into EWOULDBLOCK instead
                 * of sleeping.  Anything longer than 24 hours is clamped
                 * to 24 hours worth of ticks.
                 */
                if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
                        timeout = -1;
                else
                        timeout = atv.tv_sec > 24 * 60 * 60 ?
                            24 * 60 * 60 * hz : tvtohz(&atv);
                /* Turn atv into an absolute uptime deadline. */
                getmicrouptime(&rtv);
                timevaladd(&atv, &rtv);
        } else {
                /*
                 * No deadline: atv stays zero so "retry" never recomputes
                 * the timeout, and msleep() is handed 0 (presumably "no
                 * timeout" -- confirm against msleep(9)).
                 */
                atv.tv_sec = 0;
                atv.tv_usec = 0;
                timeout = 0;
        }
        /* The marker bounds one pass over the queue. */
        marker = knote_alloc(1);
        if (marker == NULL) {
                error = ENOMEM;
                goto done_nl;
        }
        marker->kn_status = KN_MARKER;
        KQ_LOCK(kq);
        goto start;

retry:
        /* With a deadline set, give up once it passed, else re-arm. */
        if (atv.tv_sec || atv.tv_usec) {
                getmicrouptime(&rtv);
                if (timevalcmp(&rtv, &atv, >=))
                        goto done;
                ttv = atv;
                timevalsub(&ttv, &rtv);
                timeout = ttv.tv_sec > 24 * 60 * 60 ?
                        24 * 60 * 60 * hz : tvtohz(&ttv);
        }

start:
        kevp = keva;
        if (kq->kq_count == 0) {
                if (timeout < 0) {
                        error = EWOULDBLOCK;
                } else {
                        KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
                        kq->kq_state |= KQ_SLEEP;
                        error = msleep(kq, &kq->kq_lock, PSOCK | PCATCH,
                            "kqread", timeout);
                }
                if (error == 0)
                        goto retry;
                /* don't restart after signals... */
                if (error == ERESTART)
                        error = EINTR;
                else if (error == EWOULDBLOCK)
                        error = 0;
                goto done;
        }

        TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
        while (count) {
                KQ_OWNED(kq);
                kn = TAILQ_FIRST(&kq->kq_head);

                /*
                 * Another scanner's marker, or a knote somebody else is
                 * working on: sleep until woken on kq, then look at the
                 * head of the queue again.
                 */
                if ((kn->kn_status == KN_MARKER && kn != marker) ||
                    (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
                        KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
                        kq->kq_state |= KQ_FLUXWAIT;
                        error = msleep(kq, &kq->kq_lock, PSOCK,
                            "kqflxwt", 0);
                        continue;
                }

                TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
                /* Disabled knotes are dequeued and not reported. */
                if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
                        kn->kn_status &= ~KN_QUEUED;
                        kq->kq_count--;
                        continue;
                }
                /*
                 * Reached our own marker: the pass is complete.  If no
                 * event was collected, go back and wait again.
                 */
                if (kn == marker) {
                        KQ_FLUX_WAKEUP(kq);
                        if (count == maxevents)
                                goto retry;
                        goto done;
                }
                KASSERT((kn->kn_status & KN_INFLUX) == 0,
                    ("KN_INFLUX set when not suppose to be"));

                if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
                        /* One-shot: report it, then detach and drop it. */
                        kn->kn_status &= ~KN_QUEUED;
                        kn->kn_status |= KN_INFLUX;
                        kq->kq_count--;
                        KQ_UNLOCK(kq);
                        /*
                         * We don't need to lock the list since we've marked
                         * it _INFLUX.
                         */
                        *kevp = kn->kn_kevent;
                        if (!(kn->kn_status & KN_DETACHED))
                                kn->kn_fop->f_detach(kn);
                        knote_drop(kn, td);
                        KQ_LOCK(kq);
                        kn = NULL;
                } else {
                        kn->kn_status |= KN_INFLUX;
                        KQ_UNLOCK(kq);
                        if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
                                KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
                        KN_LIST_LOCK(kn);
                        /*
                         * Re-evaluate the filter; if it reports nothing,
                         * deactivate the knote without reporting it.
                         */
                        if (kn->kn_fop->f_event(kn, 0) == 0) {
                                KN_LIST_UNLOCK(kn);
                                KQ_LOCK(kq);
                                kn->kn_status &=
                                    ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX);
                                kq->kq_count--;
                                continue;
                        }
                        *kevp = kn->kn_kevent;
                        KQ_LOCK(kq);
                        /*
                         * EV_CLEAR knotes have their state reset and leave
                         * the queue; all others go back on the tail.
                         */
                        if (kn->kn_flags & EV_CLEAR) {
                                kn->kn_data = 0;
                                kn->kn_fflags = 0;
                                kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
                                kq->kq_count--;
                        } else
                                TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
                        KN_LIST_UNLOCK(kn);
                        kn->kn_status &= ~(KN_INFLUX);
                }

                /* we are returning a copy to the user */
                kevp++;
                nkev++;
                count--;

                /* Staging buffer full: flush it with the kq unlocked. */
                if (nkev == KQ_NEVENTS) {
                        KQ_UNLOCK_FLUX(kq);
                        error = copyout(keva, ulistp, sizeof *keva * nkev);
                        ulistp += nkev;
                        nkev = 0;
                        kevp = keva;
                        KQ_LOCK(kq);
                        if (error)
                                break;
                }
        }
        /* Loop ended without meeting the marker, so unlink it here. */
        TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
done:
        KQ_OWNED(kq);
        KQ_UNLOCK_FLUX(kq);
        KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
        knote_free(marker);
done_nl:
        KQ_NOTOWNED(kq);
        /* Flush whatever is still staged; this overwrites 'error'. */
        if (nkev != 0)
                error = copyout(keva, ulistp, sizeof *keva * nkev);
        td->td_retval[0] = maxevents - count;
        return (error);
}
1225
/*
 * XXX
 * read(2) is not supported on a kqueue descriptor; every call fails with
 * ENXIO.  This could be expanded to call kqueue_scan, if desired.
 */
/*ARGSUSED*/
static int
kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
        int flags, struct thread *td)
{
        const int unsupported = ENXIO;

        return (unsupported);
}
1237
/*
 * write(2) is not supported on a kqueue descriptor; every call fails
 * with ENXIO.
 */
/*ARGSUSED*/
static int
kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
         int flags, struct thread *td)
{
        const int unsupported = ENXIO;

        return (unsupported);
}
1245
1246 /*ARGSUSED*/
1247 static int
1248 kqueue_ioctl(struct file *fp, u_long cmd, void *data,
1249         struct ucred *active_cred, struct thread *td)
1250 {
1251         /*
1252          * Enabling sigio causes two major problems:
1253          * 1) infinite recursion:
1254          * Synopsys: kevent is being used to track signals and have FIOASYNC
1255          * set.  On receipt of a signal this will cause a kqueue to recurse
1256          * into itself over and over.  Sending the sigio causes the kqueue
1257          * to become ready, which in turn posts sigio again, forever.
1258          * Solution: this can be solved by setting a flag in the kqueue that
1259          * we have a SIGIO in progress.
1260          * 2) locking problems:
1261          * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts
1262          * us above the proc and pgrp locks.
1263          * Solution: Post a signal using an async mechanism, being sure to
1264          * record a generation count in the delivery so that we do not deliver
1265          * a signal to the wrong process.
1266          *
1267          * Note, these two mechanisms are somewhat mutually exclusive!
1268          */
1269 #if 0
1270         struct kqueue *kq;
1271
1272         kq = fp->f_data;
1273         switch (cmd) {
1274         case FIOASYNC:
1275                 if (*(int *)data) {
1276                         kq->kq_state |= KQ_ASYNC;
1277                 } else {
1278                         kq->kq_state &= ~KQ_ASYNC;
1279                 }
1280                 return (0);
1281
1282         case FIOSETOWN:
1283                 return (fsetown(*(int *)data, &kq->kq_sigio));
1284
1285         case FIOGETOWN:
1286                 *(int *)data = fgetown(&kq->kq_sigio);
1287                 return (0);
1288         }
1289 #endif
1290
1291         return (ENOTTY);
1292 }
1293
1294 /*ARGSUSED*/
1295 static int
1296 kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
1297         struct thread *td)
1298 {
1299         struct kqueue *kq;
1300         int revents = 0;
1301         int error;
1302
1303         if ((error = kqueue_aquire(fp, &kq)))
1304                 return POLLERR;
1305
1306         KQ_LOCK(kq);
1307         if (events & (POLLIN | POLLRDNORM)) {
1308                 if (kq->kq_count) {
1309                         revents |= events & (POLLIN | POLLRDNORM);
1310                 } else {
1311                         selrecord(td, &kq->kq_sel);
1312                         kq->kq_state |= KQ_SEL;
1313                 }
1314         }
1315         kqueue_release(kq, 1);
1316         KQ_UNLOCK(kq);
1317         return (revents);
1318 }
1319
/*
 * fstat(2) is not supported on a kqueue descriptor; every call fails
 * with ENXIO and *st is left untouched.
 */
/*ARGSUSED*/
static int
kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
        struct thread *td)
{
        const int unsupported = ENXIO;

        return (unsupported);
}
1328
1329 /*ARGSUSED*/
1330 static int
1331 kqueue_close(struct file *fp, struct thread *td)
1332 {
1333         struct kqueue *kq = fp->f_data;
1334         struct filedesc *fdp;
1335         struct knote *kn;
1336         int i;
1337         int error;
1338
1339         if ((error = kqueue_aquire(fp, &kq)))
1340                 return error;
1341
1342         KQ_LOCK(kq);
1343
1344         KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
1345             ("kqueue already closing"));
1346         kq->kq_state |= KQ_CLOSING;
1347         if (kq->kq_refcnt > 1)
1348                 msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
1349
1350         KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
1351         fdp = kq->kq_fdp;
1352
1353         KASSERT(knlist_empty(&kq->kq_sel.si_note),
1354             ("kqueue's knlist not empty"));
1355
1356         for (i = 0; i < kq->kq_knlistsize; i++) {
1357                 while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
1358                         KASSERT((kn->kn_status & KN_INFLUX) == 0,
1359                             ("KN_INFLUX set when not suppose to be"));
1360                         kn->kn_status |= KN_INFLUX;
1361                         KQ_UNLOCK(kq);
1362                         if (!(kn->kn_status & KN_DETACHED))
1363                                 kn->kn_fop->f_detach(kn);
1364                         knote_drop(kn, td);
1365                         KQ_LOCK(kq);
1366                 }
1367         }
1368         if (kq->kq_knhashmask != 0) {
1369                 for (i = 0; i <= kq->kq_knhashmask; i++) {
1370                         while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
1371                                 KASSERT((kn->kn_status & KN_INFLUX) == 0,
1372                                     ("KN_INFLUX set when not suppose to be"));
1373                                 kn->kn_status |= KN_INFLUX;
1374                                 KQ_UNLOCK(kq);
1375                                 if (!(kn->kn_status & KN_DETACHED))
1376                                         kn->kn_fop->f_detach(kn);
1377                                 knote_drop(kn, td);
1378                                 KQ_LOCK(kq);
1379                         }
1380                 }
1381         }
1382
1383         if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
1384                 kq->kq_state |= KQ_TASKDRAIN;
1385                 msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
1386         }
1387
1388         if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
1389                 kq->kq_state &= ~KQ_SEL;
1390                 selwakeuppri(&kq->kq_sel, PSOCK);
1391         }
1392
1393         KQ_UNLOCK(kq);
1394
1395         FILEDESC_LOCK_FAST(fdp);
1396         SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list);
1397         FILEDESC_UNLOCK_FAST(fdp);
1398
1399         knlist_destroy(&kq->kq_sel.si_note);
1400         mtx_destroy(&kq->kq_lock);
1401         kq->kq_fdp = NULL;
1402
1403         if (kq->kq_knhash != NULL)
1404                 free(kq->kq_knhash, M_KQUEUE);
1405         if (kq->kq_knlist != NULL)
1406                 free(kq->kq_knlist, M_KQUEUE);
1407
1408         funsetown(&kq->kq_sigio);
1409         free(kq, M_KQUEUE);
1410         fp->f_data = NULL;
1411
1412         return (0);
1413 }
1414
1415 static void
1416 kqueue_wakeup(struct kqueue *kq)
1417 {
1418         KQ_OWNED(kq);
1419
1420         if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
1421                 kq->kq_state &= ~KQ_SLEEP;
1422                 wakeup(kq);
1423         }
1424         if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
1425                 kq->kq_state &= ~KQ_SEL;
1426                 selwakeuppri(&kq->kq_sel, PSOCK);
1427         }
1428         if (!knlist_empty(&kq->kq_sel.si_note))
1429                 kqueue_schedtask(kq);
1430         if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
1431                 pgsigio(&kq->kq_sigio, SIGIO, 0);
1432         }
1433 }
1434
1435 /*
1436  * Walk down a list of knotes, activating them if their event has triggered.
1437  *
1438  * There is a possibility to optimize in the case of one kq watching another.
1439  * Instead of scheduling a task to wake it up, you could pass enough state
1440  * down the chain to make up the parent kqueue.  Make this code functional
1441  * first.
1442  */
1443 void
1444 knote(struct knlist *list, long hint, int islocked)
1445 {
1446         struct kqueue *kq;
1447         struct knote *kn;
1448
1449         if (list == NULL)
1450                 return;
1451
1452         mtx_assert(list->kl_lock, islocked ? MA_OWNED : MA_NOTOWNED);
1453         if (!islocked)
1454                 mtx_lock(list->kl_lock);
1455         /*
1456          * If we unlock the list lock (and set KN_INFLUX), we can eliminate
1457          * the kqueue scheduling, but this will introduce four
1458          * lock/unlock's for each knote to test.  If we do, continue to use
1459          * SLIST_FOREACH, SLIST_FOREACH_SAFE is not safe in our case, it is
1460          * only safe if you want to remove the current item, which we are
1461          * not doing.
1462          */
1463         SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
1464                 kq = kn->kn_kq;
1465                 if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
1466                         KQ_LOCK(kq);
1467                         if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
1468                                 kn->kn_status |= KN_HASKQLOCK;
1469                                 if (kn->kn_fop->f_event(kn, hint))
1470                                         KNOTE_ACTIVATE(kn, 1);
1471                                 kn->kn_status &= ~KN_HASKQLOCK;
1472                         }
1473                         KQ_UNLOCK(kq);
1474                 }
1475                 kq = NULL;
1476         }
1477         if (!islocked)
1478                 mtx_unlock(list->kl_lock);
1479 }
1480
1481 /*
1482  * add a knote to a knlist
1483  */
1484 void
1485 knlist_add(struct knlist *knl, struct knote *kn, int islocked)
1486 {
1487         mtx_assert(knl->kl_lock, islocked ? MA_OWNED : MA_NOTOWNED);
1488         KQ_NOTOWNED(kn->kn_kq);
1489         KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
1490             (KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
1491         if (!islocked)
1492                 mtx_lock(knl->kl_lock);
1493         SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
1494         if (!islocked)
1495                 mtx_unlock(knl->kl_lock);
1496         KQ_LOCK(kn->kn_kq);
1497         kn->kn_knlist = knl;
1498         kn->kn_status &= ~KN_DETACHED;
1499         KQ_UNLOCK(kn->kn_kq);
1500 }
1501
1502 static void
1503 knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked)
1504 {
1505         KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
1506         mtx_assert(knl->kl_lock, knlislocked ? MA_OWNED : MA_NOTOWNED);
1507         mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
1508         if (!kqislocked)
1509                 KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
1510     ("knlist_remove called w/o knote being KN_INFLUX or already removed"));
1511         if (!knlislocked)
1512                 mtx_lock(knl->kl_lock);
1513         SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
1514         kn->kn_knlist = NULL;
1515         if (!knlislocked)
1516                 mtx_unlock(knl->kl_lock);
1517         if (!kqislocked)
1518                 KQ_LOCK(kn->kn_kq);
1519         kn->kn_status |= KN_DETACHED;
1520         if (!kqislocked)
1521                 KQ_UNLOCK(kn->kn_kq);
1522 }
1523
/*
 * Remove a single knote from the specified knlist.  'islocked' says
 * whether the caller holds the list lock; the kq lock must not be held.
 */
void
knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
{
        const int kq_unlocked = 0;

        knlist_remove_kq(knl, kn, islocked, kq_unlocked);
}
1533
1534 /*
1535  * remove knote from a specified klist while in f_event handler.
1536  */
1537 void
1538 knlist_remove_inevent(struct knlist *knl, struct knote *kn)
1539 {
1540
1541         knlist_remove_kq(knl, kn, 1,
1542             (kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK);
1543 }
1544
1545 int
1546 knlist_empty(struct knlist *knl)
1547 {
1548
1549         mtx_assert(knl->kl_lock, MA_OWNED);
1550         return SLIST_EMPTY(&knl->kl_list);
1551 }
1552
/*
 * Fallback mutex: knlist_init() installs it as the list lock whenever the
 * caller does not supply a mutex of its own.
 */
static struct mtx       knlist_lock;
MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
        MTX_DEF);
1556
1557 void
1558 knlist_init(struct knlist *knl, struct mtx *mtx)
1559 {
1560
1561         if (mtx == NULL)
1562                 knl->kl_lock = &knlist_lock;
1563         else
1564                 knl->kl_lock = mtx;
1565
1566         SLIST_INIT(&knl->kl_list);
1567 }
1568
1569 void
1570 knlist_destroy(struct knlist *knl)
1571 {
1572
1573 #ifdef INVARIANTS
1574         /*
1575          * if we run across this error, we need to find the offending
1576          * driver and have it call knlist_clear.
1577          */
1578         if (!SLIST_EMPTY(&knl->kl_list))
1579                 printf("WARNING: destroying knlist w/ knotes on it!\n");
1580 #endif
1581
1582         knl->kl_lock = NULL;
1583         SLIST_INIT(&knl->kl_list);
1584 }
1585
1586 /*
1587  * Even if we are locked, we may need to drop the lock to allow any influx
1588  * knotes time to "settle".
1589  */
1590 void
1591 knlist_clear(struct knlist *knl, int islocked)
1592 {
1593         struct knote *kn;
1594         struct kqueue *kq;
1595
1596         if (islocked)
1597                 mtx_assert(knl->kl_lock, MA_OWNED);
1598         else {
1599                 mtx_assert(knl->kl_lock, MA_NOTOWNED);
1600 again:          /* need to reaquire lock since we have dropped it */
1601                 mtx_lock(knl->kl_lock);
1602         }
1603
1604         SLIST_FOREACH(kn, &knl->kl_list, kn_selnext) {
1605                 kq = kn->kn_kq;
1606                 KQ_LOCK(kq);
1607                 if ((kn->kn_status & KN_INFLUX) &&
1608                     (kn->kn_status & KN_DETACHED) != KN_DETACHED) {
1609                         KQ_UNLOCK(kq);
1610                         continue;
1611                 }
1612                 /* Make sure cleared knotes disappear soon */
1613                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
1614                 knlist_remove_kq(knl, kn, 1, 1);
1615                 KQ_UNLOCK(kq);
1616                 kq = NULL;
1617         }
1618
1619         if (!SLIST_EMPTY(&knl->kl_list)) {
1620                 /* there are still KN_INFLUX remaining */
1621                 kn = SLIST_FIRST(&knl->kl_list);
1622                 kq = kn->kn_kq;
1623                 KQ_LOCK(kq);
1624                 KASSERT(kn->kn_status & KN_INFLUX,
1625                     ("knote removed w/o list lock"));
1626                 mtx_unlock(knl->kl_lock);
1627                 kq->kq_state |= KQ_FLUXWAIT;
1628                 msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
1629                 kq = NULL;
1630                 goto again;
1631         }
1632
1633         SLIST_INIT(&knl->kl_list);
1634
1635         if (islocked)
1636                 mtx_assert(knl->kl_lock, MA_OWNED);
1637         else {
1638                 mtx_unlock(knl->kl_lock);
1639                 mtx_assert(knl->kl_lock, MA_NOTOWNED);
1640         }
1641 }
1642
1643 /*
1644  * remove all knotes referencing a specified fd
1645  * must be called with FILEDESC lock.  This prevents a race where a new fd
1646  * comes along and occupies the entry and we attach a knote to the fd.
1647  */
1648 void
1649 knote_fdclose(struct thread *td, int fd)
1650 {
1651         struct filedesc *fdp = td->td_proc->p_fd;
1652         struct kqueue *kq;
1653         struct knote *kn;
1654         int influx;
1655
1656         FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
1657
1658         /*
1659          * We shouldn't have to worry about new kevents appearing on fd
1660          * since filedesc is locked.
1661          */
1662         SLIST_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
1663                 KQ_LOCK(kq);
1664
1665 again:
1666                 influx = 0;
1667                 while (kq->kq_knlistsize > fd &&
1668                     (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
1669                         if (kn->kn_status & KN_INFLUX) {
1670                                 /* someone else might be waiting on our knote */
1671                                 if (influx)
1672                                         wakeup(kq);
1673                                 kq->kq_state |= KQ_FLUXWAIT;
1674                                 msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
1675                                 goto again;
1676                         }
1677                         kn->kn_status |= KN_INFLUX;
1678                         KQ_UNLOCK(kq);
1679                         if (!(kn->kn_status & KN_DETACHED))
1680                                 kn->kn_fop->f_detach(kn);
1681                         knote_drop(kn, td);
1682                         influx = 1;
1683                         KQ_LOCK(kq);
1684                 }
1685                 KQ_UNLOCK_FLUX(kq);
1686         }
1687 }
1688
1689 static int
1690 knote_attach(struct knote *kn, struct kqueue *kq)
1691 {
1692         struct klist *head;
1693
1694         /* Link kn (already marked in flux) onto the list kq keeps for it. */
1695         KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
1696         KQ_OWNED(kq);
1697
1698         /* Refuse with ENOMEM when kq has no slot for this knote's ident. */
1699         if (kn->kn_fop->f_isfd && kn->kn_id >= kq->kq_knlistsize)
1700                 return ENOMEM;
1701         if (!kn->kn_fop->f_isfd && kq->kq_knhash == NULL)
1702                 return ENOMEM;
1703
1704         /* fd filters use the per-fd array; the rest go into the hash table. */
1705         head = kn->kn_fop->f_isfd ? &kq->kq_knlist[kn->kn_id] :
1706             &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
1707
1708         SLIST_INSERT_HEAD(head, kn, kn_link);
1709         return 0;
1710 }
1711
1712 /*
1713  * Unlink an already-detached (f_detach) knote from its kqueue and free it.
1714  * The kq lock is taken here, so the caller must not hold it; KN_INFLUX must
1715  * already be set, which is assumed to prevent other removal.
1716  */
1717 static void
1718 knote_drop(struct knote *kn, struct thread *td)
1719 {
1720         struct kqueue *kq;
1721         struct klist *list;
1722
1723         kq = kn->kn_kq;
1724
1725         KQ_NOTOWNED(kq);
1726         KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
1727             ("knote_drop called without KN_INFLUX set in kn_status"));
1728         /* Locate the list kn is linked on: same choice as in knote_attach(). */
1729         KQ_LOCK(kq);
1730         if (kn->kn_fop->f_isfd)
1731                 list = &kq->kq_knlist[kn->kn_id];
1732         else
1733                 list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
1734
1735         SLIST_REMOVE(list, kn, knote, kn_link);
1736         if (kn->kn_status & KN_QUEUED)  /* also pull it off kq_head */
1737                 knote_dequeue(kn);
1738         KQ_UNLOCK_FLUX(kq);
1739         /* kq lock released; now let go of what the knote still references. */
1740         if (kn->kn_fop->f_isfd) {
1741                 fdrop(kn->kn_fp, td);
1742                 kn->kn_fp = NULL;
1743         }
1744         kqueue_fo_release(kn->kn_kevent.filter);
1745         kn->kn_fop = NULL;
1746         knote_free(kn);
1747 }
1748
1749 static void
1750 knote_enqueue(struct knote *kn)
1751 {
1752         struct kqueue *owner = kn->kn_kq;
1753
1754         /* Append kn to its kqueue's kq_head queue, then kqueue_wakeup(). */
1755         KQ_OWNED(owner);
1756         KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
1757         TAILQ_INSERT_TAIL(&owner->kq_head, kn, kn_tqe);
1758         owner->kq_count += 1;
1759         kn->kn_status |= KN_QUEUED;
1760         kqueue_wakeup(owner);
1761 }
1762
1763 static void
1764 knote_dequeue(struct knote *kn)
1765 {
1766         struct kqueue *owner = kn->kn_kq;
1767
1768         /* Unlink kn from its kqueue's kq_head queue and clear KN_QUEUED. */
1769         KQ_OWNED(owner);
1770         KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
1771         TAILQ_REMOVE(&owner->kq_head, kn, kn_tqe);
1772         owner->kq_count -= 1;
1773         kn->kn_status &= ~KN_QUEUED;
1774 }
1775
1776 static void
1777 knote_init(void)
1778 {
1779         /* Set up the UMA zone that knote_alloc()/knote_free() draw on. */
1780         knote_zone = uma_zcreate("KNOTE", sizeof(struct knote),
1781             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1782 }
1783 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL)
1784
1785 static struct knote *
1786 knote_alloc(int waitok)
1787 {
1788         /* Always M_ZERO; M_WAITOK when waitok is nonzero, else M_NOWAIT. */
1789         return uma_zalloc(knote_zone, M_ZERO | (waitok ? M_WAITOK : M_NOWAIT));
1790 }
1791
1792 static void
1793 knote_free(struct knote *kn)
1794 {
1795         if (kn == NULL)         /* NULL is accepted and ignored */
1796                 return;
1797         uma_zfree(knote_zone, kn);
1798 }