sys/kern/kern_synch.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1982, 1986, 1990, 1991, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  * (c) UNIX System Laboratories, Inc.
   7  * All or some portions of this file are derived from material licensed
   8  * to the University of California by American Telephone and Telegraph
   9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  10  * the permission of UNIX System Laboratories, Inc.
  11  *
  12  * Redistribution and use in source and binary forms, with or without
  13  * modification, are permitted provided that the following conditions
  14  * are met:
  15  * 1. Redistributions of source code must retain the above copyright
  16  *    notice, this list of conditions and the following disclaimer.
  17  * 2. Redistributions in binary form must reproduce the above copyright
  18  *    notice, this list of conditions and the following disclaimer in the
  19  *    documentation and/or other materials provided with the distribution.
  20  * 3. Neither the name of the University nor the names of its contributors
  21  *    may be used to endorse or promote products derived from this software
  22  *    without specific prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  34  * SUCH DAMAGE.
  35  */
  36
  37 #include <sys/cdefs.h>
  38 #include "opt_ktrace.h"
  39 #include "opt_sched.h"
  40
  41 #include <sys/param.h>
  42 #include <sys/systm.h>
  43 #include <sys/blockcount.h>
  44 #include <sys/condvar.h>
  45 #include <sys/kdb.h>
  46 #include <sys/kernel.h>
  47 #include <sys/ktr.h>
  48 #include <sys/lock.h>
  49 #include <sys/mutex.h>
  50 #include <sys/proc.h>
  51 #include <sys/resourcevar.h>
  52 #include <sys/sched.h>
  53 #include <sys/sdt.h>
  54 #include <sys/signalvar.h>
  55 #include <sys/sleepqueue.h>
  56 #include <sys/smp.h>
  57 #include <sys/sx.h>
  58 #include <sys/sysctl.h>
  59 #include <sys/sysproto.h>
  60 #include <sys/vmmeter.h>
  61 #ifdef KTRACE
  62 #include <sys/uio.h>
  63 #include <sys/ktrace.h>
  64 #endif
  65 #ifdef EPOCH_TRACE
  66 #include <sys/epoch.h>
  67 #endif
  68
  69 #include <machine/cpu.h>
  70
  71 static void synch_setup(void *dummy);
  72 SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup,
  73     NULL);
  74
  75 int     hogticks;
  76 static const char pause_wchan[MAXCPU];
  77
  78 static struct callout loadav_callout;
  79
  80 struct loadavg averunnable =
  81         { {0, 0, 0}, FSCALE };  /* load average, of runnable procs */
  82 /*
  83  * Constants for averages over 1, 5, and 15 minutes
  84  * when sampling at 5 second intervals.
  85  */
  86 static uint64_t cexp[3] = {
  87         0.9200444146293232 * FSCALE,    /* exp(-1/12) */
  88         0.9834714538216174 * FSCALE,    /* exp(-1/60) */
  89         0.9944598480048967 * FSCALE,    /* exp(-1/180) */
  90 };
  91
  92 /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
  93 SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, FSCALE,
  94     "Fixed-point scale factor used for calculating load average values");
  95
  96 static void     loadav(void *arg);
  97
  98 SDT_PROVIDER_DECLARE(sched);
  99 SDT_PROBE_DEFINE(sched, , , preempt);
 100
 101 static void
 102 sleepinit(void *unused)
 103 {
 104
 105         hogticks = (hz / 10) * 2;       /* Default only. */
 106         init_sleepqueues();
 107 }
 108
 109 /*
 110  * vmem tries to lock the sleepq mutexes when free'ing kva, so make sure
 111  * it is available.
 112  */
 113 SYSINIT(sleepinit, SI_SUB_KMEM, SI_ORDER_ANY, sleepinit, NULL);
 114
 115 /*
 116  * General sleep call.  Suspends the current thread until a wakeup is
 117  * performed on the specified identifier.  The thread will then be made
 118  * runnable with the specified priority.  Sleeps at most sbt units of time
 119  * (0 means no timeout).  If pri includes the PCATCH flag, let signals
 120  * interrupt the sleep, otherwise ignore them while sleeping.  Returns 0 if
 121  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 122  * signal becomes pending, ERESTART is returned if the current system
 123  * call should be restarted if possible, and EINTR is returned if the system
 124  * call should be interrupted by the signal (return EINTR).
 125  *
 126  * The lock argument is unlocked before the caller is suspended, and
 127  * re-locked before _sleep() returns.  If priority includes the PDROP
 128  * flag the lock is not re-locked before returning.
 129  */
 130 int
 131 _sleep(const void *ident, struct lock_object *lock, int priority,
 132     const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
 133 {
 134         struct thread *td;
 135         struct lock_class *class;
 136         uintptr_t lock_state;
 137         int catch, pri, rval, sleepq_flags;
 138         WITNESS_SAVE_DECL(lock_witness);
 139
 140         TSENTER();
 141         td = curthread;
 142 #ifdef KTRACE
 143         if (KTRPOINT(td, KTR_CSW))
 144                 ktrcsw(1, 0, wmesg);
 145 #endif
 146         WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
 147             "Sleeping on \"%s\"", wmesg);
 148         KASSERT(sbt != 0 || mtx_owned(&Giant) || lock != NULL ||
 149             (priority & PNOLOCK) != 0,
 150             ("sleeping without a lock"));
 151         KASSERT(ident != NULL, ("_sleep: NULL ident"));
 152         KASSERT(TD_IS_RUNNING(td), ("_sleep: curthread not running"));
 153         if (priority & PDROP)
 154                 KASSERT(lock != NULL && lock != &Giant.lock_object,
 155                     ("PDROP requires a non-Giant lock"));
 156         if (lock != NULL)
 157                 class = LOCK_CLASS(lock);
 158         else
 159                 class = NULL;
 160
 161         if (SCHEDULER_STOPPED_TD(td)) {
 162                 if (lock != NULL && priority & PDROP)
 163                         class->lc_unlock(lock);
 164                 return (0);
 165         }
 166         catch = priority & PCATCH;
 167         pri = priority & PRIMASK;
 168
 169         KASSERT(!TD_ON_SLEEPQ(td), ("recursive sleep"));
 170
 171         if ((uintptr_t)ident >= (uintptr_t)&pause_wchan[0] &&
 172             (uintptr_t)ident <= (uintptr_t)&pause_wchan[MAXCPU - 1])
 173                 sleepq_flags = SLEEPQ_PAUSE;
 174         else
 175                 sleepq_flags = SLEEPQ_SLEEP;
 176         if (catch)
 177                 sleepq_flags |= SLEEPQ_INTERRUPTIBLE;
 178
 179         sleepq_lock(ident);
 180         CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)",
 181             td->td_tid, td->td_proc->p_pid, td->td_name, wmesg, ident);
 182
 183         if (lock == &Giant.lock_object)
 184                 mtx_assert(&Giant, MA_OWNED);
 185         DROP_GIANT();
 186         if (lock != NULL && lock != &Giant.lock_object &&
 187             !(class->lc_flags & LC_SLEEPABLE)) {
 188                 KASSERT(!(class->lc_flags & LC_SPINLOCK),
 189                     ("spin locks can only use msleep_spin"));
 190                 WITNESS_SAVE(lock, lock_witness);
 191                 lock_state = class->lc_unlock(lock);
 192         } else
 193                 /* GCC needs to follow the Yellow Brick Road */
 194                 lock_state = -1;
 195
 196         /*
 197          * We put ourselves on the sleep queue and start our timeout
 198          * before calling thread_suspend_check, as we could stop there,
 199          * and a wakeup or a SIGCONT (or both) could occur while we were
 200          * stopped without resuming us.  Thus, we must be ready for sleep
 201          * when cursig() is called.  If the wakeup happens while we're
 202          * stopped, then td will no longer be on a sleep queue upon
 203          * return from cursig().
 204          */
 205         sleepq_add(ident, lock, wmesg, sleepq_flags, 0);
 206         if (sbt != 0)
 207                 sleepq_set_timeout_sbt(ident, sbt, pr, flags);
 208         if (lock != NULL && class->lc_flags & LC_SLEEPABLE) {
 209                 sleepq_release(ident);
 210                 WITNESS_SAVE(lock, lock_witness);
 211                 lock_state = class->lc_unlock(lock);
 212                 sleepq_lock(ident);
 213         }
 214         if (sbt != 0 && catch)
 215                 rval = sleepq_timedwait_sig(ident, pri);
 216         else if (sbt != 0)
 217                 rval = sleepq_timedwait(ident, pri);
 218         else if (catch)
 219                 rval = sleepq_wait_sig(ident, pri);
 220         else {
 221                 sleepq_wait(ident, pri);
 222                 rval = 0;
 223         }
 224 #ifdef KTRACE
 225         if (KTRPOINT(td, KTR_CSW))
 226                 ktrcsw(0, 0, wmesg);
 227 #endif
 228         PICKUP_GIANT();
 229         if (lock != NULL && lock != &Giant.lock_object && !(priority & PDROP)) {
 230                 class->lc_lock(lock, lock_state);
 231                 WITNESS_RESTORE(lock, lock_witness);
 232         }
 233         TSEXIT();
 234         return (rval);
 235 }
 236
 237 int
 238 msleep_spin_sbt(const void *ident, struct mtx *mtx, const char *wmesg,
 239     sbintime_t sbt, sbintime_t pr, int flags)
 240 {
 241         struct thread *td;
 242         int rval;
 243         WITNESS_SAVE_DECL(mtx);
 244
 245         td = curthread;
 246         KASSERT(mtx != NULL, ("sleeping without a mutex"));
 247         KASSERT(ident != NULL, ("msleep_spin_sbt: NULL ident"));
 248         KASSERT(TD_IS_RUNNING(td), ("msleep_spin_sbt: curthread not running"));
 249
 250         if (SCHEDULER_STOPPED_TD(td))
 251                 return (0);
 252
 253         sleepq_lock(ident);
 254         CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)",
 255             td->td_tid, td->td_proc->p_pid, td->td_name, wmesg, ident);
 256
 257         DROP_GIANT();
 258         mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
 259         WITNESS_SAVE(&mtx->lock_object, mtx);
 260         mtx_unlock_spin(mtx);
 261
 262         /*
 263          * We put ourselves on the sleep queue and start our timeout.
 264          */
 265         sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0);
 266         if (sbt != 0)
 267                 sleepq_set_timeout_sbt(ident, sbt, pr, flags);
 268
 269         /*
 270          * Can't call ktrace with any spin locks held so it can lock the
 271          * ktrace_mtx lock, and WITNESS_WARN considers it an error to hold
 272          * any spin lock.  Thus, we have to drop the sleepq spin lock while
 273          * we handle those requests.  This is safe since we have placed our
 274          * thread on the sleep queue already.
 275          */
 276 #ifdef KTRACE
 277         if (KTRPOINT(td, KTR_CSW)) {
 278                 sleepq_release(ident);
 279                 ktrcsw(1, 0, wmesg);
 280                 sleepq_lock(ident);
 281         }
 282 #endif
 283 #ifdef WITNESS
 284         sleepq_release(ident);
 285         WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"",
 286             wmesg);
 287         sleepq_lock(ident);
 288 #endif
 289         if (sbt != 0)
 290                 rval = sleepq_timedwait(ident, 0);
 291         else {
 292                 sleepq_wait(ident, 0);
 293                 rval = 0;
 294         }
 295 #ifdef KTRACE
 296         if (KTRPOINT(td, KTR_CSW))
 297                 ktrcsw(0, 0, wmesg);
 298 #endif
 299         PICKUP_GIANT();
 300         mtx_lock_spin(mtx);
 301         WITNESS_RESTORE(&mtx->lock_object, mtx);
 302         return (rval);
 303 }
 304
 305 /*
 306  * pause_sbt() delays the calling thread by the given signed binary
 307  * time. During cold bootup, pause_sbt() uses the DELAY() function
 308  * instead of the _sleep() function to do the waiting. The "sbt"
 309  * argument must be greater than or equal to zero. A "sbt" value of
 310  * zero is equivalent to a "sbt" value of one tick.
 311  */
 312 int
 313 pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags)
 314 {
 315         KASSERT(sbt >= 0, ("pause_sbt: timeout must be >= 0"));
 316
 317         /* silently convert invalid timeouts */
 318         if (sbt == 0)
 319                 sbt = tick_sbt;
 320
 321         if ((cold && curthread == &thread0) || kdb_active ||
 322             SCHEDULER_STOPPED()) {
 323                 /*
 324                  * We delay one second at a time to avoid overflowing the
 325                  * system specific DELAY() function(s):
 326                  */
 327                 while (sbt >= SBT_1S) {
 328                         DELAY(1000000);
 329                         sbt -= SBT_1S;
 330                 }
 331                 /* Do the delay remainder, if any */
 332                 sbt = howmany(sbt, SBT_1US);
 333                 if (sbt > 0)
 334                         DELAY(sbt);
 335                 return (EWOULDBLOCK);
 336         }
 337         return (_sleep(&pause_wchan[curcpu], NULL,
 338             (flags & C_CATCH) ? PCATCH : 0, wmesg, sbt, pr, flags));
 339 }
 340
 341 /*
 342  * Make all threads sleeping on the specified identifier runnable.
 343  */
 344 void
 345 wakeup(const void *ident)
 346 {
 347         int wakeup_swapper;
 348
 349         sleepq_lock(ident);
 350         wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0);
 351         sleepq_release(ident);
 352         if (wakeup_swapper) {
 353                 KASSERT(ident != &proc0,
 354                     ("wakeup and wakeup_swapper and proc0"));
 355                 kick_proc0();
 356         }
 357 }
 358
 359 /*
 360  * Make a thread sleeping on the specified identifier runnable.
 361  * May wake more than one thread if a target thread is currently
 362  * swapped out.
 363  */
 364 void
 365 wakeup_one(const void *ident)
 366 {
 367         int wakeup_swapper;
 368
 369         sleepq_lock(ident);
 370         wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP | SLEEPQ_DROP, 0, 0);
 371         if (wakeup_swapper)
 372                 kick_proc0();
 373 }
 374
 375 void
 376 wakeup_any(const void *ident)
 377 {
 378         int wakeup_swapper;
 379
 380         sleepq_lock(ident);
 381         wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP | SLEEPQ_UNFAIR |
 382             SLEEPQ_DROP, 0, 0);
 383         if (wakeup_swapper)
 384                 kick_proc0();
 385 }
 386
 387 /*
 388  * Signal sleeping waiters after the counter has reached zero.
 389  */
 390 void
 391 _blockcount_wakeup(blockcount_t *bc, u_int old)
 392 {
 393
 394         KASSERT(_BLOCKCOUNT_WAITERS(old),
 395             ("%s: no waiters on %p", __func__, bc));
 396
 397         if (atomic_cmpset_int(&bc->__count, _BLOCKCOUNT_WAITERS_FLAG, 0))
 398                 wakeup(bc);
 399 }
 400
 401 /*
 402  * Wait for a wakeup or a signal.  This does not guarantee that the count is
 403  * still zero on return.  Callers wanting a precise answer should use
 404  * blockcount_wait() with an interlock.
 405  *
 406  * If there is no work to wait for, return 0.  If the sleep was interrupted by a
 407  * signal, return EINTR or ERESTART, and return EAGAIN otherwise.
 408  */
 409 int
 410 _blockcount_sleep(blockcount_t *bc, struct lock_object *lock, const char *wmesg,
 411     int prio)
 412 {
 413         void *wchan;
 414         uintptr_t lock_state;
 415         u_int old;
 416         int ret;
 417         bool catch, drop;
 418
 419         KASSERT(lock != &Giant.lock_object,
 420             ("%s: cannot use Giant as the interlock", __func__));
 421
 422         catch = (prio & PCATCH) != 0;
 423         drop = (prio & PDROP) != 0;
 424         prio &= PRIMASK;
 425
 426         /*
 427          * Synchronize with the fence in blockcount_release().  If we end up
 428          * waiting, the sleepqueue lock acquisition will provide the required
 429          * side effects.
 430          *
 431          * If there is no work to wait for, but waiters are present, try to put
 432          * ourselves to sleep to avoid jumping ahead.
 433          */
 434         if (atomic_load_acq_int(&bc->__count) == 0) {
 435                 if (lock != NULL && drop)
 436                         LOCK_CLASS(lock)->lc_unlock(lock);
 437                 return (0);
 438         }
 439         lock_state = 0;
 440         wchan = bc;
 441         sleepq_lock(wchan);
 442         DROP_GIANT();
 443         if (lock != NULL)
 444                 lock_state = LOCK_CLASS(lock)->lc_unlock(lock);
 445         old = blockcount_read(bc);
 446         ret = 0;
 447         do {
 448                 if (_BLOCKCOUNT_COUNT(old) == 0) {
 449                         sleepq_release(wchan);
 450                         goto out;
 451                 }
 452                 if (_BLOCKCOUNT_WAITERS(old))
 453                         break;
 454         } while (!atomic_fcmpset_int(&bc->__count, &old,
 455             old | _BLOCKCOUNT_WAITERS_FLAG));
 456         sleepq_add(wchan, NULL, wmesg, catch ? SLEEPQ_INTERRUPTIBLE : 0, 0);
 457         if (catch)
 458                 ret = sleepq_wait_sig(wchan, prio);
 459         else
 460                 sleepq_wait(wchan, prio);
 461         if (ret == 0)
 462                 ret = EAGAIN;
 463
 464 out:
 465         PICKUP_GIANT();
 466         if (lock != NULL && !drop)
 467                 LOCK_CLASS(lock)->lc_lock(lock, lock_state);
 468
 469         return (ret);
 470 }
 471
 472 static void
 473 kdb_switch(void)
 474 {
 475         thread_unlock(curthread);
 476         kdb_backtrace();
 477         kdb_reenter();
 478         panic("%s: did not reenter debugger", __func__);
 479 }
 480
 481 /*
 482  * mi_switch(9): The machine-independent parts of context switching.
 483  *
 484  * The thread lock is required on entry and is no longer held on return.
 485  */
 486 void
 487 mi_switch(int flags)
 488 {
 489         uint64_t runtime, new_switchtime;
 490         struct thread *td;
 491
 492         td = curthread;                 /* XXX */
 493         THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
 494         KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code"));
 495 #ifdef INVARIANTS
 496         if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td))
 497                 mtx_assert(&Giant, MA_NOTOWNED);
 498 #endif
 499         /* thread_lock() performs spinlock_enter(). */
 500         KASSERT(td->td_critnest == 1 || KERNEL_PANICKED(),
 501             ("mi_switch: switch in a critical section"));
 502         KASSERT((flags & (SW_INVOL | SW_VOL)) != 0,
 503             ("mi_switch: switch must be voluntary or involuntary"));
 504         KASSERT((flags & SW_TYPE_MASK) != 0,
 505             ("mi_switch: a switch reason (type) must be specified"));
 506         KASSERT((flags & SW_TYPE_MASK) < SWT_COUNT,
 507             ("mi_switch: invalid switch reason %d", (flags & SW_TYPE_MASK)));
 508
 509         /*
 510          * Don't perform context switches from the debugger.
 511          */
 512         if (kdb_active)
 513                 kdb_switch();
 514         if (SCHEDULER_STOPPED_TD(td))
 515                 return;
 516         if (flags & SW_VOL) {
 517                 td->td_ru.ru_nvcsw++;
 518                 td->td_swvoltick = ticks;
 519         } else {
 520                 td->td_ru.ru_nivcsw++;
 521                 td->td_swinvoltick = ticks;
 522         }
 523 #ifdef SCHED_STATS
 524         SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]);
 525 #endif
 526         /*
 527          * Compute the amount of time during which the current
 528          * thread was running, and add that to its total so far.
 529          */
 530         new_switchtime = cpu_ticks();
 531         runtime = new_switchtime - PCPU_GET(switchtime);
 532         td->td_runtime += runtime;
 533         td->td_incruntime += runtime;
 534         PCPU_SET(switchtime, new_switchtime);
 535         td->td_generation++;    /* bump preempt-detect counter */
 536         VM_CNT_INC(v_swtch);
 537         PCPU_SET(switchticks, ticks);
 538         CTR4(KTR_PROC, "mi_switch: old thread %ld (td_sched %p, pid %ld, %s)",
 539             td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name);
 540 #ifdef KDTRACE_HOOKS
 541         if (SDT_PROBES_ENABLED() &&
 542             ((flags & SW_PREEMPT) != 0 || ((flags & SW_INVOL) != 0 &&
 543             (flags & SW_TYPE_MASK) == SWT_NEEDRESCHED)))
 544                 SDT_PROBE0(sched, , , preempt);
 545 #endif
 546         sched_switch(td, flags);
 547         CTR4(KTR_PROC, "mi_switch: new thread %ld (td_sched %p, pid %ld, %s)",
 548             td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name);
 549
 550         /*
 551          * If the last thread was exiting, finish cleaning it up.
 552          */
 553         if ((td = PCPU_GET(deadthread))) {
 554                 PCPU_SET(deadthread, NULL);
 555                 thread_stash(td);
 556         }
 557         spinlock_exit();
 558 }
 559
 560 /*
 561  * Change thread state to be runnable, placing it on the run queue if
 562  * it is in memory.  If it is swapped out, return true so our caller
 563  * will know to awaken the swapper.
 564  *
 565  * Requires the thread lock on entry, drops on exit.
 566  */
 567 int
 568 setrunnable(struct thread *td, int srqflags)
 569 {
 570         int swapin;
 571
 572         THREAD_LOCK_ASSERT(td, MA_OWNED);
 573         KASSERT(td->td_proc->p_state != PRS_ZOMBIE,
 574             ("setrunnable: pid %d is a zombie", td->td_proc->p_pid));
 575
 576         swapin = 0;
 577         switch (TD_GET_STATE(td)) {
 578         case TDS_RUNNING:
 579         case TDS_RUNQ:
 580                 break;
 581         case TDS_CAN_RUN:
 582                 KASSERT((td->td_flags & TDF_INMEM) != 0,
 583                     ("setrunnable: td %p not in mem, flags 0x%X inhibit 0x%X",
 584                     td, td->td_flags, td->td_inhibitors));
 585                 /* unlocks thread lock according to flags */
 586                 sched_wakeup(td, srqflags);
 587                 return (0);
 588         case TDS_INHIBITED:
 589                 /*
 590                  * If we are only inhibited because we are swapped out
 591                  * arrange to swap in this process.
 592                  */
 593                 if (td->td_inhibitors == TDI_SWAPPED &&
 594                     (td->td_flags & TDF_SWAPINREQ) == 0) {
 595                         td->td_flags |= TDF_SWAPINREQ;
 596                         swapin = 1;
 597                 }
 598                 break;
 599         default:
 600                 panic("setrunnable: state 0x%x", TD_GET_STATE(td));
 601         }
 602         if ((srqflags & (SRQ_HOLD | SRQ_HOLDTD)) == 0)
 603                 thread_unlock(td);
 604
 605         return (swapin);
 606 }
 607
 608 /*
 609  * Compute a tenex style load average of a quantity on
 610  * 1, 5 and 15 minute intervals.
 611  */
 612 static void
 613 loadav(void *arg)
 614 {
 615         int i;
 616         uint64_t nrun;
 617         struct loadavg *avg;
 618
 619         nrun = (uint64_t)sched_load();
 620         avg = &averunnable;
 621
 622         for (i = 0; i < 3; i++)
 623                 avg->ldavg[i] = (cexp[i] * (uint64_t)avg->ldavg[i] +
 624                     nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
 625
 626         /*
 627          * Schedule the next update to occur after 5 seconds, but add a
 628          * random variation to avoid synchronisation with processes that
 629          * run at regular intervals.
 630          */
 631         callout_reset_sbt(&loadav_callout,
 632             SBT_1US * (4000000 + (int)(random() % 2000001)), SBT_1US,
 633             loadav, NULL, C_DIRECT_EXEC | C_PREL(32));
 634 }
 635
 636 static void
 637 ast_scheduler(struct thread *td, int tda __unused)
 638 {
 639 #ifdef KTRACE
 640         if (KTRPOINT(td, KTR_CSW))
 641                 ktrcsw(1, 1, __func__);
 642 #endif
 643         thread_lock(td);
 644         sched_prio(td, td->td_user_pri);
 645         mi_switch(SW_INVOL | SWT_NEEDRESCHED);
 646 #ifdef KTRACE
 647         if (KTRPOINT(td, KTR_CSW))
 648                 ktrcsw(0, 1, __func__);
 649 #endif
 650 }
 651
 652 static void
 653 synch_setup(void *dummy __unused)
 654 {
 655         callout_init(&loadav_callout, 1);
 656         ast_register(TDA_SCHED, ASTR_ASTF_REQUIRED, 0, ast_scheduler);
 657
 658         /* Kick off timeout driven events by calling first time. */
 659         loadav(NULL);
 660 }
 661
 662 bool
 663 should_yield(void)
 664 {
 665
 666         return ((u_int)ticks - (u_int)curthread->td_swvoltick >= hogticks);
 667 }
 668
 669 void
 670 maybe_yield(void)
 671 {
 672
 673         if (should_yield())
 674                 kern_yield(PRI_USER);
 675 }
 676
 677 void
 678 kern_yield(int prio)
 679 {
 680         struct thread *td;
 681
 682         td = curthread;
 683         DROP_GIANT();
 684         thread_lock(td);
 685         if (prio == PRI_USER)
 686                 prio = td->td_user_pri;
 687         if (prio >= 0)
 688                 sched_prio(td, prio);
 689         mi_switch(SW_VOL | SWT_RELINQUISH);
 690         PICKUP_GIANT();
 691 }
 692
 693 /*
 694  * General purpose yield system call.
 695  */
 696 int
 697 sys_yield(struct thread *td, struct yield_args *uap)
 698 {
 699
 700         thread_lock(td);
 701         if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
 702                 sched_prio(td, PRI_MAX_TIMESHARE);
 703         mi_switch(SW_VOL | SWT_RELINQUISH);
 704         td->td_retval[0] = 0;
 705         return (0);
 706 }
 707
 708 int
 709 sys_sched_getcpu(struct thread *td, struct sched_getcpu_args *uap)
 710 {
 711         td->td_retval[0] = td->td_oncpu;
 712         return (0);
 713 }