sys/kern/sched_4bsd.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 4. Neither the name of the University nor the names of its contributors
  19  *    may be used to endorse or promote products derived from this software
  20  *    without specific prior written permission.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  */
  34
  35 #include <sys/cdefs.h>
  36 __FBSDID("$FreeBSD$");
  37
  38 #include "opt_hwpmc_hooks.h"
  39
  40 #include <sys/param.h>
  41 #include <sys/systm.h>
  42 #include <sys/kernel.h>
  43 #include <sys/ktr.h>
  44 #include <sys/lock.h>
  45 #include <sys/kthread.h>
  46 #include <sys/mutex.h>
  47 #include <sys/proc.h>
  48 #include <sys/resourcevar.h>
  49 #include <sys/sched.h>
  50 #include <sys/smp.h>
  51 #include <sys/sysctl.h>
  52 #include <sys/sx.h>
  53 #include <sys/turnstile.h>
  54 #include <sys/umtx.h>
  55 #include <machine/pcb.h>
  56 #include <machine/smp.h>
  57
  58 #ifdef HWPMC_HOOKS
  59 #include <sys/pmckern.h>
  60 #endif
  61
  62 /*
  63  * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
  64  * the range 100-256 Hz (approximately).
  65  */
  66 #define ESTCPULIM(e) \
  67     min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
  68     RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
  69 #ifdef SMP
  70 #define INVERSE_ESTCPU_WEIGHT   (8 * smp_cpus)
  71 #else
  72 #define INVERSE_ESTCPU_WEIGHT   8       /* 1 / (priorities per estcpu level). */
  73 #endif
  74 #define NICE_WEIGHT             1       /* Priorities per nice level. */
  75
  76 /*
  77  * The schedulable entity that runs a context.
  78  * This is  an extension to the thread structure and is tailored to
  79  * the requirements of this scheduler
  80  */
  81 struct td_sched {
  82         TAILQ_ENTRY(td_sched) ts_procq; /* (j/z) Run queue. */
  83         struct thread   *ts_thread;     /* (*) Active associated thread. */
  84         fixpt_t         ts_pctcpu;      /* (j) %cpu during p_swtime. */
  85         u_char          ts_rqindex;     /* (j) Run queue index. */
  86         int             ts_cpticks;     /* (j) Ticks of cpu time. */
  87         int             ts_slptime;     /* (j) Seconds !RUNNING. */
  88         struct runq     *ts_runq;       /* runq the thread is currently on */
  89 };
  90
  91 /* flags kept in td_flags */
  92 #define TDF_DIDRUN      TDF_SCHED0      /* thread actually ran. */
  93 #define TDF_EXIT        TDF_SCHED1      /* thread is being killed. */
  94 #define TDF_BOUND       TDF_SCHED2
  95
  96 #define ts_flags        ts_thread->td_flags
  97 #define TSF_DIDRUN      TDF_DIDRUN /* thread actually ran. */
  98 #define TSF_EXIT        TDF_EXIT /* thread is being killed. */
  99 #define TSF_BOUND       TDF_BOUND /* stuck to one CPU */
 100
 101 #define SKE_RUNQ_PCPU(ts)                                               \
 102     ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)
 103
 104 static struct td_sched td_sched0;
 105 struct mtx sched_lock;
 106
 107 static int      sched_tdcnt;    /* Total runnable threads in the system. */
 108 static int      sched_quantum;  /* Roundrobin scheduling quantum in ticks. */
 109 #define SCHED_QUANTUM   (hz / 10)       /* Default sched quantum */
 110
 111 static void     setup_runqs(void);
 112 static void     schedcpu(void);
 113 static void     schedcpu_thread(void);
 114 static void     sched_priority(struct thread *td, u_char prio);
 115 static void     sched_setup(void *dummy);
 116 static void     maybe_resched(struct thread *td);
 117 static void     updatepri(struct thread *td);
 118 static void     resetpriority(struct thread *td);
 119 static void     resetpriority_thread(struct thread *td);
 120 #ifdef SMP
 121 static int      forward_wakeup(int  cpunum);
 122 #endif
 123
 124 static struct kproc_desc sched_kp = {
 125         "schedcpu",
 126         schedcpu_thread,
 127         NULL
 128 };
 129 SYSINIT(schedcpu, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, kproc_start, &sched_kp)
 130 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
 131
 132 /*
 133  * Global run queue.
 134  */
 135 static struct runq runq;
 136
 137 #ifdef SMP
 138 /*
 139  * Per-CPU run queues
 140  */
 141 static struct runq runq_pcpu[MAXCPU];
 142 #endif
 143
 144 static void
 145 setup_runqs(void)
 146 {
 147 #ifdef SMP
 148         int i;
 149
 150         for (i = 0; i < MAXCPU; ++i)
 151                 runq_init(&runq_pcpu[i]);
 152 #endif
 153
 154         runq_init(&runq);
 155 }
 156
 157 static int
 158 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
 159 {
 160         int error, new_val;
 161
 162         new_val = sched_quantum * tick;
 163         error = sysctl_handle_int(oidp, &new_val, 0, req);
 164         if (error != 0 || req->newptr == NULL)
 165                 return (error);
 166         if (new_val < tick)
 167                 return (EINVAL);
 168         sched_quantum = new_val / tick;
 169         hogticks = 2 * sched_quantum;
 170         return (0);
 171 }
 172
 173 SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD, 0, "Scheduler");
 174
 175 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
 176     "Scheduler name");
 177
 178 SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW,
 179     0, sizeof sched_quantum, sysctl_kern_quantum, "I",
 180     "Roundrobin scheduling quantum in microseconds");
 181
 182 #ifdef SMP
 183 /* Enable forwarding of wakeups to all other cpus */
 184 SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL, "Kernel SMP");
 185
 186 static int forward_wakeup_enabled = 1;
 187 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
 188            &forward_wakeup_enabled, 0,
 189            "Forwarding of wakeup to idle CPUs");
 190
 191 static int forward_wakeups_requested = 0;
 192 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
 193            &forward_wakeups_requested, 0,
 194            "Requests for Forwarding of wakeup to idle CPUs");
 195
 196 static int forward_wakeups_delivered = 0;
 197 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
 198            &forward_wakeups_delivered, 0,
 199            "Completed Forwarding of wakeup to idle CPUs");
 200
 201 static int forward_wakeup_use_mask = 1;
 202 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
 203            &forward_wakeup_use_mask, 0,
 204            "Use the mask of idle cpus");
 205
 206 static int forward_wakeup_use_loop = 0;
 207 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
 208            &forward_wakeup_use_loop, 0,
 209            "Use a loop to find idle cpus");
 210
 211 static int forward_wakeup_use_single = 0;
 212 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, onecpu, CTLFLAG_RW,
 213            &forward_wakeup_use_single, 0,
 214            "Only signal one idle cpu");
 215
 216 static int forward_wakeup_use_htt = 0;
 217 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, htt2, CTLFLAG_RW,
 218            &forward_wakeup_use_htt, 0,
 219            "account for htt");
 220
 221 #endif
 222 #if 0
 223 static int sched_followon = 0;
 224 SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
 225            &sched_followon, 0,
 226            "allow threads to share a quantum");
 227 #endif
 228
 229 static __inline void
 230 sched_load_add(void)
 231 {
 232         sched_tdcnt++;
 233         CTR1(KTR_SCHED, "global load: %d", sched_tdcnt);
 234 }
 235
 236 static __inline void
 237 sched_load_rem(void)
 238 {
 239         sched_tdcnt--;
 240         CTR1(KTR_SCHED, "global load: %d", sched_tdcnt);
 241 }
 242 /*
 243  * Arrange to reschedule if necessary, taking the priorities and
 244  * schedulers into account.
 245  */
 246 static void
 247 maybe_resched(struct thread *td)
 248 {
 249
 250         THREAD_LOCK_ASSERT(td, MA_OWNED);
 251         if (td->td_priority < curthread->td_priority)
 252                 curthread->td_flags |= TDF_NEEDRESCHED;
 253 }
 254
 255 /*
 256  * Constants for digital decay and forget:
 257  *      90% of (td_estcpu) usage in 5 * loadav time
 258  *      95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
 259  *          Note that, as ps(1) mentions, this can let percentages
 260  *          total over 100% (I've seen 137.9% for 3 processes).
 261  *
 262  * Note that schedclock() updates td_estcpu and p_cpticks asynchronously.
 263  *
 264  * We wish to decay away 90% of td_estcpu in (5 * loadavg) seconds.
 265  * That is, the system wants to compute a value of decay such
 266  * that the following for loop:
 267  *      for (i = 0; i < (5 * loadavg); i++)
 268  *              td_estcpu *= decay;
 269  * will compute
 270  *      td_estcpu *= 0.1;
 271  * for all values of loadavg:
 272  *
 273  * Mathematically this loop can be expressed by saying:
 274  *      decay ** (5 * loadavg) ~= .1
 275  *
 276  * The system computes decay as:
 277  *      decay = (2 * loadavg) / (2 * loadavg + 1)
 278  *
 279  * We wish to prove that the system's computation of decay
 280  * will always fulfill the equation:
 281  *      decay ** (5 * loadavg) ~= .1
 282  *
 283  * If we compute b as:
 284  *      b = 2 * loadavg
 285  * then
 286  *      decay = b / (b + 1)
 287  *
 288  * We now need to prove two things:
 289  *      1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 290  *      2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 291  *
 292  * Facts:
 293  *         For x close to zero, exp(x) =~ 1 + x, since
 294  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 295  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 296  *         For x close to zero, ln(1+x) =~ x, since
 297  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 298  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 299  *         ln(.1) =~ -2.30
 300  *
 301  * Proof of (1):
 302  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 303  *      solving for factor,
 304  *      ln(factor) =~ (-2.30/5*loadav), or
 305  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 306  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 307  *
 308  * Proof of (2):
 309  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 310  *      solving for power,
 311  *      power*ln(b/(b+1)) =~ -2.30, or
 312  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 313  *
 314  * Actual power values for the implemented algorithm are as follows:
 315  *      loadav: 1       2       3       4
 316  *      power:  5.68    10.32   14.94   19.55
 317  */
 318
 319 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
 320 #define loadfactor(loadav)      (2 * (loadav))
 321 #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE))
 322
 323 /* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
 324 static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
 325 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
 326
 327 /*
 328  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 329  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 330  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 331  *
 332  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 333  *      1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 334  *
 335  * If you don't want to bother with the faster/more-accurate formula, you
 336  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 337  * (more general) method of calculating the %age of CPU used by a process.
 338  */
 339 #define CCPU_SHIFT      11
 340
 341 /*
 342  * Recompute process priorities, every hz ticks.
 343  * MP-safe, called without the Giant mutex.
 344  */
 345 /* ARGSUSED */
 346 static void
 347 schedcpu(void)
 348 {
 349         register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
 350         struct thread *td;
 351         struct proc *p;
 352         struct td_sched *ts;
 353         int awake, realstathz;
 354
 355         realstathz = stathz ? stathz : hz;
 356         sx_slock(&allproc_lock);
 357         FOREACH_PROC_IN_SYSTEM(p) {
 358                 PROC_SLOCK(p);
 359                 FOREACH_THREAD_IN_PROC(p, td) {
 360                         awake = 0;
 361                         thread_lock(td);
 362                         ts = td->td_sched;
 363                         /*
 364                          * Increment sleep time (if sleeping).  We
 365                          * ignore overflow, as above.
 366                          */
 367                         /*
 368                          * The td_sched slptimes are not touched in wakeup
 369                          * because the thread may not HAVE everything in
 370                          * memory? XXX I think this is out of date.
 371                          */
 372                         if (TD_ON_RUNQ(td)) {
 373                                 awake = 1;
 374                                 ts->ts_flags &= ~TSF_DIDRUN;
 375                         } else if (TD_IS_RUNNING(td)) {
 376                                 awake = 1;
 377                                 /* Do not clear TSF_DIDRUN */
 378                         } else if (ts->ts_flags & TSF_DIDRUN) {
 379                                 awake = 1;
 380                                 ts->ts_flags &= ~TSF_DIDRUN;
 381                         }
 382
 383                         /*
 384                          * ts_pctcpu is only for ps and ttyinfo().
 385                          * Do it per td_sched, and add them up at the end?
 386                          * XXXKSE
 387                          */
 388                         ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT;
 389                         /*
 390                          * If the td_sched has been idle the entire second,
 391                          * stop recalculating its priority until
 392                          * it wakes up.
 393                          */
 394                         if (ts->ts_cpticks != 0) {
 395 #if     (FSHIFT >= CCPU_SHIFT)
 396                                 ts->ts_pctcpu += (realstathz == 100)
 397                                     ? ((fixpt_t) ts->ts_cpticks) <<
 398                                     (FSHIFT - CCPU_SHIFT) :
 399                                     100 * (((fixpt_t) ts->ts_cpticks)
 400                                     << (FSHIFT - CCPU_SHIFT)) / realstathz;
 401 #else
 402                                 ts->ts_pctcpu += ((FSCALE - ccpu) *
 403                                     (ts->ts_cpticks *
 404                                     FSCALE / realstathz)) >> FSHIFT;
 405 #endif
 406                                 ts->ts_cpticks = 0;
 407                         }
 408                         /*
 409                          * If there are ANY running threads in this process,
 410                          * then don't count it as sleeping.
 411 XXX  this is broken
 412
 413                          */
 414                         if (awake) {
 415                                 if (ts->ts_slptime > 1) {
 416                                         /*
 417                                          * In an ideal world, this should not
 418                                          * happen, because whoever woke us
 419                                          * up from the long sleep should have
 420                                          * unwound the slptime and reset our
 421                                          * priority before we run at the stale
 422                                          * priority.  Should KASSERT at some
 423                                          * point when all the cases are fixed.
 424                                          */
 425                                         updatepri(td);
 426                                 }
 427                                 ts->ts_slptime = 0;
 428                         } else
 429                                 ts->ts_slptime++;
 430                         if (ts->ts_slptime > 1) {
 431                                 thread_unlock(td);
 432                                 continue;
 433                         }
 434                         td->td_estcpu = decay_cpu(loadfac, td->td_estcpu);
 435                         resetpriority(td);
 436                         resetpriority_thread(td);
 437                         thread_unlock(td);
 438                 } /* end of thread loop */
 439                 PROC_SUNLOCK(p);
 440         } /* end of process loop */
 441         sx_sunlock(&allproc_lock);
 442 }
 443
 444 /*
 445  * Main loop for a kthread that executes schedcpu once a second.
 446  */
 447 static void
 448 schedcpu_thread(void)
 449 {
 450
 451         for (;;) {
 452                 schedcpu();
 453                 pause("-", hz);
 454         }
 455 }
 456
 457 /*
 458  * Recalculate the priority of a process after it has slept for a while.
 459  * For all load averages >= 1 and max td_estcpu of 255, sleeping for at
 460  * least six times the loadfactor will decay td_estcpu to zero.
 461  */
 462 static void
 463 updatepri(struct thread *td)
 464 {
 465         struct td_sched *ts;
 466         fixpt_t loadfac;
 467         unsigned int newcpu;
 468
 469         ts = td->td_sched;
 470         loadfac = loadfactor(averunnable.ldavg[0]);
 471         if (ts->ts_slptime > 5 * loadfac)
 472                 td->td_estcpu = 0;
 473         else {
 474                 newcpu = td->td_estcpu;
 475                 ts->ts_slptime--;       /* was incremented in schedcpu() */
 476                 while (newcpu && --ts->ts_slptime)
 477                         newcpu = decay_cpu(loadfac, newcpu);
 478                 td->td_estcpu = newcpu;
 479         }
 480 }
 481
 482 /*
 483  * Compute the priority of a process when running in user mode.
 484  * Arrange to reschedule if the resulting priority is better
 485  * than that of the current process.
 486  */
 487 static void
 488 resetpriority(struct thread *td)
 489 {
 490         register unsigned int newpriority;
 491
 492         if (td->td_pri_class == PRI_TIMESHARE) {
 493                 newpriority = PUSER + td->td_estcpu / INVERSE_ESTCPU_WEIGHT +
 494                     NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN);
 495                 newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
 496                     PRI_MAX_TIMESHARE);
 497                 sched_user_prio(td, newpriority);
 498         }
 499 }
 500
 501 /*
 502  * Update the thread's priority when the associated process's user
 503  * priority changes.
 504  */
 505 static void
 506 resetpriority_thread(struct thread *td)
 507 {
 508
 509         /* Only change threads with a time sharing user priority. */
 510         if (td->td_priority < PRI_MIN_TIMESHARE ||
 511             td->td_priority > PRI_MAX_TIMESHARE)
 512                 return;
 513
 514         /* XXX the whole needresched thing is broken, but not silly. */
 515         maybe_resched(td);
 516
 517         sched_prio(td, td->td_user_pri);
 518 }
 519
 520 /* ARGSUSED */
 521 static void
 522 sched_setup(void *dummy)
 523 {
 524         setup_runqs();
 525
 526         if (sched_quantum == 0)
 527                 sched_quantum = SCHED_QUANTUM;
 528         hogticks = 2 * sched_quantum;
 529
 530         /* Account for thread0. */
 531         sched_load_add();
 532 }
 533
 534 /* External interfaces start here */
 535 /*
 536  * Very early in the boot some setup of scheduler-specific
 537  * parts of proc0 and of some scheduler resources needs to be done.
 538  * Called from:
 539  *  proc0_init()
 540  */
 541 void
 542 schedinit(void)
 543 {
 544         /*
 545          * Set up the scheduler specific parts of proc0.
 546          */
 547         proc0.p_sched = NULL; /* XXX */
 548         thread0.td_sched = &td_sched0;
 549         thread0.td_lock = &sched_lock;
 550         td_sched0.ts_thread = &thread0;
 551         mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
 552 }
 553
 554 int
 555 sched_runnable(void)
 556 {
 557 #ifdef SMP
 558         return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]);
 559 #else
 560         return runq_check(&runq);
 561 #endif
 562 }
 563
 564 int
 565 sched_rr_interval(void)
 566 {
 567         if (sched_quantum == 0)
 568                 sched_quantum = SCHED_QUANTUM;
 569         return (sched_quantum);
 570 }
 571
 572 /*
 573  * We adjust the priority of the current process.  The priority of
 574  * a process gets worse as it accumulates CPU time.  The cpu usage
 575  * estimator (td_estcpu) is increased here.  resetpriority() will
 576  * compute a different priority each time td_estcpu increases by
 577  * INVERSE_ESTCPU_WEIGHT
 578  * (until MAXPRI is reached).  The cpu usage estimator ramps up
 579  * quite quickly when the process is running (linearly), and decays
 580  * away exponentially, at a rate which is proportionally slower when
 581  * the system is busy.  The basic principle is that the system will
 582  * 90% forget that the process used a lot of CPU time in 5 * loadav
 583  * seconds.  This causes the system to favor processes which haven't
 584  * run much recently, and to round-robin among other processes.
 585  */
 586 void
 587 sched_clock(struct thread *td)
 588 {
 589         struct td_sched *ts;
 590
 591         THREAD_LOCK_ASSERT(td, MA_OWNED);
 592         ts = td->td_sched;
 593
 594         ts->ts_cpticks++;
 595         td->td_estcpu = ESTCPULIM(td->td_estcpu + 1);
 596         if ((td->td_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
 597                 resetpriority(td);
 598                 resetpriority_thread(td);
 599         }
 600
 601         /*
 602          * Force a context switch if the current thread has used up a full
 603          * quantum (default quantum is 100ms).
 604          */
 605         if (!TD_IS_IDLETHREAD(td) &&
 606             ticks - PCPU_GET(switchticks) >= sched_quantum)
 607                 td->td_flags |= TDF_NEEDRESCHED;
 608 }
 609
 610 /*
 611  * charge childs scheduling cpu usage to parent.
 612  */
 613 void
 614 sched_exit(struct proc *p, struct thread *td)
 615 {
 616
 617         CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
 618             td, td->td_name, td->td_priority);
 619         PROC_SLOCK_ASSERT(p, MA_OWNED);
 620         sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
 621 }
 622
 623 void
 624 sched_exit_thread(struct thread *td, struct thread *child)
 625 {
 626
 627         CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
 628             child, child->td_name, child->td_priority);
 629         thread_lock(td);
 630         td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu);
 631         thread_unlock(td);
 632         mtx_lock_spin(&sched_lock);
 633         if ((child->td_proc->p_flag & P_NOLOAD) == 0)
 634                 sched_load_rem();
 635         mtx_unlock_spin(&sched_lock);
 636 }
 637
 638 void
 639 sched_fork(struct thread *td, struct thread *childtd)
 640 {
 641         sched_fork_thread(td, childtd);
 642 }
 643
 644 void
 645 sched_fork_thread(struct thread *td, struct thread *childtd)
 646 {
 647         childtd->td_estcpu = td->td_estcpu;
 648         childtd->td_lock = &sched_lock;
 649         sched_newthread(childtd);
 650 }
 651
 652 void
 653 sched_nice(struct proc *p, int nice)
 654 {
 655         struct thread *td;
 656
 657         PROC_LOCK_ASSERT(p, MA_OWNED);
 658         PROC_SLOCK_ASSERT(p, MA_OWNED);
 659         p->p_nice = nice;
 660         FOREACH_THREAD_IN_PROC(p, td) {
 661                 thread_lock(td);
 662                 resetpriority(td);
 663                 resetpriority_thread(td);
 664                 thread_unlock(td);
 665         }
 666 }
 667
 668 void
 669 sched_class(struct thread *td, int class)
 670 {
 671         THREAD_LOCK_ASSERT(td, MA_OWNED);
 672         td->td_pri_class = class;
 673 }
 674
 675 /*
 676  * Adjust the priority of a thread.
 677  */
 678 static void
 679 sched_priority(struct thread *td, u_char prio)
 680 {
 681         CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
 682             td, td->td_name, td->td_priority, prio, curthread,
 683             curthread->td_name);
 684
 685         THREAD_LOCK_ASSERT(td, MA_OWNED);
 686         if (td->td_priority == prio)
 687                 return;
 688         td->td_priority = prio;
 689         if (TD_ON_RUNQ(td) &&
 690             td->td_sched->ts_rqindex != (prio / RQ_PPQ)) {
 691                 sched_rem(td);
 692                 sched_add(td, SRQ_BORING);
 693         }
 694 }
 695
 696 /*
 697  * Update a thread's priority when it is lent another thread's
 698  * priority.
 699  */
 700 void
 701 sched_lend_prio(struct thread *td, u_char prio)
 702 {
 703
 704         td->td_flags |= TDF_BORROWING;
 705         sched_priority(td, prio);
 706 }
 707
 708 /*
 709  * Restore a thread's priority when priority propagation is
 710  * over.  The prio argument is the minimum priority the thread
 711  * needs to have to satisfy other possible priority lending
 712  * requests.  If the thread's regulary priority is less
 713  * important than prio the thread will keep a priority boost
 714  * of prio.
 715  */
 716 void
 717 sched_unlend_prio(struct thread *td, u_char prio)
 718 {
 719         u_char base_pri;
 720
 721         if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
 722             td->td_base_pri <= PRI_MAX_TIMESHARE)
 723                 base_pri = td->td_user_pri;
 724         else
 725                 base_pri = td->td_base_pri;
 726         if (prio >= base_pri) {
 727                 td->td_flags &= ~TDF_BORROWING;
 728                 sched_prio(td, base_pri);
 729         } else
 730                 sched_lend_prio(td, prio);
 731 }
 732
 733 void
 734 sched_prio(struct thread *td, u_char prio)
 735 {
 736         u_char oldprio;
 737
 738         /* First, update the base priority. */
 739         td->td_base_pri = prio;
 740
 741         /*
 742          * If the thread is borrowing another thread's priority, don't ever
 743          * lower the priority.
 744          */
 745         if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
 746                 return;
 747
 748         /* Change the real priority. */
 749         oldprio = td->td_priority;
 750         sched_priority(td, prio);
 751
 752         /*
 753          * If the thread is on a turnstile, then let the turnstile update
 754          * its state.
 755          */
 756         if (TD_ON_LOCK(td) && oldprio != prio)
 757                 turnstile_adjust(td, oldprio);
 758 }
 759
 760 void
 761 sched_user_prio(struct thread *td, u_char prio)
 762 {
 763         u_char oldprio;
 764
 765         THREAD_LOCK_ASSERT(td, MA_OWNED);
 766         td->td_base_user_pri = prio;
 767         if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
 768                 return;
 769         oldprio = td->td_user_pri;
 770         td->td_user_pri = prio;
 771 }
 772
 773 void
 774 sched_lend_user_prio(struct thread *td, u_char prio)
 775 {
 776         u_char oldprio;
 777
 778         THREAD_LOCK_ASSERT(td, MA_OWNED);
 779         td->td_flags |= TDF_UBORROWING;
 780         oldprio = td->td_user_pri;
 781         td->td_user_pri = prio;
 782 }
 783
 784 void
 785 sched_unlend_user_prio(struct thread *td, u_char prio)
 786 {
 787         u_char base_pri;
 788
 789         THREAD_LOCK_ASSERT(td, MA_OWNED);
 790         base_pri = td->td_base_user_pri;
 791         if (prio >= base_pri) {
 792                 td->td_flags &= ~TDF_UBORROWING;
 793                 sched_user_prio(td, base_pri);
 794         } else {
 795                 sched_lend_user_prio(td, prio);
 796         }
 797 }
 798
 799 void
 800 sched_sleep(struct thread *td)
 801 {
 802
 803         THREAD_LOCK_ASSERT(td, MA_OWNED);
 804         td->td_slptick = ticks;
 805         td->td_sched->ts_slptime = 0;
 806 }
 807
 808 void
 809 sched_switch(struct thread *td, struct thread *newtd, int flags)
 810 {
 811         struct td_sched *ts;
 812         struct proc *p;
 813
 814         ts = td->td_sched;
 815         p = td->td_proc;
 816
 817         THREAD_LOCK_ASSERT(td, MA_OWNED);
 818         /*
 819          * Switch to the sched lock to fix things up and pick
 820          * a new thread.
 821          */
 822         if (td->td_lock != &sched_lock) {
 823                 mtx_lock_spin(&sched_lock);
 824                 thread_unlock(td);
 825         }
 826
 827         if ((p->p_flag & P_NOLOAD) == 0)
 828                 sched_load_rem();
 829
 830         if (newtd)
 831                 newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED);
 832
 833         td->td_lastcpu = td->td_oncpu;
 834         td->td_flags &= ~TDF_NEEDRESCHED;
 835         td->td_owepreempt = 0;
 836         td->td_oncpu = NOCPU;
 837         /*
 838          * At the last moment, if this thread is still marked RUNNING,
 839          * then put it back on the run queue as it has not been suspended
 840          * or stopped or any thing else similar.  We never put the idle
 841          * threads on the run queue, however.
 842          */
 843         if (td->td_flags & TDF_IDLETD) {
 844                 TD_SET_CAN_RUN(td);
 845 #ifdef SMP
 846                 idle_cpus_mask &= ~PCPU_GET(cpumask);
 847 #endif
 848         } else {
 849                 if (TD_IS_RUNNING(td)) {
 850                         /* Put us back on the run queue. */
 851                         sched_add(td, (flags & SW_PREEMPT) ?
 852                             SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
 853                             SRQ_OURSELF|SRQ_YIELDING);
 854                 }
 855         }
 856         if (newtd) {
 857                 /*
 858                  * The thread we are about to run needs to be counted
 859                  * as if it had been added to the run queue and selected.
 860                  * It came from:
 861                  * * A preemption
 862                  * * An upcall
 863                  * * A followon
 864                  */
 865                 KASSERT((newtd->td_inhibitors == 0),
 866                         ("trying to run inhibited thread"));
 867                 newtd->td_sched->ts_flags |= TSF_DIDRUN;
 868                 TD_SET_RUNNING(newtd);
 869                 if ((newtd->td_proc->p_flag & P_NOLOAD) == 0)
 870                         sched_load_add();
 871         } else {
 872                 newtd = choosethread();
 873         }
 874         MPASS(newtd->td_lock == &sched_lock);
 875
 876         if (td != newtd) {
 877 #ifdef  HWPMC_HOOKS
 878                 if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 879                         PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 880 #endif
 881                 /* I feel sleepy */
 882                 lock_profile_release_lock(&sched_lock.lock_object);
 883                 cpu_switch(td, newtd, td->td_lock);
 884                 lock_profile_obtain_lock_success(&sched_lock.lock_object,
 885                     0, 0, __FILE__, __LINE__);
 886                 /*
 887                  * Where am I?  What year is it?
 888                  * We are in the same thread that went to sleep above,
 889                  * but any amount of time may have passed. All out context
 890                  * will still be available as will local variables.
 891                  * PCPU values however may have changed as we may have
 892                  * changed CPU so don't trust cached values of them.
 893                  * New threads will go to fork_exit() instead of here
 894                  * so if you change things here you may need to change
 895                  * things there too.
 896                  * If the thread above was exiting it will never wake
 897                  * up again here, so either it has saved everything it
 898                  * needed to, or the thread_wait() or wait() will
 899                  * need to reap it.
 900                  */
 901 #ifdef  HWPMC_HOOKS
 902                 if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 903                         PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
 904 #endif
 905         }
 906
 907 #ifdef SMP
 908         if (td->td_flags & TDF_IDLETD)
 909                 idle_cpus_mask |= PCPU_GET(cpumask);
 910 #endif
 911         sched_lock.mtx_lock = (uintptr_t)td;
 912         td->td_oncpu = PCPU_GET(cpuid);
 913         MPASS(td->td_lock == &sched_lock);
 914 }
 915
 916 void
 917 sched_wakeup(struct thread *td)
 918 {
 919         struct td_sched *ts;
 920
 921         THREAD_LOCK_ASSERT(td, MA_OWNED);
 922         ts = td->td_sched;
 923         if (ts->ts_slptime > 1) {
 924                 updatepri(td);
 925                 resetpriority(td);
 926         }
 927         td->td_slptick = ticks;
 928         ts->ts_slptime = 0;
 929         sched_add(td, SRQ_BORING);
 930 }
 931
 932 #ifdef SMP
 933 /* enable HTT_2 if you have a 2-way HTT cpu.*/
 934 static int
 935 forward_wakeup(int  cpunum)
 936 {
 937         cpumask_t map, me, dontuse;
 938         cpumask_t map2;
 939         struct pcpu *pc;
 940         cpumask_t id, map3;
 941
 942         mtx_assert(&sched_lock, MA_OWNED);
 943
 944         CTR0(KTR_RUNQ, "forward_wakeup()");
 945
 946         if ((!forward_wakeup_enabled) ||
 947              (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
 948                 return (0);
 949         if (!smp_started || cold || panicstr)
 950                 return (0);
 951
 952         forward_wakeups_requested++;
 953
 954 /*
 955  * check the idle mask we received against what we calculated before
 956  * in the old version.
 957  */
 958         me = PCPU_GET(cpumask);
 959         /*
 960          * don't bother if we should be doing it ourself..
 961          */
 962         if ((me & idle_cpus_mask) && (cpunum == NOCPU || me == (1 << cpunum)))
 963                 return (0);
 964
 965         dontuse = me | stopped_cpus | hlt_cpus_mask;
 966         map3 = 0;
 967         if (forward_wakeup_use_loop) {
 968                 SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
 969                         id = pc->pc_cpumask;
 970                         if ( (id & dontuse) == 0 &&
 971                             pc->pc_curthread == pc->pc_idlethread) {
 972                                 map3 |= id;
 973                         }
 974                 }
 975         }
 976
 977         if (forward_wakeup_use_mask) {
 978                 map = 0;
 979                 map = idle_cpus_mask & ~dontuse;
 980
 981                 /* If they are both on, compare and use loop if different */
 982                 if (forward_wakeup_use_loop) {
 983                         if (map != map3) {
 984                                 printf("map (%02X) != map3 (%02X)\n",
 985                                                 map, map3);
 986                                 map = map3;
 987                         }
 988                 }
 989         } else {
 990                 map = map3;
 991         }
 992         /* If we only allow a specific CPU, then mask off all the others */
 993         if (cpunum != NOCPU) {
 994                 KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
 995                 map &= (1 << cpunum);
 996         } else {
 997                 /* Try choose an idle die. */
 998                 if (forward_wakeup_use_htt) {
 999                         map2 =  (map & (map >> 1)) & 0x5555;
1000                         if (map2) {
1001                                 map = map2;
1002                         }
1003                 }
1004
1005                 /* set only one bit */
1006                 if (forward_wakeup_use_single) {
1007                         map = map & ((~map) + 1);
1008                 }
1009         }
1010         if (map) {
1011                 forward_wakeups_delivered++;
1012                 ipi_selected(map, IPI_AST);
1013                 return (1);
1014         }
1015         if (cpunum == NOCPU)
1016                 printf("forward_wakeup: Idle processor not found\n");
1017         return (0);
1018 }
1019 #endif
1020
1021 #ifdef SMP
1022 static void kick_other_cpu(int pri,int cpuid);
1023
1024 static void
1025 kick_other_cpu(int pri,int cpuid)
1026 {
1027         struct pcpu * pcpu = pcpu_find(cpuid);
1028         int cpri = pcpu->pc_curthread->td_priority;
1029
1030         if (idle_cpus_mask & pcpu->pc_cpumask) {
1031                 forward_wakeups_delivered++;
1032                 ipi_selected(pcpu->pc_cpumask, IPI_AST);
1033                 return;
1034         }
1035
1036         if (pri >= cpri)
1037                 return;
1038
1039 #if defined(IPI_PREEMPTION) && defined(PREEMPTION)
1040 #if !defined(FULL_PREEMPTION)
1041         if (pri <= PRI_MAX_ITHD)
1042 #endif /* ! FULL_PREEMPTION */
1043         {
1044                 ipi_selected(pcpu->pc_cpumask, IPI_PREEMPT);
1045                 return;
1046         }
1047 #endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */
1048
1049         pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
1050         ipi_selected( pcpu->pc_cpumask , IPI_AST);
1051         return;
1052 }
1053 #endif /* SMP */
1054
1055 void
1056 sched_add(struct thread *td, int flags)
1057 #ifdef SMP
1058 {
1059         struct td_sched *ts;
1060         int forwarded = 0;
1061         int cpu;
1062         int single_cpu = 0;
1063
1064         ts = td->td_sched;
1065         THREAD_LOCK_ASSERT(td, MA_OWNED);
1066         KASSERT((td->td_inhibitors == 0),
1067             ("sched_add: trying to run inhibited thread"));
1068         KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
1069             ("sched_add: bad thread state"));
1070         KASSERT(td->td_flags & TDF_INMEM,
1071             ("sched_add: thread swapped out"));
1072         CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
1073             td, td->td_name, td->td_priority, curthread,
1074             curthread->td_name);
1075         /*
1076          * Now that the thread is moving to the run-queue, set the lock
1077          * to the scheduler's lock.
1078          */
1079         if (td->td_lock != &sched_lock) {
1080                 mtx_lock_spin(&sched_lock);
1081                 thread_lock_set(td, &sched_lock);
1082         }
1083         TD_SET_RUNQ(td);
1084
1085         if (td->td_pinned != 0) {
1086                 cpu = td->td_lastcpu;
1087                 ts->ts_runq = &runq_pcpu[cpu];
1088                 single_cpu = 1;
1089                 CTR3(KTR_RUNQ,
1090                     "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td, cpu);
1091         } else if ((ts)->ts_flags & TSF_BOUND) {
1092                 /* Find CPU from bound runq */
1093                 KASSERT(SKE_RUNQ_PCPU(ts),("sched_add: bound td_sched not on cpu runq"));
1094                 cpu = ts->ts_runq - &runq_pcpu[0];
1095                 single_cpu = 1;
1096                 CTR3(KTR_RUNQ,
1097                     "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td, cpu);
1098         } else {
1099                 CTR2(KTR_RUNQ,
1100                     "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts, td);
1101                 cpu = NOCPU;
1102                 ts->ts_runq = &runq;
1103         }
1104
1105         if (single_cpu && (cpu != PCPU_GET(cpuid))) {
1106                 kick_other_cpu(td->td_priority,cpu);
1107         } else {
1108
1109                 if (!single_cpu) {
1110                         cpumask_t me = PCPU_GET(cpumask);
1111                         int idle = idle_cpus_mask & me;
1112
1113                         if (!idle && ((flags & SRQ_INTR) == 0) &&
1114                             (idle_cpus_mask & ~(hlt_cpus_mask | me)))
1115                                 forwarded = forward_wakeup(cpu);
1116                 }
1117
1118                 if (!forwarded) {
1119                         if ((flags & SRQ_YIELDING) == 0 && maybe_preempt(td))
1120                                 return;
1121                         else
1122                                 maybe_resched(td);
1123                 }
1124         }
1125
1126         if ((td->td_proc->p_flag & P_NOLOAD) == 0)
1127                 sched_load_add();
1128         runq_add(ts->ts_runq, ts, flags);
1129 }
1130 #else /* SMP */
1131 {
1132         struct td_sched *ts;
1133         ts = td->td_sched;
1134         THREAD_LOCK_ASSERT(td, MA_OWNED);
1135         KASSERT((td->td_inhibitors == 0),
1136             ("sched_add: trying to run inhibited thread"));
1137         KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
1138             ("sched_add: bad thread state"));
1139         KASSERT(td->td_flags & TDF_INMEM,
1140             ("sched_add: thread swapped out"));
1141         CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
1142             td, td->td_name, td->td_priority, curthread,
1143             curthread->td_name);
1144         /*
1145          * Now that the thread is moving to the run-queue, set the lock
1146          * to the scheduler's lock.
1147          */
1148         if (td->td_lock != &sched_lock) {
1149                 mtx_lock_spin(&sched_lock);
1150                 thread_lock_set(td, &sched_lock);
1151         }
1152         TD_SET_RUNQ(td);
1153         CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
1154         ts->ts_runq = &runq;
1155
1156         /*
1157          * If we are yielding (on the way out anyhow)
1158          * or the thread being saved is US,
1159          * then don't try be smart about preemption
1160          * or kicking off another CPU
1161          * as it won't help and may hinder.
1162          * In the YIEDLING case, we are about to run whoever is
1163          * being put in the queue anyhow, and in the
1164          * OURSELF case, we are puting ourself on the run queue
1165          * which also only happens when we are about to yield.
1166          */
1167         if((flags & SRQ_YIELDING) == 0) {
1168                 if (maybe_preempt(td))
1169                         return;
1170         }
1171         if ((td->td_proc->p_flag & P_NOLOAD) == 0)
1172                 sched_load_add();
1173         runq_add(ts->ts_runq, ts, flags);
1174         maybe_resched(td);
1175 }
1176 #endif /* SMP */
1177
1178 void
1179 sched_rem(struct thread *td)
1180 {
1181         struct td_sched *ts;
1182
1183         ts = td->td_sched;
1184         KASSERT(td->td_flags & TDF_INMEM,
1185             ("sched_rem: thread swapped out"));
1186         KASSERT(TD_ON_RUNQ(td),
1187             ("sched_rem: thread not on run queue"));
1188         mtx_assert(&sched_lock, MA_OWNED);
1189         CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
1190             td, td->td_name, td->td_priority, curthread,
1191             curthread->td_name);
1192
1193         if ((td->td_proc->p_flag & P_NOLOAD) == 0)
1194                 sched_load_rem();
1195         runq_remove(ts->ts_runq, ts);
1196         TD_SET_CAN_RUN(td);
1197 }
1198
1199 /*
1200  * Select threads to run.
1201  * Notice that the running threads still consume a slot.
1202  */
1203 struct thread *
1204 sched_choose(void)
1205 {
1206         struct td_sched *ts;
1207         struct runq *rq;
1208
1209         mtx_assert(&sched_lock,  MA_OWNED);
1210 #ifdef SMP
1211         struct td_sched *kecpu;
1212
1213         rq = &runq;
1214         ts = runq_choose(&runq);
1215         kecpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);
1216
1217         if (ts == NULL ||
1218             (kecpu != NULL &&
1219              kecpu->ts_thread->td_priority < ts->ts_thread->td_priority)) {
1220                 CTR2(KTR_RUNQ, "choosing td_sched %p from pcpu runq %d", kecpu,
1221                      PCPU_GET(cpuid));
1222                 ts = kecpu;
1223                 rq = &runq_pcpu[PCPU_GET(cpuid)];
1224         } else {
1225                 CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", ts);
1226         }
1227
1228 #else
1229         rq = &runq;
1230         ts = runq_choose(&runq);
1231 #endif
1232
1233         if (ts) {
1234                 runq_remove(rq, ts);
1235                 ts->ts_flags |= TSF_DIDRUN;
1236
1237                 KASSERT(ts->ts_thread->td_flags & TDF_INMEM,
1238                     ("sched_choose: thread swapped out"));
1239                 return (ts->ts_thread);
1240         }
1241         return (PCPU_GET(idlethread));
1242 }
1243
1244 void
1245 sched_userret(struct thread *td)
1246 {
1247         /*
1248          * XXX we cheat slightly on the locking here to avoid locking in
1249          * the usual case.  Setting td_priority here is essentially an
1250          * incomplete workaround for not setting it properly elsewhere.
1251          * Now that some interrupt handlers are threads, not setting it
1252          * properly elsewhere can clobber it in the window between setting
1253          * it here and returning to user mode, so don't waste time setting
1254          * it perfectly here.
1255          */
1256         KASSERT((td->td_flags & TDF_BORROWING) == 0,
1257             ("thread with borrowed priority returning to userland"));
1258         if (td->td_priority != td->td_user_pri) {
1259                 thread_lock(td);
1260                 td->td_priority = td->td_user_pri;
1261                 td->td_base_pri = td->td_user_pri;
1262                 thread_unlock(td);
1263         }
1264 }
1265
1266 void
1267 sched_bind(struct thread *td, int cpu)
1268 {
1269         struct td_sched *ts;
1270
1271         THREAD_LOCK_ASSERT(td, MA_OWNED);
1272         KASSERT(TD_IS_RUNNING(td),
1273             ("sched_bind: cannot bind non-running thread"));
1274
1275         ts = td->td_sched;
1276
1277         ts->ts_flags |= TSF_BOUND;
1278 #ifdef SMP
1279         ts->ts_runq = &runq_pcpu[cpu];
1280         if (PCPU_GET(cpuid) == cpu)
1281                 return;
1282
1283         mi_switch(SW_VOL, NULL);
1284 #endif
1285 }
1286
1287 void
1288 sched_unbind(struct thread* td)
1289 {
1290         THREAD_LOCK_ASSERT(td, MA_OWNED);
1291         td->td_sched->ts_flags &= ~TSF_BOUND;
1292 }
1293
1294 int
1295 sched_is_bound(struct thread *td)
1296 {
1297         THREAD_LOCK_ASSERT(td, MA_OWNED);
1298         return (td->td_sched->ts_flags & TSF_BOUND);
1299 }
1300
1301 void
1302 sched_relinquish(struct thread *td)
1303 {
1304         thread_lock(td);
1305         SCHED_STAT_INC(switch_relinquish);
1306         mi_switch(SW_VOL, NULL);
1307         thread_unlock(td);
1308 }
1309
1310 int
1311 sched_load(void)
1312 {
1313         return (sched_tdcnt);
1314 }
1315
1316 int
1317 sched_sizeof_proc(void)
1318 {
1319         return (sizeof(struct proc));
1320 }
1321
1322 int
1323 sched_sizeof_thread(void)
1324 {
1325         return (sizeof(struct thread) + sizeof(struct td_sched));
1326 }
1327
1328 fixpt_t
1329 sched_pctcpu(struct thread *td)
1330 {
1331         struct td_sched *ts;
1332
1333         ts = td->td_sched;
1334         return (ts->ts_pctcpu);
1335 }
1336
1337 void
1338 sched_tick(void)
1339 {
1340 }
1341
1342 /*
1343  * The actual idle process.
1344  */
1345 void
1346 sched_idletd(void *dummy)
1347 {
1348
1349         for (;;) {
1350                 mtx_assert(&Giant, MA_NOTOWNED);
1351
1352                 while (sched_runnable() == 0)
1353                         cpu_idle();
1354
1355                 mtx_lock_spin(&sched_lock);
1356                 mi_switch(SW_VOL, NULL);
1357                 mtx_unlock_spin(&sched_lock);
1358         }
1359 }
1360
1361 /*
1362  * A CPU is entering for the first time or a thread is exiting.
1363  */
1364 void
1365 sched_throw(struct thread *td)
1366 {
1367         /*
1368          * Correct spinlock nesting.  The idle thread context that we are
1369          * borrowing was created so that it would start out with a single
1370          * spin lock (sched_lock) held in fork_trampoline().  Since we've
1371          * explicitly acquired locks in this function, the nesting count
1372          * is now 2 rather than 1.  Since we are nested, calling
1373          * spinlock_exit() will simply adjust the counts without allowing
1374          * spin lock using code to interrupt us.
1375          */
1376         if (td == NULL) {
1377                 mtx_lock_spin(&sched_lock);
1378                 spinlock_exit();
1379         } else {
1380                 lock_profile_release_lock(&sched_lock.lock_object);
1381                 MPASS(td->td_lock == &sched_lock);
1382         }
1383         mtx_assert(&sched_lock, MA_OWNED);
1384         KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
1385         PCPU_SET(switchtime, cpu_ticks());
1386         PCPU_SET(switchticks, ticks);
1387         cpu_throw(td, choosethread());  /* doesn't return */
1388 }
1389
1390 void
1391 sched_fork_exit(struct thread *td)
1392 {
1393
1394         /*
1395          * Finish setting up thread glue so that it begins execution in a
1396          * non-nested critical section with sched_lock held but not recursed.
1397          */
1398         td->td_oncpu = PCPU_GET(cpuid);
1399         sched_lock.mtx_lock = (uintptr_t)td;
1400         lock_profile_obtain_lock_success(&sched_lock.lock_object,
1401             0, 0, __FILE__, __LINE__);
1402         THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
1403 }
1404
1405 #define KERN_SWITCH_INCLUDE 1
1406 #include "kern/kern_switch.c"