/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <machine/cpu.h>

/* Uncomment this to enable logging of critical_enter/exit. */
#if 0
#define	KTR_CRITICAL	KTR_SCHED
#else
#define	KTR_CRITICAL	0
#endif

#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);

/*
 * kern.sched.preemption allows user space to determine if preemption support
 * is compiled in or not.  It is not currently a boot or runtime flag that
 * can be changed.
 */
#ifdef PREEMPTION
static int kern_sched_preemption = 1;
#else
static int kern_sched_preemption = 0;
#endif

SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
    &kern_sched_preemption, 0, "Kernel preemption enabled");

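/*
 * Illustrative (user space, hypothetical snippet, not part of this file):
 * the flag can be read with sysctl(8) or sysctlbyname(3), e.g.:
 *
 *	int val;
 *	size_t len = sizeof(val);
 *	if (sysctlbyname("kern.sched.preemption", &val, &len, NULL, 0) == 0)
 *		printf("preemption %sabled\n", val ? "en" : "dis");
 */
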
/*
 * Support for scheduler stats exported via kern.sched.stats.  All stats may
 * be reset with kern.sched.stats.reset = 1.  Stats may be defined elsewhere
 * with SCHED_STAT_DEFINE().
 */
#ifdef SCHED_STATS
SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "switch stats");

/* Switch reasons from mi_switch(9). */
DPCPU_DEFINE(long, sched_switch_stats[SWT_COUNT]);
SCHED_STAT_DEFINE_VAR(owepreempt,
    &DPCPU_NAME(sched_switch_stats[SWT_OWEPREEMPT]), "");
SCHED_STAT_DEFINE_VAR(turnstile,
    &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), "");
SCHED_STAT_DEFINE_VAR(sleepq,
    &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), "");
SCHED_STAT_DEFINE_VAR(relinquish,
    &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), "");
SCHED_STAT_DEFINE_VAR(needresched,
    &DPCPU_NAME(sched_switch_stats[SWT_NEEDRESCHED]), "");
SCHED_STAT_DEFINE_VAR(idle,
    &DPCPU_NAME(sched_switch_stats[SWT_IDLE]), "");
SCHED_STAT_DEFINE_VAR(iwait,
    &DPCPU_NAME(sched_switch_stats[SWT_IWAIT]), "");
SCHED_STAT_DEFINE_VAR(suspend,
    &DPCPU_NAME(sched_switch_stats[SWT_SUSPEND]), "");
SCHED_STAT_DEFINE_VAR(remotepreempt,
    &DPCPU_NAME(sched_switch_stats[SWT_REMOTEPREEMPT]), "");
SCHED_STAT_DEFINE_VAR(remotewakeidle,
    &DPCPU_NAME(sched_switch_stats[SWT_REMOTEWAKEIDLE]), "");
SCHED_STAT_DEFINE_VAR(bind,
    &DPCPU_NAME(sched_switch_stats[SWT_BIND]), "");

static int
sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
{
	struct sysctl_oid *p;
	uintptr_t counter;
	int error, val, i;

	val = 0;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (val == 0)
		return (0);
	/*
	 * Traverse the list of children of _kern_sched_stats and reset each
	 * to 0.  Skip the reset entry.
	 */
	RB_FOREACH(p, sysctl_oid_list, oidp->oid_parent) {
		if (p == oidp || p->oid_arg1 == NULL)
			continue;
		counter = (uintptr_t)p->oid_arg1;
		CPU_FOREACH(i) {
			*(long *)(dpcpu_off[i] + counter) = 0;
		}
	}
	return (0);
}

SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_stats_reset, "I",
    "Reset scheduler statistics");
#endif

/************************************************************************
 * Functions that manipulate runnability from a thread perspective.	*
 ************************************************************************/

/*
 * Select the thread that will be run next.
 */

static __noinline struct thread *
choosethread_panic(struct thread *td)
{
	/*
	 * If we are in panic, only allow system threads,
	 * plus the one we are running in, to be run.
	 */
retry:
	if (((td->td_proc->p_flag & P_SYSTEM) == 0 &&
	    (td->td_flags & TDF_INPANIC) == 0)) {
		/* note that it is no longer on the run queue */
		TD_SET_CAN_RUN(td);
		td = sched_choose();
		goto retry;
	}

	TD_SET_RUNNING(td);
	return (td);
}

struct thread *
choosethread(void)
{
	struct thread *td;

	td = sched_choose();

	if (KERNEL_PANICKED())
		return (choosethread_panic(td));

	TD_SET_RUNNING(td);
	return (td);
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemption is not allowed.
 *
 * It might seem a good idea to inline critical_enter() but, in order
 * to prevent instruction reordering by the compiler, a __compiler_membar()
 * would have to be used here (the same as sched_pin()).  The performance
 * penalty imposed by the membar could then produce slower code than
 * the function call itself, for most cases.
 */

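/*
 * Illustrative usage (hypothetical caller, not from this file): a critical
 * section brackets a short region that must not be preempted, e.g. an
 * update of per-CPU state:
 *
 *	critical_enter();
 *	...modify per-CPU data...
 *	critical_exit();
 *
 * Sections nest: td_critnest counts the depth, and a deferred preemption
 * (td_owepreempt) is acted upon only by the outermost critical_exit().
 */
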
void
critical_enter_KBI(void)
{
#ifdef KTR
	struct thread *td = curthread;
#endif
	critical_enter();
	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}

void __noinline
critical_exit_preempt(void)
{
	struct thread *td;
	int flags;

	/*
	 * If td_critnest is 0, it is possible that we are going to get
	 * preempted again before reaching the code below.  This happens
	 * rarely and is harmless.  However, this means td_owepreempt may
	 * now be stale.
	 */
	td = curthread;
	if (td->td_critnest != 0)
		return;
	if (kdb_active)
		return;

	/*
	 * Microoptimization: we committed to switch,
	 * disable preemption in interrupt handlers
	 * while spinning for the thread lock.
	 */
	td->td_critnest = 1;
	thread_lock(td);
	td->td_critnest--;
	flags = SW_INVOL | SW_PREEMPT;
	if (TD_IS_IDLETHREAD(td))
		flags |= SWT_IDLE;
	else
		flags |= SWT_OWEPREEMPT;
	mi_switch(flags);
}

void
critical_exit_KBI(void)
{
#ifdef KTR
	struct thread *td = curthread;
#endif
	critical_exit();
	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests				*
 ************************************************************************/

/*
 * Initialize a run structure.
 */
void
runq_init(struct runq *rq)
{
	int i;

	bzero(rq, sizeof *rq);
	for (i = 0; i < RQ_NQS; i++)
		TAILQ_INIT(&rq->rq_queues[i]);
}

/*
 * Clear the status bit of the queue corresponding to priority level pri,
 * indicating that it is empty.
 */
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}

/*
 * Find the index of the first non-empty run queue.  This is done by
 * scanning the status bits; a set bit indicates a non-empty queue.
 */
static __inline int
runq_findbit(struct runq *rq)
{
	struct rqbits *rqb;
	int pri;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
			    rqb->rqb_bits[i], i, pri);
			return (pri);
		}

	return (-1);
}

static __inline int
runq_findbit_from(struct runq *rq, u_char pri)
{
	struct rqbits *rqb;
	rqb_word_t mask;
	int i;

	/*
	 * Set the mask for the first word so we ignore priorities
	 * before 'pri'.
	 */
	mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1));
	rqb = &rq->rq_status;
again:
	for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) {
		mask = rqb->rqb_bits[i] & mask;
		if (mask == 0)
			continue;
		pri = RQB_FFS(mask) + (i << RQB_L2BPW);
		CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
		    mask, i, pri);
		return (pri);
	}
	if (pri == 0)
		return (-1);
	/*
	 * Wrap back around to the beginning of the list just once so we
	 * scan the whole thing.
	 */
	pri = 0;
	goto again;
}

/*
 * Set the status bit of the queue corresponding to priority level pri,
 * indicating that it is non-empty.
 */
static __inline void
runq_setbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}

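/*
 * Worked example (assuming 32-bit status words, i.e. RQB_BPW == 32 and
 * RQB_L2BPW == 5, so RQ_NQS == 64 gives RQB_LEN == 2): for pri == 37,
 * RQB_WORD(37) == 37 >> 5 == 1 and RQB_BIT(37) == 1 << (37 & 31) == 0x20,
 * so runq_setbit() sets bit 5 of word 1 and runq_clrbit() clears it.
 */
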
/*
 * Add the thread to the queue specified by its priority, and set the
 * corresponding status bit.
 */
void
runq_add(struct runq *rq, struct thread *td, int flags)
{
	struct rqhead *rqh;
	int pri;

	pri = td->td_priority / RQ_PPQ;
	td->td_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_add: td=%p pri=%d %d rqh=%p",
	    td, td->td_priority, pri, rqh);
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, td, td_runq);
	} else {
		TAILQ_INSERT_TAIL(rqh, td, td_runq);
	}
}

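/*
 * Worked example (with the stock runq parameters RQ_NQS == 64 and
 * RQ_PPQ == 4): a thread at td_priority 100 maps to queue 100 / 4 == 25,
 * and priorities 100 through 103 all share that queue, so the status
 * bitmap tracks 64 queues rather than 256 individual priorities.
 */
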
void
runq_add_pri(struct runq *rq, struct thread *td, u_char pri, int flags)
{
	struct rqhead *rqh;

	KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
	td->td_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_add_pri: td=%p pri=%d idx=%d rqh=%p",
	    td, td->td_priority, pri, rqh);
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, td, td_runq);
	} else {
		TAILQ_INSERT_TAIL(rqh, td, td_runq);
	}
}

/*
 * Return true if there are runnable processes of any priority on the run
 * queue, false otherwise.  Has no side effects, does not modify the run
 * queue structure.
 */
int
runq_check(struct runq *rq)
{
	struct rqbits *rqb;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
			    rqb->rqb_bits[i], i);
			return (1);
		}
	CTR0(KTR_RUNQ, "runq_check: empty");

	return (0);
}

/*
 * Find the highest priority process on the run queue.
 */
struct thread *
runq_choose_fuzz(struct runq *rq, int fuzz)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
		/* fuzz == 1 is normal; 0 or less are ignored */
		if (fuzz > 1) {
			/*
			 * In the first couple of entries, check if
			 * there is one for our CPU as a preference.
			 */
			int count = fuzz;
			int cpu = PCPU_GET(cpuid);
			struct thread *td2;

			td2 = td = TAILQ_FIRST(rqh);
			while (count-- && td2) {
				if (td2->td_lastcpu == cpu) {
					td = td2;
					break;
				}
				td2 = TAILQ_NEXT(td2, td_runq);
			}
		} else
			td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL, ("runq_choose_fuzz: no proc on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose_fuzz: pri=%d thread=%p rqh=%p", pri, td, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose_fuzz: idleproc pri=%d", pri);

	return (NULL);
}

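/*
 * Example of the fuzz window (hypothetical queue contents): with fuzz == 3
 * and a queue holding threads A, B, C and D, only A, B and C are
 * inspected; the first of those whose td_lastcpu matches the current CPU
 * is preferred, and if none matches, the head of the queue (A) is chosen
 * as usual.
 */
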
/*
 * Find the highest priority process on the run queue.
 */
struct thread *
runq_choose(struct runq *rq)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
		td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose: pri=%d thread=%p rqh=%p", pri, td, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose: idlethread pri=%d", pri);

	return (NULL);
}

struct thread *
runq_choose_from(struct runq *rq, u_char idx)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	if ((pri = runq_findbit_from(rq, idx)) != -1) {
		rqh = &rq->rq_queues[pri];
		td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL,
		    ("runq_choose_from: no thread on busy queue"));
		CTR4(KTR_RUNQ,
		    "runq_choose_from: pri=%d thread=%p idx=%d rqh=%p",
		    pri, td, td->td_rqindex, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose_from: idlethread pri=%d", pri);

	return (NULL);
}

/*
 * Remove the thread from the queue specified by its priority, and clear the
 * corresponding status bit if the queue becomes empty.
 * Caller must set state afterwards.
 */
void
runq_remove(struct runq *rq, struct thread *td)
{

	runq_remove_idx(rq, td, NULL);
}

void
runq_remove_idx(struct runq *rq, struct thread *td, u_char *idx)
{
	struct rqhead *rqh;
	u_char pri;

	KASSERT(td->td_flags & TDF_INMEM,
	    ("runq_remove_idx: thread swapped out"));
	pri = td->td_rqindex;
	KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri));
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_remove_idx: td=%p, pri=%d %d rqh=%p",
	    td, td->td_priority, pri, rqh);
	TAILQ_REMOVE(rqh, td, td_runq);
	if (TAILQ_EMPTY(rqh)) {
		CTR0(KTR_RUNQ, "runq_remove_idx: empty");
		runq_clrbit(rq, pri);
		if (idx != NULL && *idx == pri)
			*idx = (pri + 1) % RQ_NQS;
	}
}

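/*
 * Example of the idx update (for a circular consumer such as ULE's
 * timeshare queue): if the caller's saved scan index pointed at the queue
 * that just drained, it is advanced to (pri + 1) % RQ_NQS so a later
 * runq_findbit_from() resumes after the now-empty slot rather than
 * rescanning it first.
 */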