/*-
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_sched.h"

#ifndef KERN_SWITCH_INCLUDE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#else  /* KERN_SWITCH_INCLUDE */
#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
#include <sys/smp.h>
#endif
#if defined(SMP) && defined(SCHED_4BSD)
#include <sys/sysctl.h>
#endif

/* Uncomment this to enable logging of critical_enter/exit. */
#if 0
#define KTR_CRITICAL KTR_SCHED
#else
#define KTR_CRITICAL 0
#endif

#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
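
/*
 * Note (illustrative, not from the original file): the run queue status is a
 * bit array with one bit per queue, so RQB_LEN words of RQB_BPW bits each
 * must cover exactly RQ_NQS queues.  With the stock <sys/runq.h> values
 * assumed here (RQ_NQS of 64 and 32-bit status words), that means two words.
 */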

/*
 * kern.sched.preemption allows user space to determine whether preemption
 * support is compiled in.  It is not currently a boot or runtime flag that
 * can be changed.
 */
#ifdef PREEMPTION
static int kern_sched_preemption = 1;
#else
static int kern_sched_preemption = 0;
#endif
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
    &kern_sched_preemption, 0, "Kernel preemption enabled");
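
/*
 * Usage sketch (not part of the original source): from user space,
 * "sysctl kern.sched.preemption" reports 1 on kernels built with
 * options PREEMPTION and 0 otherwise; being CTLFLAG_RD it cannot be set.
 */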

/************************************************************************
 * Functions that manipulate runnability from a thread perspective.    *
 ************************************************************************/
/*
 * Select the thread that will be run next.
 */
struct thread *
choosethread(void)
{
        struct td_sched *ts;
        struct thread *td;

#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
        if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
                /* Shutting down, run idlethread on APs. */
                td = PCPU_GET(idlethread);
                CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
                TD_SET_RUNNING(td);
                return (td);
        }
#endif

retry:
        ts = sched_choose();
        if (ts != NULL) {
                td = ts->ts_thread;
        } else {
                /* Simulate runq_choose() having returned the idle thread. */
                td = PCPU_GET(idlethread);
                CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
        }

        /*
         * If we are in panic, only allow system threads,
         * plus the one we are running in, to be run.
         */
        if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
            (td->td_flags & TDF_INPANIC) == 0)) {
                /* Note that it is no longer on the run queue. */
                TD_SET_CAN_RUN(td);
                goto retry;
        }

        TD_SET_RUNNING(td);
        return (td);
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemptions are not allowed.
 */
void
critical_enter(void)
{
        struct thread *td;

        td = curthread;
        td->td_critnest++;
        CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
            (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
}

void
critical_exit(void)
{
        struct thread *td;

        td = curthread;
        KASSERT(td->td_critnest != 0,
            ("critical_exit: td_critnest == 0"));
#ifdef PREEMPTION
        if (td->td_critnest == 1) {
                td->td_critnest = 0;
                mtx_assert(&sched_lock, MA_NOTOWNED);
                if (td->td_owepreempt) {
                        td->td_critnest = 1;
                        mtx_lock_spin(&sched_lock);
                        td->td_critnest--;
                        mi_switch(SW_INVOL, NULL);
                        mtx_unlock_spin(&sched_lock);
                }
        } else
#endif
                td->td_critnest--;

        CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
            (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
}
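
/*
 * Usage sketch (illustrative, not from the original file): code that must not
 * be preempted on the local CPU brackets the region as
 *
 *      critical_enter();
 *      ... touch per-CPU state that must not be interrupted by a switch ...
 *      critical_exit();
 *
 * Sections nest via td_critnest; only the outermost critical_exit() performs
 * the deferred mi_switch() if a preemption was owed (td_owepreempt).
 */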

/*
 * This function is called when a thread is about to be put on a run queue
 * because it has been made runnable or had its priority adjusted.  It
 * determines whether the current thread should immediately be preempted in
 * favor of the new thread.  If so, it switches to it and eventually returns
 * true.  If not, it returns false so that the caller may place the thread
 * on an appropriate run queue.
 */
int
maybe_preempt(struct thread *td)
{
        struct thread *ctd;
        int cpri, pri;

        mtx_assert(&sched_lock, MA_OWNED);
        /*
         * The new thread should not preempt the current thread if any of the
         * following conditions are true:
         *
         *  - The kernel is in the throes of crashing (panicstr).
         *  - The current thread has a higher (numerically lower) or
         *    equivalent priority.  Note that this prevents curthread from
         *    trying to preempt to itself.
         *  - It is too early in the boot for context switches (cold is set).
         *  - The current thread has an inhibitor set or is in the process of
         *    exiting.  In this case, the current thread is about to switch
         *    out anyways, so there's no point in preempting.  If we did,
         *    the current thread would not be properly resumed as well, so
         *    just avoid that whole landmine.
         *  - The new thread's priority is not a realtime priority and
         *    the current thread's priority is not an idle priority and
         *    FULL_PREEMPTION is disabled.
         *
         * If all of these conditions are false, but the current thread is in
         * a nested critical section, then we have to defer the preemption
         * until we exit the critical section.  Otherwise, switch immediately
         * to the new thread.
         */
        ctd = curthread;
        KASSERT((ctd->td_sched != NULL && ctd->td_sched->ts_thread == ctd),
            ("thread has no (or wrong) sched-private part."));
        KASSERT((td->td_inhibitors == 0),
            ("maybe_preempt: trying to run inhibited thread"));
        pri = td->td_priority;
        cpri = ctd->td_priority;
        if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
            TD_IS_INHIBITED(ctd))
                return (0);
#ifndef FULL_PREEMPTION
        if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
                return (0);
#endif
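
        /*
         * Worked example (illustrative, not from the original source; numeric
         * values assume the standard <sys/priority.h> layout where lower
         * numbers mean higher priority): an interrupt thread being made
         * runnable gets past the check above because its priority is
         * <= PRI_MAX_ITHD, so it preempts a running timeshare thread.  A
         * timeshare thread woken at pri 130 while another runs at cpri 140
         * does not preempt it unless FULL_PREEMPTION is configured, since
         * pri > PRI_MAX_ITHD and cpri < PRI_MIN_IDLE.
         */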

        if (ctd->td_critnest > 1) {
                CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
                    ctd->td_critnest);
                ctd->td_owepreempt = 1;
                return (0);
        }

        /*
         * Thread is runnable but not yet put on the system run queue.
         */
        MPASS(TD_ON_RUNQ(td));
        TD_SET_RUNNING(td);
        CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
            td->td_proc->p_pid, td->td_proc->p_comm);
        mi_switch(SW_INVOL|SW_PREEMPT, td);
        return (1);
}

#ifndef PREEMPTION
/* XXX: There should be a non-static version of this. */
static void
printf_caddr_t(void *data)
{

        printf("%s", (char *)data);
}
static char preempt_warning[] =
    "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
    preempt_warning);
#endif

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests                             *
 ************************************************************************/
/*
 * Initialize a run structure.
 */
void
runq_init(struct runq *rq)
{
        int i;

        bzero(rq, sizeof *rq);
        for (i = 0; i < RQ_NQS; i++)
                TAILQ_INIT(&rq->rq_queues[i]);
}

/*
 * Clear the status bit of the queue corresponding to priority level pri,
 * indicating that it is empty.
 */
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
        struct rqbits *rqb;

        rqb = &rq->rq_status;
        CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
            rqb->rqb_bits[RQB_WORD(pri)],
            rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
            RQB_BIT(pri), RQB_WORD(pri));
        rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}
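
/*
 * Illustration (not from the original file): RQB_WORD() and RQB_BIT() map a
 * queue index onto the status bit array.  Assuming 32-bit status words
 * (RQB_BPW of 32, taken from the usual <sys/runq.h> definitions), queue 37
 * lives in word 37 >> 5 = 1 as bit 37 & 31 = 5, i.e. mask 0x20.
 */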

/*
 * Find the index of the first non-empty run queue.  This is done by
 * scanning the status bits; a set bit indicates a non-empty queue.
 */
static __inline int
runq_findbit(struct runq *rq)
{
        struct rqbits *rqb;
        int pri;
        int i;

        rqb = &rq->rq_status;
        for (i = 0; i < RQB_LEN; i++)
                if (rqb->rqb_bits[i]) {
                        pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
                        CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
                            rqb->rqb_bits[i], i, pri);
                        return (pri);
                }

        return (-1);
}

static __inline int
runq_findbit_from(struct runq *rq, u_char start)
{
        struct rqbits *rqb;
        int bit;
        int pri;
        int i;

        rqb = &rq->rq_status;
        bit = start & (RQB_BPW - 1);
        CTR1(KTR_RUNQ, "runq_findbit_from: start %d", start);
again:
        for (i = RQB_WORD(start); i < RQB_LEN; i++) {
                CTR3(KTR_RUNQ, "runq_findbit_from: bits %d = %#x bit = %d",
                    i, rqb->rqb_bits[i], bit);
                if (rqb->rqb_bits[i]) {
                        if (bit != 0) {
                                for (pri = bit; pri < RQB_BPW; pri++)
                                        if (rqb->rqb_bits[i] & (1ul << pri))
                                                break;
                                bit = 0;
                                if (pri >= RQB_BPW)
                                        continue;
                        } else
                                pri = RQB_FFS(rqb->rqb_bits[i]);
                        pri += (i << RQB_L2BPW);
                        CTR3(KTR_RUNQ,
                            "runq_findbit_from: bits=%#x i=%d pri=%d",
                            rqb->rqb_bits[i], i, pri);
                        return (pri);
                }
                bit = 0;
        }
        if (start != 0) {
                CTR0(KTR_RUNQ, "runq_findbit_from: restarting");
                start = 0;
                goto again;
        }

        return (-1);
}
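
/*
 * Note (not from the original source): unlike runq_findbit(), this variant
 * starts its scan at a caller-supplied queue index and wraps around to the
 * beginning, which lets a scheduler service the timeshare queues in a
 * circular, round-robin fashion rather than always favoring the lowest index.
 */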

/*
 * Set the status bit of the queue corresponding to priority level pri,
 * indicating that it is non-empty.
 */
static __inline void
runq_setbit(struct runq *rq, int pri)
{
        struct rqbits *rqb;

        rqb = &rq->rq_status;
        CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
            rqb->rqb_bits[RQB_WORD(pri)],
            rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
            RQB_BIT(pri), RQB_WORD(pri));
        rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}

/*
 * Add the thread to the queue specified by its priority, and set the
 * corresponding status bit.
 */
void
runq_add(struct runq *rq, struct td_sched *ts, int flags)
{
        struct rqhead *rqh;
        int pri;

        pri = ts->ts_thread->td_priority / RQ_PPQ;
        ts->ts_rqindex = pri;
        runq_setbit(rq, pri);
        rqh = &rq->rq_queues[pri];
        CTR5(KTR_RUNQ, "runq_add: td=%p ts=%p pri=%d %d rqh=%p",
            ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
        if (flags & SRQ_PREEMPTED) {
                TAILQ_INSERT_HEAD(rqh, ts, ts_procq);
        } else {
                TAILQ_INSERT_TAIL(rqh, ts, ts_procq);
        }
}
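
/*
 * Illustration (not from the original source): td_priority / RQ_PPQ collapses
 * several adjacent priorities onto one queue.  With the stock RQ_PPQ of 4
 * assumed here (see <sys/runq.h>), priorities 0-3 share queue 0 and a thread
 * at priority 120 lands on queue 30.  SRQ_PREEMPTED callers go to the head of
 * the queue so a preempted thread resumes before threads queued behind it.
 */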

void
runq_add_pri(struct runq *rq, struct td_sched *ts, u_char pri, int flags)
{
        struct rqhead *rqh;

        KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
        ts->ts_rqindex = pri;
        runq_setbit(rq, pri);
        rqh = &rq->rq_queues[pri];
        CTR5(KTR_RUNQ, "runq_add_pri: td=%p ts=%p pri=%d idx=%d rqh=%p",
            ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
        if (flags & SRQ_PREEMPTED) {
                TAILQ_INSERT_HEAD(rqh, ts, ts_procq);
        } else {
                TAILQ_INSERT_TAIL(rqh, ts, ts_procq);
        }
}
/*
 * Return true if there are runnable processes of any priority on the run
 * queue, false otherwise.  Has no side effects; does not modify the run
 * queue structure.
 */
int
runq_check(struct runq *rq)
{
        struct rqbits *rqb;
        int i;

        rqb = &rq->rq_status;
        for (i = 0; i < RQB_LEN; i++)
                if (rqb->rqb_bits[i]) {
                        CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
                            rqb->rqb_bits[i], i);
                        return (1);
                }
        CTR0(KTR_RUNQ, "runq_check: empty");

        return (0);
}

#if defined(SMP) && defined(SCHED_4BSD)
int runq_fuzz = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
#endif

/*
 * Find the highest priority process on the run queue.
 */
struct td_sched *
runq_choose(struct runq *rq)
{
        struct rqhead *rqh;
        struct td_sched *ts;
        int pri;

        mtx_assert(&sched_lock, MA_OWNED);
        while ((pri = runq_findbit(rq)) != -1) {
                rqh = &rq->rq_queues[pri];
#if defined(SMP) && defined(SCHED_4BSD)
                /* fuzz == 1 is normal; 0 or less is ignored. */
                if (runq_fuzz > 1) {
                        /*
                         * In the first couple of entries, check if
                         * there is one for our CPU as a preference.
                         */
                        int count = runq_fuzz;
                        int cpu = PCPU_GET(cpuid);
                        struct td_sched *ts2;
                        ts2 = ts = TAILQ_FIRST(rqh);

                        while (count-- && ts2) {
                                if (ts2->ts_thread->td_lastcpu == cpu) {
                                        ts = ts2;
                                        break;
                                }
                                ts2 = TAILQ_NEXT(ts2, ts_procq);
                        }
                } else
#endif
                        ts = TAILQ_FIRST(rqh);
                KASSERT(ts != NULL, ("runq_choose: no proc on busy queue"));
                CTR3(KTR_RUNQ,
                    "runq_choose: pri=%d td_sched=%p rqh=%p", pri, ts, rqh);
                return (ts);
        }
        CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);

        return (NULL);
}
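
/*
 * Usage note (not from the original source): runq_fuzz is a read/write
 * sysctl, e.g. "sysctl kern.sched.runq_fuzz=4" asks the 4BSD scheduler to
 * look at up to the first four entries of the highest-priority queue and
 * prefer one that last ran on the current CPU, trading strict FIFO order
 * for cache affinity.
 */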

struct td_sched *
runq_choose_from(struct runq *rq, u_char idx)
{
        struct rqhead *rqh;
        struct td_sched *ts;
        int pri;

        mtx_assert(&sched_lock, MA_OWNED);
        if ((pri = runq_findbit_from(rq, idx)) != -1) {
                rqh = &rq->rq_queues[pri];
                ts = TAILQ_FIRST(rqh);
                KASSERT(ts != NULL, ("runq_choose_from: no proc on busy queue"));
                CTR4(KTR_RUNQ,
                    "runq_choose_from: pri=%d ts=%p idx=%d rqh=%p",
                    pri, ts, ts->ts_rqindex, rqh);
                return (ts);
        }
        CTR1(KTR_RUNQ, "runq_choose_from: idleproc pri=%d", pri);

        return (NULL);
}

/*
 * Remove the thread from the queue specified by its priority, and clear the
 * corresponding status bit if the queue becomes empty.
 * The caller must set the thread's state afterwards.
 */
void
runq_remove(struct runq *rq, struct td_sched *ts)
{

        runq_remove_idx(rq, ts, NULL);
}

void
runq_remove_idx(struct runq *rq, struct td_sched *ts, u_char *idx)
{
        struct rqhead *rqh;
        u_char pri;

        KASSERT(ts->ts_thread->td_proc->p_sflag & PS_INMEM,
            ("runq_remove_idx: process swapped out"));
        pri = ts->ts_rqindex;
        rqh = &rq->rq_queues[pri];
        CTR5(KTR_RUNQ, "runq_remove_idx: td=%p, ts=%p pri=%d %d rqh=%p",
            ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
        TAILQ_REMOVE(rqh, ts, ts_procq);
        if (TAILQ_EMPTY(rqh)) {
                CTR0(KTR_RUNQ, "runq_remove_idx: empty");
                runq_clrbit(rq, pri);
                if (idx != NULL && *idx == pri)
                        *idx = (pri + 1) % RQ_NQS;
        }
}
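
/*
 * Note (not from the original source): the *idx adjustment keeps a caller's
 * circular scan position (as used with runq_findbit_from()) from pointing at
 * the queue that just became empty; it is bumped to the next index modulo
 * RQ_NQS so the next lookup resumes directly after the drained queue.
 */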

/****** functions that are temporarily here ***********/
extern struct mtx kse_zombie_lock;

/*
 * Allocate scheduler-specific per-process resources.
 * The thread and proc have already been linked in.
 *
 * Called from:
 *  proc_init() (UMA init method)
 */
void
sched_newproc(struct proc *p, struct thread *td)
{
}

/*
 * A thread is being either created or recycled.
 * Fix up the per-scheduler resources associated with it.
 *
 * Called from:
 *  sched_fork_thread()
 *  thread_dtor()  (*may go away)
 *  thread_init()  (*may go away)
 */
void
sched_newthread(struct thread *td)
{
        struct td_sched *ts;

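        /*
         * Layout note (illustrative): the td_sched is assumed to be
         * co-allocated immediately after the struct thread in the same
         * allocation, which is why it is located with pointer arithmetic
         * below rather than by a separate allocation.
         */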
        ts = (struct td_sched *)(td + 1);
        bzero(ts, sizeof(*ts));
        td->td_sched = ts;
        ts->ts_thread = td;
}

/*
 * Called from:
 *  proc_init() (UMA) via sched_newproc()
 */
void
sched_init_concurrency(struct proc *p)
{
}

/*
 * Change the concurrency of an existing proc to N.
 */
void
sched_set_concurrency(struct proc *p, int concurrency)
{
}

/*
 * Called from thread_exit() for all exiting threads.
 *
 * Not to be confused with sched_exit_thread(), which is only called from
 * thread_exit() for threads exiting without the rest of the process, because
 * that function is also called from sched_exit() and we wouldn't want to
 * call it twice.
 * XXX This can probably be fixed.
 */
void
sched_thread_exit(struct thread *td)
{
}

#endif /* KERN_SWITCH_INCLUDE */