sys/kern/kern_switch.c

   1 /*-
   2  * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26
  27 /***
  28 Here is the logic..
  29
  30 If there are N processors, then there are at most N KSEs (kernel
  31 schedulable entities) working to process threads that belong to a
  32 KSEGROUP (kg). If there are X of these KSEs actually running at the
  33 moment in question, then there are at most M (N-X) of these KSEs on
  34 the run queue, as running KSEs are not on the queue.
  35
  36 Runnable threads are queued off the KSEGROUP in priority order.
  37 If there are M or more threads runnable, the top M threads
  38 (by priority) are 'preassigned' to the M KSEs not running. The KSEs take
  39 their priority from those threads and are put on the run queue.
  40
  41 The last thread that had a priority high enough to have a KSE associated
  42 with it, AND IS ON THE RUN QUEUE is pointed to by
  43 kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
  44 assigned as all the available KSEs are activly running, or because there
  45 are no threads queued, that pointer is NULL.
  46
  47 When a KSE is removed from the run queue to become runnable, we know
  48 it was associated with the highest priority thread in the queue (at the head
  49 of the queue). If it is also the last assigned we know M was 1 and must
  50 now be 0. Since the thread is no longer queued that pointer must be
  51 removed from it. Since we know there were no more KSEs available,
  52 (M was 1 and is now 0) and since we are not FREEING our KSE
  53 but using it, we know there are STILL no more KSEs available, we can prove
  54 that the next thread in the ksegrp list will not have a KSE to assign to
  55 it, so we can show that the pointer must be made 'invalid' (NULL).
  56
  57 The pointer exists so that when a new thread is made runnable, it can
  58 have its priority compared with the last assigned thread to see if
  59 it should 'steal' its KSE or not.. i.e. is it 'earlier'
  60 on the list than that thread or later.. If it's earlier, then the KSE is
  61 removed from the last assigned (which is now not assigned a KSE)
  62 and reassigned to the new thread, which is placed earlier in the list.
  63 The pointer is then backed up to the previous thread (which may or may not
  64 be the new thread).
  65
  66 When a thread sleeps or is removed, the KSE becomes available and if there
  67 are queued threads that are not assigned KSEs, the highest priority one of
  68 them is assigned the KSE, which is then placed back on the run queue at
  69 the approipriate place, and the kg->kg_last_assigned pointer is adjusted down
  70 to point to it.
  71
  72 The following diagram shows 2 KSEs and 3 threads from a single process.
  73
  74  RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
  75               \    \____
  76                \        \
  77     KSEGROUP---thread--thread--thread    (queued in priority order)
  78         \                 /
  79          \_______________/
  80           (last_assigned)
  81
  82 The result of this scheme is that the M available KSEs are always
  83 queued at the priorities they have inherrited from the M highest priority
  84 threads for that KSEGROUP. If this situation changes, the KSEs are
  85 reassigned to keep this true.
  86 ***/
  87
  88 #include <sys/cdefs.h>
  89 __FBSDID("$FreeBSD$");
  90
  91 #include "opt_sched.h"
  92
  93 #ifndef KERN_SWITCH_INCLUDE
  94 #include <sys/param.h>
  95 #include <sys/systm.h>
  96 #include <sys/kdb.h>
  97 #include <sys/kernel.h>
  98 #include <sys/ktr.h>
  99 #include <sys/lock.h>
 100 #include <sys/mutex.h>
 101 #include <sys/proc.h>
 102 #include <sys/queue.h>
 103 #include <sys/sched.h>
 104 #else  /* KERN_SWITCH_INCLUDE */
 105 #if defined(SMP) && (defined(__i386__) || defined(__amd64__))
 106 #include <sys/smp.h>
 107 #endif
 108 #if defined(SMP) && defined(SCHED_4BSD)
 109 #include <sys/sysctl.h>
 110 #endif
 111
 112 /* Uncomment this to enable logging of critical_enter/exit. */
 113 #if 0
 114 #define KTR_CRITICAL    KTR_SCHED
 115 #else
 116 #define KTR_CRITICAL    0
 117 #endif
 118
 119 #ifdef FULL_PREEMPTION
 120 #ifndef PREEMPTION
 121 #error "The FULL_PREEMPTION option requires the PREEMPTION option"
 122 #endif
 123 #endif
 124
 125 CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
 126
 127 #define td_kse td_sched
 128
 129 /*
 130  * kern.sched.preemption allows user space to determine if preemption support
 131  * is compiled in or not.  It is not currently a boot or runtime flag that
 132  * can be changed.
 133  */
 134 #ifdef PREEMPTION
 135 static int kern_sched_preemption = 1;
 136 #else
 137 static int kern_sched_preemption = 0;
 138 #endif
 139 SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
 140     &kern_sched_preemption, 0, "Kernel preemption enabled");
 141
 142 /************************************************************************
 143  * Functions that manipulate runnability from a thread perspective.     *
 144  ************************************************************************/
 145 /*
 146  * Select the KSE that will be run next.  From that find the thread, and
 147  * remove it from the KSEGRP's run queue.  If there is thread clustering,
 148  * this will be what does it.
 149  */
 150 struct thread *
 151 choosethread(void)
 152 {
 153         struct kse *ke;
 154         struct thread *td;
 155         struct ksegrp *kg;
 156
 157 #if defined(SMP) && (defined(__i386__) || defined(__amd64__))
 158         if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
 159                 /* Shutting down, run idlethread on AP's */
 160                 td = PCPU_GET(idlethread);
 161                 ke = td->td_kse;
 162                 CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
 163                 ke->ke_flags |= KEF_DIDRUN;
 164                 TD_SET_RUNNING(td);
 165                 return (td);
 166         }
 167 #endif
 168
 169 retry:
 170         ke = sched_choose();
 171         if (ke) {
 172                 td = ke->ke_thread;
 173                 KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
 174                 kg = ke->ke_ksegrp;
 175                 if (td->td_proc->p_flag & P_HADTHREADS) {
 176                         if (kg->kg_last_assigned == td) {
 177                                 kg->kg_last_assigned = TAILQ_PREV(td,
 178                                     threadqueue, td_runq);
 179                         }
 180                         TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
 181                 }
 182                 CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
 183                     td, td->td_priority);
 184         } else {
 185                 /* Simulate runq_choose() having returned the idle thread */
 186                 td = PCPU_GET(idlethread);
 187                 ke = td->td_kse;
 188                 CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
 189         }
 190         ke->ke_flags |= KEF_DIDRUN;
 191
 192         /*
 193          * If we are in panic, only allow system threads,
 194          * plus the one we are running in, to be run.
 195          */
 196         if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
 197             (td->td_flags & TDF_INPANIC) == 0)) {
 198                 /* note that it is no longer on the run queue */
 199                 TD_SET_CAN_RUN(td);
 200                 goto retry;
 201         }
 202
 203         TD_SET_RUNNING(td);
 204         return (td);
 205 }
 206
 207 /*
 208  * Given a surplus system slot, try assign a new runnable thread to it.
 209  * Called from:
 210  *  sched_thread_exit()  (local)
 211  *  sched_switch()  (local)
 212  *  sched_thread_exit()  (local)
 213  *  remrunqueue()  (local)  (not at the moment)
 214  */
 215 static void
 216 slot_fill(struct ksegrp *kg)
 217 {
 218         struct thread *td;
 219
 220         mtx_assert(&sched_lock, MA_OWNED);
 221         while (kg->kg_avail_opennings > 0) {
 222                 /*
 223                  * Find the first unassigned thread
 224                  */
 225                 if ((td = kg->kg_last_assigned) != NULL)
 226                         td = TAILQ_NEXT(td, td_runq);
 227                 else
 228                         td = TAILQ_FIRST(&kg->kg_runq);
 229
 230                 /*
 231                  * If we found one, send it to the system scheduler.
 232                  */
 233                 if (td) {
 234                         kg->kg_last_assigned = td;
 235                         sched_add(td, SRQ_YIELDING);
 236                         CTR2(KTR_RUNQ, "slot_fill: td%p -> kg%p", td, kg);
 237                 } else {
 238                         /* no threads to use up the slots. quit now */
 239                         break;
 240                 }
 241         }
 242 }
 243
 244 #ifdef  SCHED_4BSD
 245 /*
 246  * Remove a thread from its KSEGRP's run queue.
 247  * This in turn may remove it from a KSE if it was already assigned
 248  * to one, possibly causing a new thread to be assigned to the KSE
 249  * and the KSE getting a new priority.
 250  */
 251 static void
 252 remrunqueue(struct thread *td)
 253 {
 254         struct thread *td2, *td3;
 255         struct ksegrp *kg;
 256         struct kse *ke;
 257
 258         mtx_assert(&sched_lock, MA_OWNED);
 259         KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
 260         kg = td->td_ksegrp;
 261         ke = td->td_kse;
 262         CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
 263         TD_SET_CAN_RUN(td);
 264         /*
 265          * If it is not a threaded process, take the shortcut.
 266          */
 267         if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
 268                 /* remve from sys run queue and free up a slot */
 269                 sched_rem(td);
 270                 ke->ke_state = KES_THREAD;
 271                 return;
 272         }
 273         td3 = TAILQ_PREV(td, threadqueue, td_runq);
 274         TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
 275         if (ke->ke_state == KES_ONRUNQ) {
 276                 /*
 277                  * This thread has been assigned to the system run queue.
 278                  * We need to dissociate it and try assign the
 279                  * KSE to the next available thread. Then, we should
 280                  * see if we need to move the KSE in the run queues.
 281                  */
 282                 sched_rem(td);
 283                 ke->ke_state = KES_THREAD;
 284                 td2 = kg->kg_last_assigned;
 285                 KASSERT((td2 != NULL), ("last assigned has wrong value"));
 286                 if (td2 == td)
 287                         kg->kg_last_assigned = td3;
 288                 /* slot_fill(kg); */ /* will replace it with another */
 289         }
 290 }
 291 #endif
 292
 293 /*
 294  * Change the priority of a thread that is on the run queue.
 295  */
 296 void
 297 adjustrunqueue( struct thread *td, int newpri)
 298 {
 299         struct ksegrp *kg;
 300         struct kse *ke;
 301
 302         mtx_assert(&sched_lock, MA_OWNED);
 303         KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));
 304
 305         ke = td->td_kse;
 306         CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
 307         /*
 308          * If it is not a threaded process, take the shortcut.
 309          */
 310         if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
 311                 /* We only care about the kse in the run queue. */
 312                 td->td_priority = newpri;
 313                 if (ke->ke_rqindex != (newpri / RQ_PPQ)) {
 314                         sched_rem(td);
 315                         sched_add(td, SRQ_BORING);
 316                 }
 317                 return;
 318         }
 319
 320         /* It is a threaded process */
 321         kg = td->td_ksegrp;
 322         if (ke->ke_state == KES_ONRUNQ
 323 #ifdef SCHED_ULE
 324          || ((ke->ke_flags & KEF_ASSIGNED) != 0 &&
 325              (ke->ke_flags & KEF_REMOVED) == 0)
 326 #endif
 327            ) {
 328                 if (kg->kg_last_assigned == td) {
 329                         kg->kg_last_assigned =
 330                             TAILQ_PREV(td, threadqueue, td_runq);
 331                 }
 332                 sched_rem(td);
 333         }
 334         TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
 335         TD_SET_CAN_RUN(td);
 336         td->td_priority = newpri;
 337         setrunqueue(td, SRQ_BORING);
 338 }
 339
 340 /*
 341  * This function is called when a thread is about to be put on a
 342  * ksegrp run queue because it has been made runnable or its
 343  * priority has been adjusted and the ksegrp does not have a
 344  * free kse slot.  It determines if a thread from the same ksegrp
 345  * should be preempted.  If so, it tries to switch threads
 346  * if the thread is on the same cpu or notifies another cpu that
 347  * it should switch threads.
 348  */
 349
 350 static void
 351 maybe_preempt_in_ksegrp(struct thread *td)
 352 #if  !defined(SMP)
 353 {
 354         struct thread *running_thread;
 355
 356         mtx_assert(&sched_lock, MA_OWNED);
 357         running_thread = curthread;
 358
 359         if (running_thread->td_ksegrp != td->td_ksegrp)
 360                 return;
 361
 362         if (td->td_priority >= running_thread->td_priority)
 363                 return;
 364 #ifdef PREEMPTION
 365 #ifndef FULL_PREEMPTION
 366         if (td->td_priority > PRI_MAX_ITHD) {
 367                 running_thread->td_flags |= TDF_NEEDRESCHED;
 368                 return;
 369         }
 370 #endif /* FULL_PREEMPTION */
 371
 372         if (running_thread->td_critnest > 1)
 373                 running_thread->td_owepreempt = 1;
 374          else
 375                  mi_switch(SW_INVOL, NULL);
 376
 377 #else /* PREEMPTION */
 378         running_thread->td_flags |= TDF_NEEDRESCHED;
 379 #endif /* PREEMPTION */
 380         return;
 381 }
 382
 383 #else /* SMP */
 384 {
 385         struct thread *running_thread;
 386         int worst_pri;
 387         struct ksegrp *kg;
 388         cpumask_t cpumask,dontuse;
 389         struct pcpu *pc;
 390         struct pcpu *best_pcpu;
 391         struct thread *cputhread;
 392
 393         mtx_assert(&sched_lock, MA_OWNED);
 394
 395         running_thread = curthread;
 396
 397 #if !defined(KSEG_PEEMPT_BEST_CPU)
 398         if (running_thread->td_ksegrp != td->td_ksegrp) {
 399 #endif
 400                 kg = td->td_ksegrp;
 401
 402                 /* if someone is ahead of this thread, wait our turn */
 403                 if (td != TAILQ_FIRST(&kg->kg_runq))
 404                         return;
 405
 406                 worst_pri = td->td_priority;
 407                 best_pcpu = NULL;
 408                 dontuse   = stopped_cpus | idle_cpus_mask;
 409
 410                 /*
 411                  * Find a cpu with the worst priority that runs at thread from
 412                  * the same  ksegrp - if multiple exist give first the last run
 413                  * cpu and then the current cpu priority
 414                  */
 415
 416                 SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
 417                         cpumask   = pc->pc_cpumask;
 418                         cputhread = pc->pc_curthread;
 419
 420                         if ((cpumask & dontuse)  ||
 421                             cputhread->td_ksegrp != kg)
 422                                 continue;
 423
 424                         if (cputhread->td_priority > worst_pri) {
 425                                 worst_pri = cputhread->td_priority;
 426                                 best_pcpu = pc;
 427                                 continue;
 428                         }
 429
 430                         if (cputhread->td_priority == worst_pri &&
 431                             best_pcpu != NULL &&
 432                             (td->td_lastcpu == pc->pc_cpuid ||
 433                                 (PCPU_GET(cpumask) == cpumask &&
 434                                     td->td_lastcpu != best_pcpu->pc_cpuid)))
 435                             best_pcpu = pc;
 436                 }
 437
 438                 /* Check if we need to preempt someone */
 439                 if (best_pcpu == NULL)
 440                         return;
 441
 442 #if defined(IPI_PREEMPTION) && defined(PREEMPTION)
 443 #if !defined(FULL_PREEMPTION)
 444                 if (td->td_priority <= PRI_MAX_ITHD)
 445 #endif /* ! FULL_PREEMPTION */
 446                         {
 447                                 ipi_selected(best_pcpu->pc_cpumask, IPI_PREEMPT);
 448                                 return;
 449                         }
 450 #endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */
 451
 452                 if (PCPU_GET(cpuid) != best_pcpu->pc_cpuid) {
 453                         best_pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
 454                         ipi_selected(best_pcpu->pc_cpumask, IPI_AST);
 455                         return;
 456                 }
 457 #if !defined(KSEG_PEEMPT_BEST_CPU)
 458         }
 459 #endif
 460
 461         if (td->td_priority >= running_thread->td_priority)
 462                 return;
 463 #ifdef PREEMPTION
 464
 465 #if !defined(FULL_PREEMPTION)
 466         if (td->td_priority > PRI_MAX_ITHD) {
 467                 running_thread->td_flags |= TDF_NEEDRESCHED;
 468         }
 469 #endif /* ! FULL_PREEMPTION */
 470
 471         if (running_thread->td_critnest > 1)
 472                 running_thread->td_owepreempt = 1;
 473          else
 474                  mi_switch(SW_INVOL, NULL);
 475
 476 #else /* PREEMPTION */
 477         running_thread->td_flags |= TDF_NEEDRESCHED;
 478 #endif /* PREEMPTION */
 479         return;
 480 }
 481 #endif /* !SMP */
 482
 483
 484 int limitcount;
 485 void
 486 setrunqueue(struct thread *td, int flags)
 487 {
 488         struct ksegrp *kg;
 489         struct thread *td2;
 490         struct thread *tda;
 491
 492         CTR3(KTR_RUNQ, "setrunqueue: td:%p kg:%p pid:%d",
 493             td, td->td_ksegrp, td->td_proc->p_pid);
 494         CTR5(KTR_SCHED, "setrunqueue: %p(%s) prio %d by %p(%s)",
 495             td, td->td_proc->p_comm, td->td_priority, curthread,
 496             curthread->td_proc->p_comm);
 497         mtx_assert(&sched_lock, MA_OWNED);
 498         KASSERT((td->td_inhibitors == 0),
 499                         ("setrunqueue: trying to run inhibitted thread"));
 500         KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
 501             ("setrunqueue: bad thread state"));
 502         TD_SET_RUNQ(td);
 503         kg = td->td_ksegrp;
 504         if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
 505                 /*
 506                  * Common path optimisation: Only one of everything
 507                  * and the KSE is always already attached.
 508                  * Totally ignore the ksegrp run queue.
 509                  */
 510                 if (kg->kg_avail_opennings != 1) {
 511                         if (limitcount < 1) {
 512                                 limitcount++;
 513                                 printf("pid %d: corrected slot count (%d->1)\n",
 514                                     td->td_proc->p_pid, kg->kg_avail_opennings);
 515
 516                         }
 517                         kg->kg_avail_opennings = 1;
 518                 }
 519                 sched_add(td, flags);
 520                 return;
 521         }
 522
 523         /*
 524          * If the concurrency has reduced, and we would go in the
 525          * assigned section, then keep removing entries from the
 526          * system run queue, until we are not in that section
 527          * or there is room for us to be put in that section.
 528          * What we MUST avoid is the case where there are threads of less
 529          * priority than the new one scheduled, but it can not
 530          * be scheduled itself. That would lead to a non contiguous set
 531          * of scheduled threads, and everything would break.
 532          */
 533         tda = kg->kg_last_assigned;
 534         while ((kg->kg_avail_opennings <= 0) &&
 535             (tda && (tda->td_priority > td->td_priority))) {
 536                 /*
 537                  * None free, but there is one we can commandeer.
 538                  */
 539                 CTR2(KTR_RUNQ,
 540                     "setrunqueue: kg:%p: take slot from td: %p", kg, tda);
 541                 sched_rem(tda);
 542                 tda = kg->kg_last_assigned =
 543                     TAILQ_PREV(tda, threadqueue, td_runq);
 544         }
 545
 546         /*
 547          * Add the thread to the ksegrp's run queue at
 548          * the appropriate place.
 549          */
 550         TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
 551                 if (td2->td_priority > td->td_priority) {
 552                         TAILQ_INSERT_BEFORE(td2, td, td_runq);
 553                         break;
 554                 }
 555         }
 556         if (td2 == NULL) {
 557                 /* We ran off the end of the TAILQ or it was empty. */
 558                 TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
 559         }
 560
 561         /*
 562          * If we have a slot to use, then put the thread on the system
 563          * run queue and if needed, readjust the last_assigned pointer.
 564          * it may be that we need to schedule something anyhow
 565          * even if the availabel slots are -ve so that
 566          * all the items < last_assigned are scheduled.
 567          */
 568         if (kg->kg_avail_opennings > 0) {
 569                 if (tda == NULL) {
 570                         /*
 571                          * No pre-existing last assigned so whoever is first
 572                          * gets the slot.. (maybe us)
 573                          */
 574                         td2 = TAILQ_FIRST(&kg->kg_runq);
 575                         kg->kg_last_assigned = td2;
 576                 } else if (tda->td_priority > td->td_priority) {
 577                         td2 = td;
 578                 } else {
 579                         /*
 580                          * We are past last_assigned, so
 581                          * give the next slot to whatever is next,
 582                          * which may or may not be us.
 583                          */
 584                         td2 = TAILQ_NEXT(tda, td_runq);
 585                         kg->kg_last_assigned = td2;
 586                 }
 587                 sched_add(td2, flags);
 588         } else {
 589                 CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
 590                         td, td->td_ksegrp, td->td_proc->p_pid);
 591                 if ((flags & SRQ_YIELDING) == 0)
 592                         maybe_preempt_in_ksegrp(td);
 593         }
 594 }
 595
 596 /*
 597  * Kernel thread preemption implementation.  Critical sections mark
 598  * regions of code in which preemptions are not allowed.
 599  */
 600 void
 601 critical_enter(void)
 602 {
 603         struct thread *td;
 604
 605         td = curthread;
 606         td->td_critnest++;
 607         CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
 608             (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
 609 }
 610
 611 void
 612 critical_exit(void)
 613 {
 614         struct thread *td;
 615
 616         td = curthread;
 617         KASSERT(td->td_critnest != 0,
 618             ("critical_exit: td_critnest == 0"));
 619 #ifdef PREEMPTION
 620         if (td->td_critnest == 1) {
 621                 td->td_critnest = 0;
 622                 mtx_assert(&sched_lock, MA_NOTOWNED);
 623                 if (td->td_owepreempt) {
 624                         td->td_critnest = 1;
 625                         mtx_lock_spin(&sched_lock);
 626                         td->td_critnest--;
 627                         mi_switch(SW_INVOL, NULL);
 628                         mtx_unlock_spin(&sched_lock);
 629                 }
 630         } else
 631 #endif
 632                 td->td_critnest--;
 633
 634         CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
 635             (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
 636 }
 637
 638 /*
 639  * This function is called when a thread is about to be put on run queue
 640  * because it has been made runnable or its priority has been adjusted.  It
 641  * determines if the new thread should be immediately preempted to.  If so,
 642  * it switches to it and eventually returns true.  If not, it returns false
 643  * so that the caller may place the thread on an appropriate run queue.
 644  */
 645 int
 646 maybe_preempt(struct thread *td)
 647 {
 648 #ifdef PREEMPTION
 649         struct thread *ctd;
 650         int cpri, pri;
 651 #endif
 652
 653         mtx_assert(&sched_lock, MA_OWNED);
 654 #ifdef PREEMPTION
 655         /*
 656          * The new thread should not preempt the current thread if any of the
 657          * following conditions are true:
 658          *
 659          *  - The kernel is in the throes of crashing (panicstr).
 660          *  - The current thread has a higher (numerically lower) or
 661          *    equivalent priority.  Note that this prevents curthread from
 662          *    trying to preempt to itself.
 663          *  - It is too early in the boot for context switches (cold is set).
 664          *  - The current thread has an inhibitor set or is in the process of
 665          *    exiting.  In this case, the current thread is about to switch
 666          *    out anyways, so there's no point in preempting.  If we did,
 667          *    the current thread would not be properly resumed as well, so
 668          *    just avoid that whole landmine.
 669          *  - If the new thread's priority is not a realtime priority and
 670          *    the current thread's priority is not an idle priority and
 671          *    FULL_PREEMPTION is disabled.
 672          *
 673          * If all of these conditions are false, but the current thread is in
 674          * a nested critical section, then we have to defer the preemption
 675          * until we exit the critical section.  Otherwise, switch immediately
 676          * to the new thread.
 677          */
 678         ctd = curthread;
 679         KASSERT ((ctd->td_kse != NULL && ctd->td_kse->ke_thread == ctd),
 680           ("thread has no (or wrong) sched-private part."));
 681         KASSERT((td->td_inhibitors == 0),
 682                         ("maybe_preempt: trying to run inhibitted thread"));
 683         pri = td->td_priority;
 684         cpri = ctd->td_priority;
 685         if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
 686             TD_IS_INHIBITED(ctd) || td->td_kse->ke_state != KES_THREAD)
 687                 return (0);
 688 #ifndef FULL_PREEMPTION
 689         if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
 690                 return (0);
 691 #endif
 692
 693         if (ctd->td_critnest > 1) {
 694                 CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
 695                     ctd->td_critnest);
 696                 ctd->td_owepreempt = 1;
 697                 return (0);
 698         }
 699
 700         /*
 701          * Thread is runnable but not yet put on system run queue.
 702          */
 703         MPASS(TD_ON_RUNQ(td));
 704         MPASS(td->td_sched->ke_state != KES_ONRUNQ);
 705         if (td->td_proc->p_flag & P_HADTHREADS) {
 706                 /*
 707                  * If this is a threaded process we actually ARE on the
 708                  * ksegrp run queue so take it off that first.
 709                  * Also undo any damage done to the last_assigned pointer.
 710                  * XXX Fix setrunqueue so this isn't needed
 711                  */
 712                 struct ksegrp *kg;
 713
 714                 kg = td->td_ksegrp;
 715                 if (kg->kg_last_assigned == td)
 716                         kg->kg_last_assigned =
 717                             TAILQ_PREV(td, threadqueue, td_runq);
 718                 TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
 719         }
 720
 721         TD_SET_RUNNING(td);
 722         CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
 723             td->td_proc->p_pid, td->td_proc->p_comm);
 724         mi_switch(SW_INVOL|SW_PREEMPT, td);
 725         return (1);
 726 #else
 727         return (0);
 728 #endif
 729 }
 730
 731 #if 0
 732 #ifndef PREEMPTION
 733 /* XXX: There should be a non-static version of this. */
 734 static void
 735 printf_caddr_t(void *data)
 736 {
 737         printf("%s", (char *)data);
 738 }
 739 static char preempt_warning[] =
 740     "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
 741 SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
 742     preempt_warning)
 743 #endif
 744 #endif
 745
 746 /************************************************************************
 747  * SYSTEM RUN QUEUE manipulations and tests                             *
 748  ************************************************************************/
 749 /*
 750  * Initialize a run structure.
 751  */
 752 void
 753 runq_init(struct runq *rq)
 754 {
 755         int i;
 756
 757         bzero(rq, sizeof *rq);
 758         for (i = 0; i < RQ_NQS; i++)
 759                 TAILQ_INIT(&rq->rq_queues[i]);
 760 }
 761
 762 /*
 763  * Clear the status bit of the queue corresponding to priority level pri,
 764  * indicating that it is empty.
 765  */
 766 static __inline void
 767 runq_clrbit(struct runq *rq, int pri)
 768 {
 769         struct rqbits *rqb;
 770
 771         rqb = &rq->rq_status;
 772         CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
 773             rqb->rqb_bits[RQB_WORD(pri)],
 774             rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
 775             RQB_BIT(pri), RQB_WORD(pri));
 776         rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
 777 }
 778
 779 /*
 780  * Find the index of the first non-empty run queue.  This is done by
 781  * scanning the status bits, a set bit indicates a non-empty queue.
 782  */
 783 static __inline int
 784 runq_findbit(struct runq *rq)
 785 {
 786         struct rqbits *rqb;
 787         int pri;
 788         int i;
 789
 790         rqb = &rq->rq_status;
 791         for (i = 0; i < RQB_LEN; i++)
 792                 if (rqb->rqb_bits[i]) {
 793                         pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
 794                         CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
 795                             rqb->rqb_bits[i], i, pri);
 796                         return (pri);
 797                 }
 798
 799         return (-1);
 800 }
 801
 802 /*
 803  * Set the status bit of the queue corresponding to priority level pri,
 804  * indicating that it is non-empty.
 805  */
 806 static __inline void
 807 runq_setbit(struct runq *rq, int pri)
 808 {
 809         struct rqbits *rqb;
 810
 811         rqb = &rq->rq_status;
 812         CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
 813             rqb->rqb_bits[RQB_WORD(pri)],
 814             rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
 815             RQB_BIT(pri), RQB_WORD(pri));
 816         rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
 817 }
 818
 819 /*
 820  * Add the KSE to the queue specified by its priority, and set the
 821  * corresponding status bit.
 822  */
 823 void
 824 runq_add(struct runq *rq, struct kse *ke, int flags)
 825 {
 826         struct rqhead *rqh;
 827         int pri;
 828
 829         pri = ke->ke_thread->td_priority / RQ_PPQ;
 830         ke->ke_rqindex = pri;
 831         runq_setbit(rq, pri);
 832         rqh = &rq->rq_queues[pri];
 833         CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p",
 834             ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
 835         if (flags & SRQ_PREEMPTED) {
 836                 TAILQ_INSERT_HEAD(rqh, ke, ke_procq);
 837         } else {
 838                 TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
 839         }
 840 }
 841
 842 /*
 843  * Return true if there are runnable processes of any priority on the run
 844  * queue, false otherwise.  Has no side effects, does not modify the run
 845  * queue structure.
 846  */
 847 int
 848 runq_check(struct runq *rq)
 849 {
 850         struct rqbits *rqb;
 851         int i;
 852
 853         rqb = &rq->rq_status;
 854         for (i = 0; i < RQB_LEN; i++)
 855                 if (rqb->rqb_bits[i]) {
 856                         CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
 857                             rqb->rqb_bits[i], i);
 858                         return (1);
 859                 }
 860         CTR0(KTR_RUNQ, "runq_check: empty");
 861
 862         return (0);
 863 }
 864
 865 #if defined(SMP) && defined(SCHED_4BSD)
 866 int runq_fuzz = 1;
 867 SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
 868 #endif
 869
 870 /*
 871  * Find the highest priority process on the run queue.
 872  */
 873 struct kse *
 874 runq_choose(struct runq *rq)
 875 {
 876         struct rqhead *rqh;
 877         struct kse *ke;
 878         int pri;
 879
 880         mtx_assert(&sched_lock, MA_OWNED);
 881         while ((pri = runq_findbit(rq)) != -1) {
 882                 rqh = &rq->rq_queues[pri];
 883 #if defined(SMP) && defined(SCHED_4BSD)
 884                 /* fuzz == 1 is normal.. 0 or less are ignored */
 885                 if (runq_fuzz > 1) {
 886                         /*
 887                          * In the first couple of entries, check if
 888                          * there is one for our CPU as a preference.
 889                          */
 890                         int count = runq_fuzz;
 891                         int cpu = PCPU_GET(cpuid);
 892                         struct kse *ke2;
 893                         ke2 = ke = TAILQ_FIRST(rqh);
 894
 895                         while (count-- && ke2) {
 896                                 if (ke->ke_thread->td_lastcpu == cpu) {
 897                                         ke = ke2;
 898                                         break;
 899                                 }
 900                                 ke2 = TAILQ_NEXT(ke2, ke_procq);
 901                         }
 902                 } else
 903 #endif
 904                         ke = TAILQ_FIRST(rqh);
 905                 KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
 906                 CTR3(KTR_RUNQ,
 907                     "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
 908                 return (ke);
 909         }
 910         CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);
 911
 912         return (NULL);
 913 }
 914
 915 /*
 916  * Remove the KSE from the queue specified by its priority, and clear the
 917  * corresponding status bit if the queue becomes empty.
 918  * Caller must set ke->ke_state afterwards.
 919  */
 920 void
 921 runq_remove(struct runq *rq, struct kse *ke)
 922 {
 923         struct rqhead *rqh;
 924         int pri;
 925
 926         KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
 927                 ("runq_remove: process swapped out"));
 928         pri = ke->ke_rqindex;
 929         rqh = &rq->rq_queues[pri];
 930         CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p",
 931             ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
 932         KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
 933         TAILQ_REMOVE(rqh, ke, ke_procq);
 934         if (TAILQ_EMPTY(rqh)) {
 935                 CTR0(KTR_RUNQ, "runq_remove: empty");
 936                 runq_clrbit(rq, pri);
 937         }
 938 }
 939
 940 /****** functions that are temporarily here ***********/
 941 #include <vm/uma.h>
 942 extern struct mtx kse_zombie_lock;
 943
 944 /*
 945  *  Allocate scheduler specific per-process resources.
 946  * The thread and ksegrp have already been linked in.
 947  * In this case just set the default concurrency value.
 948  *
 949  * Called from:
 950  *  proc_init() (UMA init method)
 951  */
 952 void
 953 sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td)
 954 {
 955
 956         /* This can go in sched_fork */
 957         sched_init_concurrency(kg);
 958 }
 959
 960 /*
 961  * thread is being either created or recycled.
 962  * Fix up the per-scheduler resources associated with it.
 963  * Called from:
 964  *  sched_fork_thread()
 965  *  thread_dtor()  (*may go away)
 966  *  thread_init()  (*may go away)
 967  */
 968 void
 969 sched_newthread(struct thread *td)
 970 {
 971         struct td_sched *ke;
 972
 973         ke = (struct td_sched *) (td + 1);
 974         bzero(ke, sizeof(*ke));
 975         td->td_sched     = ke;
 976         ke->ke_thread   = td;
 977         ke->ke_state    = KES_THREAD;
 978 }
 979
 980 /*
 981  * Set up an initial concurrency of 1
 982  * and set the given thread (if given) to be using that
 983  * concurrency slot.
 984  * May be used "offline"..before the ksegrp is attached to the world
 985  * and thus wouldn't need schedlock in that case.
 986  * Called from:
 987  *  thr_create()
 988  *  proc_init() (UMA) via sched_newproc()
 989  */
 990 void
 991 sched_init_concurrency(struct ksegrp *kg)
 992 {
 993
 994         CTR1(KTR_RUNQ,"kg %p init slots and concurrency to 1", kg);
 995         kg->kg_concurrency = 1;
 996         kg->kg_avail_opennings = 1;
 997 }
 998
 999 /*
1000  * Change the concurrency of an existing ksegrp to N
1001  * Called from:
1002  *  kse_create()
1003  *  kse_exit()
1004  *  thread_exit()
1005  *  thread_single()
1006  */
1007 void
1008 sched_set_concurrency(struct ksegrp *kg, int concurrency)
1009 {
1010
1011         CTR4(KTR_RUNQ,"kg %p set concurrency to %d, slots %d -> %d",
1012             kg,
1013             concurrency,
1014             kg->kg_avail_opennings,
1015             kg->kg_avail_opennings + (concurrency - kg->kg_concurrency));
1016         kg->kg_avail_opennings += (concurrency - kg->kg_concurrency);
1017         kg->kg_concurrency = concurrency;
1018 }
1019
1020 /*
1021  * Called from thread_exit() for all exiting thread
1022  *
1023  * Not to be confused with sched_exit_thread()
1024  * that is only called from thread_exit() for threads exiting
1025  * without the rest of the process exiting because it is also called from
1026  * sched_exit() and we wouldn't want to call it twice.
1027  * XXX This can probably be fixed.
1028  */
1029 void
1030 sched_thread_exit(struct thread *td)
1031 {
1032
1033         SLOT_RELEASE(td->td_ksegrp);
1034         slot_fill(td->td_ksegrp);
1035 }
1036
1037 #endif /* KERN_SWITCH_INCLUDE */