sys/kern/sched_ule.c

   1 /*-
   2  * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice unmodified, this list of conditions, and the following
  10  *    disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25  */
  26
  27 #include <sys/cdefs.h>
  28 __FBSDID("$FreeBSD$");
  29
  30 #include <opt_sched.h>
  31
  32 #define kse td_sched
  33
  34 #include <sys/param.h>
  35 #include <sys/systm.h>
  36 #include <sys/kdb.h>
  37 #include <sys/kernel.h>
  38 #include <sys/ktr.h>
  39 #include <sys/lock.h>
  40 #include <sys/mutex.h>
  41 #include <sys/proc.h>
  42 #include <sys/resource.h>
  43 #include <sys/resourcevar.h>
  44 #include <sys/sched.h>
  45 #include <sys/smp.h>
  46 #include <sys/sx.h>
  47 #include <sys/sysctl.h>
  48 #include <sys/sysproto.h>
  49 #include <sys/turnstile.h>
  50 #include <sys/vmmeter.h>
  51 #ifdef KTRACE
  52 #include <sys/uio.h>
  53 #include <sys/ktrace.h>
  54 #endif
  55
  56 #ifdef HWPMC_HOOKS
  57 #include <sys/pmckern.h>
  58 #endif
  59
  60 #include <machine/cpu.h>
  61 #include <machine/smp.h>
  62
  63 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
  64 /* XXX This is bogus compatability crap for ps */
  65 static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
  66 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
  67
  68 static void sched_setup(void *dummy);
  69 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
  70
  71 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
  72
  73 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
  74     "Scheduler name");
  75
  76 static int slice_min = 1;
  77 SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
  78
  79 static int slice_max = 10;
  80 SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
  81
  82 int realstathz;
  83 int tickincr = 1;
  84
  85 /*
  86  * The schedulable entity that can be given a context to run.
  87  * A process may have several of these. Probably one per processor
  88  * but posibly a few more. In this universe they are grouped
  89  * with a KSEG that contains the priority and niceness
  90  * for the group.
  91  */
  92 struct kse {
  93         TAILQ_ENTRY(kse) ke_procq;      /* (j/z) Run queue. */
  94         int             ke_flags;       /* (j) KEF_* flags. */
  95         struct thread   *ke_thread;     /* (*) Active associated thread. */
  96         fixpt_t         ke_pctcpu;      /* (j) %cpu during p_swtime. */
  97         char            ke_rqindex;     /* (j) Run queue index. */
  98         enum {
  99                 KES_THREAD = 0x0,       /* slaved to thread state */
 100                 KES_ONRUNQ
 101         } ke_state;                     /* (j) thread sched specific status. */
 102         int             ke_slptime;
 103         int             ke_slice;
 104         struct runq     *ke_runq;
 105         u_char          ke_cpu;         /* CPU that we have affinity for. */
 106         /* The following variables are only used for pctcpu calculation */
 107         int             ke_ltick;       /* Last tick that we were running on */
 108         int             ke_ftick;       /* First tick that we were running on */
 109         int             ke_ticks;       /* Tick count */
 110
 111 };
 112
 113
 114 #define td_kse td_sched
 115 #define td_slptime              td_kse->ke_slptime
 116 #define ke_proc                 ke_thread->td_proc
 117 #define ke_ksegrp               ke_thread->td_ksegrp
 118
 119 /* flags kept in ke_flags */
 120 #define KEF_SCHED0      0x00001 /* For scheduler-specific use. */
 121 #define KEF_SCHED1      0x00002 /* For scheduler-specific use. */
 122 #define KEF_SCHED2      0x00004 /* For scheduler-specific use. */
 123 #define KEF_SCHED3      0x00008 /* For scheduler-specific use. */
 124 #define KEF_SCHED4      0x00010
 125 #define KEF_SCHED5      0x00020
 126 #define KEF_DIDRUN      0x02000 /* Thread actually ran. */
 127 #define KEF_EXIT        0x04000 /* Thread is being killed. */
 128
 129 /*
 130  * These datastructures are allocated within their parent datastructure but
 131  * are scheduler specific.
 132  */
 133
 134 #define ke_assign       ke_procq.tqe_next
 135
 136 #define KEF_ASSIGNED    0x0001          /* Thread is being migrated. */
 137 #define KEF_BOUND       0x0002          /* Thread can not migrate. */
 138 #define KEF_XFERABLE    0x0004          /* Thread was added as transferable. */
 139 #define KEF_HOLD        0x0008          /* Thread is temporarily bound. */
 140 #define KEF_REMOVED     0x0010          /* Thread was removed while ASSIGNED */
 141 #define KEF_INTERNAL    0x0020
 142
 143 struct kg_sched {
 144         struct thread   *skg_last_assigned; /* (j) Last thread assigned to */
 145                                            /* the system scheduler */
 146         int     skg_slptime;            /* Number of ticks we vol. slept */
 147         int     skg_runtime;            /* Number of ticks we were running */
 148         int     skg_avail_opennings;    /* (j) Num unfilled slots in group.*/
 149         int     skg_concurrency;        /* (j) Num threads requested in group.*/
 150 };
 151 #define kg_last_assigned        kg_sched->skg_last_assigned
 152 #define kg_avail_opennings      kg_sched->skg_avail_opennings
 153 #define kg_concurrency          kg_sched->skg_concurrency
 154 #define kg_runtime              kg_sched->skg_runtime
 155 #define kg_slptime              kg_sched->skg_slptime
 156
 157 #define SLOT_RELEASE(kg)                                                \
 158 do {                                                                    \
 159         kg->kg_avail_opennings++;                                       \
 160         CTR3(KTR_RUNQ, "kg %p(%d) Slot released (->%d)",                \
 161         kg,                                                             \
 162         kg->kg_concurrency,                                             \
 163          kg->kg_avail_opennings);                                       \
 164         /*KASSERT((kg->kg_avail_opennings <= kg->kg_concurrency),       \
 165             ("slots out of whack")); */                                 \
 166 } while (0)
 167
 168 #define SLOT_USE(kg)                                                    \
 169 do {                                                                    \
 170         kg->kg_avail_opennings--;                                       \
 171         CTR3(KTR_RUNQ, "kg %p(%d) Slot used (->%d)",                    \
 172         kg,                                                             \
 173         kg->kg_concurrency,                                             \
 174          kg->kg_avail_opennings);                                       \
 175         /*KASSERT((kg->kg_avail_opennings >= 0),                        \
 176             ("slots out of whack"));*/                                  \
 177 } while (0)
 178
 179 static struct kse kse0;
 180 static struct kg_sched kg_sched0;
 181
 182 /*
 183  * The priority is primarily determined by the interactivity score.  Thus, we
 184  * give lower(better) priorities to kse groups that use less CPU.  The nice
 185  * value is then directly added to this to allow nice to have some effect
 186  * on latency.
 187  *
 188  * PRI_RANGE:   Total priority range for timeshare threads.
 189  * PRI_NRESV:   Number of nice values.
 190  * PRI_BASE:    The start of the dynamic range.
 191  */
 192 #define SCHED_PRI_RANGE         (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
 193 #define SCHED_PRI_NRESV         ((PRIO_MAX - PRIO_MIN) + 1)
 194 #define SCHED_PRI_NHALF         (SCHED_PRI_NRESV / 2)
 195 #define SCHED_PRI_BASE          (PRI_MIN_TIMESHARE)
 196 #define SCHED_PRI_INTERACT(score)                                       \
 197     ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
 198
 199 /*
 200  * These determine the interactivity of a process.
 201  *
 202  * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate
 203  *              before throttling back.
 204  * SLP_RUN_FORK:        Maximum slp+run time to inherit at fork time.
 205  * INTERACT_MAX:        Maximum interactivity value.  Smaller is better.
 206  * INTERACT_THRESH:     Threshhold for placement on the current runq.
 207  */
 208 #define SCHED_SLP_RUN_MAX       ((hz * 5) << 10)
 209 #define SCHED_SLP_RUN_FORK      ((hz / 2) << 10)
 210 #define SCHED_INTERACT_MAX      (100)
 211 #define SCHED_INTERACT_HALF     (SCHED_INTERACT_MAX / 2)
 212 #define SCHED_INTERACT_THRESH   (30)
 213
 214 /*
 215  * These parameters and macros determine the size of the time slice that is
 216  * granted to each thread.
 217  *
 218  * SLICE_MIN:   Minimum time slice granted, in units of ticks.
 219  * SLICE_MAX:   Maximum time slice granted.
 220  * SLICE_RANGE: Range of available time slices scaled by hz.
 221  * SLICE_SCALE: The number slices granted per val in the range of [0, max].
 222  * SLICE_NICE:  Determine the amount of slice granted to a scaled nice.
 223  * SLICE_NTHRESH:       The nice cutoff point for slice assignment.
 224  */
 225 #define SCHED_SLICE_MIN                 (slice_min)
 226 #define SCHED_SLICE_MAX                 (slice_max)
 227 #define SCHED_SLICE_INTERACTIVE         (slice_max)
 228 #define SCHED_SLICE_NTHRESH     (SCHED_PRI_NHALF - 1)
 229 #define SCHED_SLICE_RANGE               (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
 230 #define SCHED_SLICE_SCALE(val, max)     (((val) * SCHED_SLICE_RANGE) / (max))
 231 #define SCHED_SLICE_NICE(nice)                                          \
 232     (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))
 233
 234 /*
 235  * This macro determines whether or not the thread belongs on the current or
 236  * next run queue.
 237  */
 238 #define SCHED_INTERACTIVE(kg)                                           \
 239     (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
 240 #define SCHED_CURR(kg, ke)                                              \
 241     ((ke->ke_thread->td_flags & TDF_BORROWING) || SCHED_INTERACTIVE(kg))
 242
 243 /*
 244  * Cpu percentage computation macros and defines.
 245  *
 246  * SCHED_CPU_TIME:      Number of seconds to average the cpu usage across.
 247  * SCHED_CPU_TICKS:     Number of hz ticks to average the cpu usage across.
 248  */
 249
 250 #define SCHED_CPU_TIME  10
 251 #define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME)
 252
 253 /*
 254  * kseq - per processor runqs and statistics.
 255  */
 256 struct kseq {
 257         struct runq     ksq_idle;               /* Queue of IDLE threads. */
 258         struct runq     ksq_timeshare[2];       /* Run queues for !IDLE. */
 259         struct runq     *ksq_next;              /* Next timeshare queue. */
 260         struct runq     *ksq_curr;              /* Current queue. */
 261         int             ksq_load_timeshare;     /* Load for timeshare. */
 262         int             ksq_load;               /* Aggregate load. */
 263         short           ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
 264         short           ksq_nicemin;            /* Least nice. */
 265 #ifdef SMP
 266         int                     ksq_transferable;
 267         LIST_ENTRY(kseq)        ksq_siblings;   /* Next in kseq group. */
 268         struct kseq_group       *ksq_group;     /* Our processor group. */
 269         volatile struct kse     *ksq_assigned;  /* assigned by another CPU. */
 270 #else
 271         int             ksq_sysload;            /* For loadavg, !ITHD load. */
 272 #endif
 273 };
 274
 275 #ifdef SMP
 276 /*
 277  * kseq groups are groups of processors which can cheaply share threads.  When
 278  * one processor in the group goes idle it will check the runqs of the other
 279  * processors in its group prior to halting and waiting for an interrupt.
 280  * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA.
 281  * In a numa environment we'd want an idle bitmap per group and a two tiered
 282  * load balancer.
 283  */
 284 struct kseq_group {
 285         int     ksg_cpus;               /* Count of CPUs in this kseq group. */
 286         cpumask_t ksg_cpumask;          /* Mask of cpus in this group. */
 287         cpumask_t ksg_idlemask;         /* Idle cpus in this group. */
 288         cpumask_t ksg_mask;             /* Bit mask for first cpu. */
 289         int     ksg_load;               /* Total load of this group. */
 290         int     ksg_transferable;       /* Transferable load of this group. */
 291         LIST_HEAD(, kseq)       ksg_members; /* Linked list of all members. */
 292 };
 293 #endif
 294
 295 /*
 296  * One kse queue per processor.
 297  */
 298 #ifdef SMP
 299 static cpumask_t kseq_idle;
 300 static int ksg_maxid;
 301 static struct kseq      kseq_cpu[MAXCPU];
 302 static struct kseq_group kseq_groups[MAXCPU];
 303 static int bal_tick;
 304 static int gbal_tick;
 305 static int balance_groups;
 306
 307 #define KSEQ_SELF()     (&kseq_cpu[PCPU_GET(cpuid)])
 308 #define KSEQ_CPU(x)     (&kseq_cpu[(x)])
 309 #define KSEQ_ID(x)      ((x) - kseq_cpu)
 310 #define KSEQ_GROUP(x)   (&kseq_groups[(x)])
 311 #else   /* !SMP */
 312 static struct kseq      kseq_cpu;
 313
 314 #define KSEQ_SELF()     (&kseq_cpu)
 315 #define KSEQ_CPU(x)     (&kseq_cpu)
 316 #endif
 317
 318 static void     slot_fill(struct ksegrp *kg);
 319 static struct kse *sched_choose(void);          /* XXX Should be thread * */
 320 static void sched_slice(struct kse *ke);
 321 static void sched_priority(struct ksegrp *kg);
 322 static void sched_thread_priority(struct thread *td, u_char prio);
 323 static int sched_interact_score(struct ksegrp *kg);
 324 static void sched_interact_update(struct ksegrp *kg);
 325 static void sched_interact_fork(struct ksegrp *kg);
 326 static void sched_pctcpu_update(struct kse *ke);
 327
 328 /* Operations on per processor queues */
 329 static struct kse * kseq_choose(struct kseq *kseq);
 330 static void kseq_setup(struct kseq *kseq);
 331 static void kseq_load_add(struct kseq *kseq, struct kse *ke);
 332 static void kseq_load_rem(struct kseq *kseq, struct kse *ke);
 333 static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke, int);
 334 static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke);
 335 static void kseq_nice_add(struct kseq *kseq, int nice);
 336 static void kseq_nice_rem(struct kseq *kseq, int nice);
 337 void kseq_print(int cpu);
 338 #ifdef SMP
 339 static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class);
 340 static struct kse *runq_steal(struct runq *rq);
 341 static void sched_balance(void);
 342 static void sched_balance_groups(void);
 343 static void sched_balance_group(struct kseq_group *ksg);
 344 static void sched_balance_pair(struct kseq *high, struct kseq *low);
 345 static void kseq_move(struct kseq *from, int cpu);
 346 static int kseq_idled(struct kseq *kseq);
 347 static void kseq_notify(struct kse *ke, int cpu);
 348 static void kseq_assign(struct kseq *);
 349 static struct kse *kseq_steal(struct kseq *kseq, int stealidle);
 350 #define KSE_CAN_MIGRATE(ke)                                             \
 351     ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
 352 #endif
 353
 354 void
 355 kseq_print(int cpu)
 356 {
 357         struct kseq *kseq;
 358         int i;
 359
 360         kseq = KSEQ_CPU(cpu);
 361
 362         printf("kseq:\n");
 363         printf("\tload:           %d\n", kseq->ksq_load);
 364         printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare);
 365 #ifdef SMP
 366         printf("\tload transferable: %d\n", kseq->ksq_transferable);
 367 #endif
 368         printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
 369         printf("\tnice counts:\n");
 370         for (i = 0; i < SCHED_PRI_NRESV; i++)
 371                 if (kseq->ksq_nice[i])
 372                         printf("\t\t%d = %d\n",
 373                             i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
 374 }
 375
 376 static __inline void
 377 kseq_runq_add(struct kseq *kseq, struct kse *ke, int flags)
 378 {
 379 #ifdef SMP
 380         if (KSE_CAN_MIGRATE(ke)) {
 381                 kseq->ksq_transferable++;
 382                 kseq->ksq_group->ksg_transferable++;
 383                 ke->ke_flags |= KEF_XFERABLE;
 384         }
 385 #endif
 386         runq_add(ke->ke_runq, ke, flags);
 387 }
 388
 389 static __inline void
 390 kseq_runq_rem(struct kseq *kseq, struct kse *ke)
 391 {
 392 #ifdef SMP
 393         if (ke->ke_flags & KEF_XFERABLE) {
 394                 kseq->ksq_transferable--;
 395                 kseq->ksq_group->ksg_transferable--;
 396                 ke->ke_flags &= ~KEF_XFERABLE;
 397         }
 398 #endif
 399         runq_remove(ke->ke_runq, ke);
 400 }
 401
 402 static void
 403 kseq_load_add(struct kseq *kseq, struct kse *ke)
 404 {
 405         int class;
 406         mtx_assert(&sched_lock, MA_OWNED);
 407         class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
 408         if (class == PRI_TIMESHARE)
 409                 kseq->ksq_load_timeshare++;
 410         kseq->ksq_load++;
 411         CTR1(KTR_SCHED, "load: %d", kseq->ksq_load);
 412         if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
 413 #ifdef SMP
 414                 kseq->ksq_group->ksg_load++;
 415 #else
 416                 kseq->ksq_sysload++;
 417 #endif
 418         if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
 419                 kseq_nice_add(kseq, ke->ke_proc->p_nice);
 420 }
 421
 422 static void
 423 kseq_load_rem(struct kseq *kseq, struct kse *ke)
 424 {
 425         int class;
 426         mtx_assert(&sched_lock, MA_OWNED);
 427         class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
 428         if (class == PRI_TIMESHARE)
 429                 kseq->ksq_load_timeshare--;
 430         if (class != PRI_ITHD  && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
 431 #ifdef SMP
 432                 kseq->ksq_group->ksg_load--;
 433 #else
 434                 kseq->ksq_sysload--;
 435 #endif
 436         kseq->ksq_load--;
 437         CTR1(KTR_SCHED, "load: %d", kseq->ksq_load);
 438         ke->ke_runq = NULL;
 439         if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
 440                 kseq_nice_rem(kseq, ke->ke_proc->p_nice);
 441 }
 442
 443 static void
 444 kseq_nice_add(struct kseq *kseq, int nice)
 445 {
 446         mtx_assert(&sched_lock, MA_OWNED);
 447         /* Normalize to zero. */
 448         kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
 449         if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1)
 450                 kseq->ksq_nicemin = nice;
 451 }
 452
 453 static void
 454 kseq_nice_rem(struct kseq *kseq, int nice)
 455 {
 456         int n;
 457
 458         mtx_assert(&sched_lock, MA_OWNED);
 459         /* Normalize to zero. */
 460         n = nice + SCHED_PRI_NHALF;
 461         kseq->ksq_nice[n]--;
 462         KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));
 463
 464         /*
 465          * If this wasn't the smallest nice value or there are more in
 466          * this bucket we can just return.  Otherwise we have to recalculate
 467          * the smallest nice.
 468          */
 469         if (nice != kseq->ksq_nicemin ||
 470             kseq->ksq_nice[n] != 0 ||
 471             kseq->ksq_load_timeshare == 0)
 472                 return;
 473
 474         for (; n < SCHED_PRI_NRESV; n++)
 475                 if (kseq->ksq_nice[n]) {
 476                         kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
 477                         return;
 478                 }
 479 }
 480
 481 #ifdef SMP
 482 /*
 483  * sched_balance is a simple CPU load balancing algorithm.  It operates by
 484  * finding the least loaded and most loaded cpu and equalizing their load
 485  * by migrating some processes.
 486  *
 487  * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 488  * installations will only have 2 cpus.  Secondly, load balancing too much at
 489  * once can have an unpleasant effect on the system.  The scheduler rarely has
 490  * enough information to make perfect decisions.  So this algorithm chooses
 491  * algorithm simplicity and more gradual effects on load in larger systems.
 492  *
 493  * It could be improved by considering the priorities and slices assigned to
 494  * each task prior to balancing them.  There are many pathological cases with
 495  * any approach and so the semi random algorithm below may work as well as any.
 496  *
 497  */
 498 static void
 499 sched_balance(void)
 500 {
 501         struct kseq_group *high;
 502         struct kseq_group *low;
 503         struct kseq_group *ksg;
 504         int cnt;
 505         int i;
 506
 507         bal_tick = ticks + (random() % (hz * 2));
 508         if (smp_started == 0)
 509                 return;
 510         low = high = NULL;
 511         i = random() % (ksg_maxid + 1);
 512         for (cnt = 0; cnt <= ksg_maxid; cnt++) {
 513                 ksg = KSEQ_GROUP(i);
 514                 /*
 515                  * Find the CPU with the highest load that has some
 516                  * threads to transfer.
 517                  */
 518                 if ((high == NULL || ksg->ksg_load > high->ksg_load)
 519                     && ksg->ksg_transferable)
 520                         high = ksg;
 521                 if (low == NULL || ksg->ksg_load < low->ksg_load)
 522                         low = ksg;
 523                 if (++i > ksg_maxid)
 524                         i = 0;
 525         }
 526         if (low != NULL && high != NULL && high != low)
 527                 sched_balance_pair(LIST_FIRST(&high->ksg_members),
 528                     LIST_FIRST(&low->ksg_members));
 529 }
 530
 531 static void
 532 sched_balance_groups(void)
 533 {
 534         int i;
 535
 536         gbal_tick = ticks + (random() % (hz * 2));
 537         mtx_assert(&sched_lock, MA_OWNED);
 538         if (smp_started)
 539                 for (i = 0; i <= ksg_maxid; i++)
 540                         sched_balance_group(KSEQ_GROUP(i));
 541 }
 542
 543 static void
 544 sched_balance_group(struct kseq_group *ksg)
 545 {
 546         struct kseq *kseq;
 547         struct kseq *high;
 548         struct kseq *low;
 549         int load;
 550
 551         if (ksg->ksg_transferable == 0)
 552                 return;
 553         low = NULL;
 554         high = NULL;
 555         LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
 556                 load = kseq->ksq_load;
 557                 if (high == NULL || load > high->ksq_load)
 558                         high = kseq;
 559                 if (low == NULL || load < low->ksq_load)
 560                         low = kseq;
 561         }
 562         if (high != NULL && low != NULL && high != low)
 563                 sched_balance_pair(high, low);
 564 }
 565
 566 static void
 567 sched_balance_pair(struct kseq *high, struct kseq *low)
 568 {
 569         int transferable;
 570         int high_load;
 571         int low_load;
 572         int move;
 573         int diff;
 574         int i;
 575
 576         /*
 577          * If we're transfering within a group we have to use this specific
 578          * kseq's transferable count, otherwise we can steal from other members
 579          * of the group.
 580          */
 581         if (high->ksq_group == low->ksq_group) {
 582                 transferable = high->ksq_transferable;
 583                 high_load = high->ksq_load;
 584                 low_load = low->ksq_load;
 585         } else {
 586                 transferable = high->ksq_group->ksg_transferable;
 587                 high_load = high->ksq_group->ksg_load;
 588                 low_load = low->ksq_group->ksg_load;
 589         }
 590         if (transferable == 0)
 591                 return;
 592         /*
 593          * Determine what the imbalance is and then adjust that to how many
 594          * kses we actually have to give up (transferable).
 595          */
 596         diff = high_load - low_load;
 597         move = diff / 2;
 598         if (diff & 0x1)
 599                 move++;
 600         move = min(move, transferable);
 601         for (i = 0; i < move; i++)
 602                 kseq_move(high, KSEQ_ID(low));
 603         return;
 604 }
 605
 606 static void
 607 kseq_move(struct kseq *from, int cpu)
 608 {
 609         struct kseq *kseq;
 610         struct kseq *to;
 611         struct kse *ke;
 612
 613         kseq = from;
 614         to = KSEQ_CPU(cpu);
 615         ke = kseq_steal(kseq, 1);
 616         if (ke == NULL) {
 617                 struct kseq_group *ksg;
 618
 619                 ksg = kseq->ksq_group;
 620                 LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
 621                         if (kseq == from || kseq->ksq_transferable == 0)
 622                                 continue;
 623                         ke = kseq_steal(kseq, 1);
 624                         break;
 625                 }
 626                 if (ke == NULL)
 627                         panic("kseq_move: No KSEs available with a "
 628                             "transferable count of %d\n",
 629                             ksg->ksg_transferable);
 630         }
 631         if (kseq == to)
 632                 return;
 633         ke->ke_state = KES_THREAD;
 634         kseq_runq_rem(kseq, ke);
 635         kseq_load_rem(kseq, ke);
 636         kseq_notify(ke, cpu);
 637 }
 638
 639 static int
 640 kseq_idled(struct kseq *kseq)
 641 {
 642         struct kseq_group *ksg;
 643         struct kseq *steal;
 644         struct kse *ke;
 645
 646         ksg = kseq->ksq_group;
 647         /*
 648          * If we're in a cpu group, try and steal kses from another cpu in
 649          * the group before idling.
 650          */
 651         if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) {
 652                 LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) {
 653                         if (steal == kseq || steal->ksq_transferable == 0)
 654                                 continue;
 655                         ke = kseq_steal(steal, 0);
 656                         if (ke == NULL)
 657                                 continue;
 658                         ke->ke_state = KES_THREAD;
 659                         kseq_runq_rem(steal, ke);
 660                         kseq_load_rem(steal, ke);
 661                         ke->ke_cpu = PCPU_GET(cpuid);
 662                         ke->ke_flags |= KEF_INTERNAL | KEF_HOLD;
 663                         sched_add(ke->ke_thread, SRQ_YIELDING);
 664                         return (0);
 665                 }
 666         }
 667         /*
 668          * We only set the idled bit when all of the cpus in the group are
 669          * idle.  Otherwise we could get into a situation where a KSE bounces
 670          * back and forth between two idle cores on seperate physical CPUs.
 671          */
 672         ksg->ksg_idlemask |= PCPU_GET(cpumask);
 673         if (ksg->ksg_idlemask != ksg->ksg_cpumask)
 674                 return (1);
 675         atomic_set_int(&kseq_idle, ksg->ksg_mask);
 676         return (1);
 677 }
 678
 679 static void
 680 kseq_assign(struct kseq *kseq)
 681 {
 682         struct kse *nke;
 683         struct kse *ke;
 684
 685         do {
 686                 *(volatile struct kse **)&ke = kseq->ksq_assigned;
 687         } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL));
 688         for (; ke != NULL; ke = nke) {
 689                 nke = ke->ke_assign;
 690                 kseq->ksq_group->ksg_load--;
 691                 kseq->ksq_load--;
 692                 ke->ke_flags &= ~KEF_ASSIGNED;
 693                 ke->ke_flags |= KEF_INTERNAL | KEF_HOLD;
 694                 sched_add(ke->ke_thread, SRQ_YIELDING);
 695         }
 696 }
 697
 698 static void
 699 kseq_notify(struct kse *ke, int cpu)
 700 {
 701         struct kseq *kseq;
 702         struct thread *td;
 703         struct pcpu *pcpu;
 704         int class;
 705         int prio;
 706
 707         kseq = KSEQ_CPU(cpu);
 708         /* XXX */
 709         class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
 710         if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
 711             (kseq_idle & kseq->ksq_group->ksg_mask))
 712                 atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
 713         kseq->ksq_group->ksg_load++;
 714         kseq->ksq_load++;
 715         ke->ke_cpu = cpu;
 716         ke->ke_flags |= KEF_ASSIGNED;
 717         prio = ke->ke_thread->td_priority;
 718
 719         /*
 720          * Place a KSE on another cpu's queue and force a resched.
 721          */
 722         do {
 723                 *(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned;
 724         } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke));
 725         /*
 726          * Without sched_lock we could lose a race where we set NEEDRESCHED
 727          * on a thread that is switched out before the IPI is delivered.  This
 728          * would lead us to miss the resched.  This will be a problem once
 729          * sched_lock is pushed down.
 730          */
 731         pcpu = pcpu_find(cpu);
 732         td = pcpu->pc_curthread;
 733         if (ke->ke_thread->td_priority < td->td_priority ||
 734             td == pcpu->pc_idlethread) {
 735                 td->td_flags |= TDF_NEEDRESCHED;
 736                 ipi_selected(1 << cpu, IPI_AST);
 737         }
 738 }
 739
 740 static struct kse *
 741 runq_steal(struct runq *rq)
 742 {
 743         struct rqhead *rqh;
 744         struct rqbits *rqb;
 745         struct kse *ke;
 746         int word;
 747         int bit;
 748
 749         mtx_assert(&sched_lock, MA_OWNED);
 750         rqb = &rq->rq_status;
 751         for (word = 0; word < RQB_LEN; word++) {
 752                 if (rqb->rqb_bits[word] == 0)
 753                         continue;
 754                 for (bit = 0; bit < RQB_BPW; bit++) {
 755                         if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
 756                                 continue;
 757                         rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
 758                         TAILQ_FOREACH(ke, rqh, ke_procq) {
 759                                 if (KSE_CAN_MIGRATE(ke))
 760                                         return (ke);
 761                         }
 762                 }
 763         }
 764         return (NULL);
 765 }
 766
 767 static struct kse *
 768 kseq_steal(struct kseq *kseq, int stealidle)
 769 {
 770         struct kse *ke;
 771
 772         /*
 773          * Steal from next first to try to get a non-interactive task that
 774          * may not have run for a while.
 775          */
 776         if ((ke = runq_steal(kseq->ksq_next)) != NULL)
 777                 return (ke);
 778         if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
 779                 return (ke);
 780         if (stealidle)
 781                 return (runq_steal(&kseq->ksq_idle));
 782         return (NULL);
 783 }
 784
 785 int
 786 kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
 787 {
 788         struct kseq_group *nksg;
 789         struct kseq_group *ksg;
 790         struct kseq *old;
 791         int cpu;
 792         int idx;
 793
 794         if (smp_started == 0)
 795                 return (0);
 796         cpu = 0;
 797         /*
 798          * If our load exceeds a certain threshold we should attempt to
 799          * reassign this thread.  The first candidate is the cpu that
 800          * originally ran the thread.  If it is idle, assign it there,
 801          * otherwise, pick an idle cpu.
 802          *
 803          * The threshold at which we start to reassign kses has a large impact
 804          * on the overall performance of the system.  Tuned too high and
 805          * some CPUs may idle.  Too low and there will be excess migration
 806          * and context switches.
 807          */
 808         old = KSEQ_CPU(ke->ke_cpu);
 809         nksg = old->ksq_group;
 810         ksg = kseq->ksq_group;
 811         if (kseq_idle) {
 812                 if (kseq_idle & nksg->ksg_mask) {
 813                         cpu = ffs(nksg->ksg_idlemask);
 814                         if (cpu) {
 815                                 CTR2(KTR_SCHED,
 816                                     "kseq_transfer: %p found old cpu %X "
 817                                     "in idlemask.", ke, cpu);
 818                                 goto migrate;
 819                         }
 820                 }
 821                 /*
 822                  * Multiple cpus could find this bit simultaneously
 823                  * but the race shouldn't be terrible.
 824                  */
 825                 cpu = ffs(kseq_idle);
 826                 if (cpu) {
 827                         CTR2(KTR_SCHED, "kseq_transfer: %p found %X "
 828                             "in idlemask.", ke, cpu);
 829                         goto migrate;
 830                 }
 831         }
 832         idx = 0;
 833 #if 0
 834         if (old->ksq_load < kseq->ksq_load) {
 835                 cpu = ke->ke_cpu + 1;
 836                 CTR2(KTR_SCHED, "kseq_transfer: %p old cpu %X "
 837                     "load less than ours.", ke, cpu);
 838                 goto migrate;
 839         }
 840         /*
 841          * No new CPU was found, look for one with less load.
 842          */
 843         for (idx = 0; idx <= ksg_maxid; idx++) {
 844                 nksg = KSEQ_GROUP(idx);
 845                 if (nksg->ksg_load /*+ (nksg->ksg_cpus  * 2)*/ < ksg->ksg_load) {
 846                         cpu = ffs(nksg->ksg_cpumask);
 847                         CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X load less "
 848                             "than ours.", ke, cpu);
 849                         goto migrate;
 850                 }
 851         }
 852 #endif
 853         /*
 854          * If another cpu in this group has idled, assign a thread over
 855          * to them after checking to see if there are idled groups.
 856          */
 857         if (ksg->ksg_idlemask) {
 858                 cpu = ffs(ksg->ksg_idlemask);
 859                 if (cpu) {
 860                         CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X idle in "
 861                             "group.", ke, cpu);
 862                         goto migrate;
 863                 }
 864         }
 865         return (0);
 866 migrate:
 867         /*
 868          * Now that we've found an idle CPU, migrate the thread.
 869          */
 870         cpu--;
 871         ke->ke_runq = NULL;
 872         kseq_notify(ke, cpu);
 873
 874         return (1);
 875 }
 876
 877 #endif  /* SMP */
 878
 879 /*
 880  * Pick the highest priority task we have and return it.
 881  */
 882
 883 static struct kse *
 884 kseq_choose(struct kseq *kseq)
 885 {
 886         struct runq *swap;
 887         struct kse *ke;
 888         int nice;
 889
 890         mtx_assert(&sched_lock, MA_OWNED);
 891         swap = NULL;
 892
 893         for (;;) {
 894                 ke = runq_choose(kseq->ksq_curr);
 895                 if (ke == NULL) {
 896                         /*
 897                          * We already swapped once and didn't get anywhere.
 898                          */
 899                         if (swap)
 900                                 break;
 901                         swap = kseq->ksq_curr;
 902                         kseq->ksq_curr = kseq->ksq_next;
 903                         kseq->ksq_next = swap;
 904                         continue;
 905                 }
 906                 /*
 907                  * If we encounter a slice of 0 the kse is in a
 908                  * TIMESHARE kse group and its nice was too far out
 909                  * of the range that receives slices.
 910                  */
 911                 nice = ke->ke_proc->p_nice + (0 - kseq->ksq_nicemin);
 912                 if (ke->ke_slice == 0 || (nice > SCHED_SLICE_NTHRESH &&
 913                     ke->ke_proc->p_nice != 0)) {
 914                         runq_remove(ke->ke_runq, ke);
 915                         sched_slice(ke);
 916                         ke->ke_runq = kseq->ksq_next;
 917                         runq_add(ke->ke_runq, ke, 0);
 918                         continue;
 919                 }
 920                 return (ke);
 921         }
 922
 923         return (runq_choose(&kseq->ksq_idle));
 924 }
 925
 926 static void
 927 kseq_setup(struct kseq *kseq)
 928 {
 929         runq_init(&kseq->ksq_timeshare[0]);
 930         runq_init(&kseq->ksq_timeshare[1]);
 931         runq_init(&kseq->ksq_idle);
 932         kseq->ksq_curr = &kseq->ksq_timeshare[0];
 933         kseq->ksq_next = &kseq->ksq_timeshare[1];
 934         kseq->ksq_load = 0;
 935         kseq->ksq_load_timeshare = 0;
 936 }
 937
 938 static void
 939 sched_setup(void *dummy)
 940 {
 941 #ifdef SMP
 942         int i;
 943 #endif
 944
 945         slice_min = (hz/100);   /* 10ms */
 946         slice_max = (hz/7);     /* ~140ms */
 947
 948 #ifdef SMP
 949         balance_groups = 0;
 950         /*
 951          * Initialize the kseqs.
 952          */
 953         for (i = 0; i < MAXCPU; i++) {
 954                 struct kseq *ksq;
 955
 956                 ksq = &kseq_cpu[i];
 957                 ksq->ksq_assigned = NULL;
 958                 kseq_setup(&kseq_cpu[i]);
 959         }
 960         if (smp_topology == NULL) {
 961                 struct kseq_group *ksg;
 962                 struct kseq *ksq;
 963                 int cpus;
 964
 965                 for (cpus = 0, i = 0; i < MAXCPU; i++) {
 966                         if (CPU_ABSENT(i))
 967                                 continue;
 968                         ksq = &kseq_cpu[cpus];
 969                         ksg = &kseq_groups[cpus];
 970                         /*
 971                          * Setup a kseq group with one member.
 972                          */
 973                         ksq->ksq_transferable = 0;
 974                         ksq->ksq_group = ksg;
 975                         ksg->ksg_cpus = 1;
 976                         ksg->ksg_idlemask = 0;
 977                         ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
 978                         ksg->ksg_load = 0;
 979                         ksg->ksg_transferable = 0;
 980                         LIST_INIT(&ksg->ksg_members);
 981                         LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
 982                         cpus++;
 983                 }
 984                 ksg_maxid = cpus - 1;
 985         } else {
 986                 struct kseq_group *ksg;
 987                 struct cpu_group *cg;
 988                 int j;
 989
 990                 for (i = 0; i < smp_topology->ct_count; i++) {
 991                         cg = &smp_topology->ct_group[i];
 992                         ksg = &kseq_groups[i];
 993                         /*
 994                          * Initialize the group.
 995                          */
 996                         ksg->ksg_idlemask = 0;
 997                         ksg->ksg_load = 0;
 998                         ksg->ksg_transferable = 0;
 999                         ksg->ksg_cpus = cg->cg_count;
1000                         ksg->ksg_cpumask = cg->cg_mask;
1001                         LIST_INIT(&ksg->ksg_members);
1002                         /*
1003                          * Find all of the group members and add them.
1004                          */
1005                         for (j = 0; j < MAXCPU; j++) {
1006                                 if ((cg->cg_mask & (1 << j)) != 0) {
1007                                         if (ksg->ksg_mask == 0)
1008                                                 ksg->ksg_mask = 1 << j;
1009                                         kseq_cpu[j].ksq_transferable = 0;
1010                                         kseq_cpu[j].ksq_group = ksg;
1011                                         LIST_INSERT_HEAD(&ksg->ksg_members,
1012                                             &kseq_cpu[j], ksq_siblings);
1013                                 }
1014                         }
1015                         if (ksg->ksg_cpus > 1)
1016                                 balance_groups = 1;
1017                 }
1018                 ksg_maxid = smp_topology->ct_count - 1;
1019         }
1020         /*
1021          * Stagger the group and global load balancer so they do not
1022          * interfere with each other.
1023          */
1024         bal_tick = ticks + hz;
1025         if (balance_groups)
1026                 gbal_tick = ticks + (hz / 2);
1027 #else
1028         kseq_setup(KSEQ_SELF());
1029 #endif
1030         mtx_lock_spin(&sched_lock);
1031         kseq_load_add(KSEQ_SELF(), &kse0);
1032         mtx_unlock_spin(&sched_lock);
1033 }
1034
1035 /*
1036  * Scale the scheduling priority according to the "interactivity" of this
1037  * process.
1038  */
1039 static void
1040 sched_priority(struct ksegrp *kg)
1041 {
1042         int pri;
1043
1044         if (kg->kg_pri_class != PRI_TIMESHARE)
1045                 return;
1046
1047         pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
1048         pri += SCHED_PRI_BASE;
1049         pri += kg->kg_proc->p_nice;
1050
1051         if (pri > PRI_MAX_TIMESHARE)
1052                 pri = PRI_MAX_TIMESHARE;
1053         else if (pri < PRI_MIN_TIMESHARE)
1054                 pri = PRI_MIN_TIMESHARE;
1055
1056         kg->kg_user_pri = pri;
1057
1058         return;
1059 }
1060
1061 /*
1062  * Calculate a time slice based on the properties of the kseg and the runq
1063  * that we're on.  This is only for PRI_TIMESHARE ksegrps.
1064  */
1065 static void
1066 sched_slice(struct kse *ke)
1067 {
1068         struct kseq *kseq;
1069         struct ksegrp *kg;
1070
1071         kg = ke->ke_ksegrp;
1072         kseq = KSEQ_CPU(ke->ke_cpu);
1073
1074         if (ke->ke_thread->td_flags & TDF_BORROWING) {
1075                 ke->ke_slice = SCHED_SLICE_MIN;
1076                 return;
1077         }
1078
1079         /*
1080          * Rationale:
1081          * KSEs in interactive ksegs get a minimal slice so that we
1082          * quickly notice if it abuses its advantage.
1083          *
1084          * KSEs in non-interactive ksegs are assigned a slice that is
1085          * based on the ksegs nice value relative to the least nice kseg
1086          * on the run queue for this cpu.
1087          *
1088          * If the KSE is less nice than all others it gets the maximum
1089          * slice and other KSEs will adjust their slice relative to
1090          * this when they first expire.
1091          *
1092          * There is 20 point window that starts relative to the least
1093          * nice kse on the run queue.  Slice size is determined by
1094          * the kse distance from the last nice ksegrp.
1095          *
1096          * If the kse is outside of the window it will get no slice
1097          * and will be reevaluated each time it is selected on the
1098          * run queue.  The exception to this is nice 0 ksegs when
1099          * a nice -20 is running.  They are always granted a minimum
1100          * slice.
1101          */
1102         if (!SCHED_INTERACTIVE(kg)) {
1103                 int nice;
1104
1105                 nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin);
1106                 if (kseq->ksq_load_timeshare == 0 ||
1107                     kg->kg_proc->p_nice < kseq->ksq_nicemin)
1108                         ke->ke_slice = SCHED_SLICE_MAX;
1109                 else if (nice <= SCHED_SLICE_NTHRESH)
1110                         ke->ke_slice = SCHED_SLICE_NICE(nice);
1111                 else if (kg->kg_proc->p_nice == 0)
1112                         ke->ke_slice = SCHED_SLICE_MIN;
1113                 else
1114                         ke->ke_slice = 0;
1115         } else
1116                 ke->ke_slice = SCHED_SLICE_INTERACTIVE;
1117
1118         return;
1119 }
1120
1121 /*
1122  * This routine enforces a maximum limit on the amount of scheduling history
1123  * kept.  It is called after either the slptime or runtime is adjusted.
1124  * This routine will not operate correctly when slp or run times have been
1125  * adjusted to more than double their maximum.
1126  */
1127 static void
1128 sched_interact_update(struct ksegrp *kg)
1129 {
1130         int sum;
1131
1132         sum = kg->kg_runtime + kg->kg_slptime;
1133         if (sum < SCHED_SLP_RUN_MAX)
1134                 return;
1135         /*
1136          * If we have exceeded by more than 1/5th then the algorithm below
1137          * will not bring us back into range.  Dividing by two here forces
1138          * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX]
1139          */
1140         if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
1141                 kg->kg_runtime /= 2;
1142                 kg->kg_slptime /= 2;
1143                 return;
1144         }
1145         kg->kg_runtime = (kg->kg_runtime / 5) * 4;
1146         kg->kg_slptime = (kg->kg_slptime / 5) * 4;
1147 }
1148
1149 static void
1150 sched_interact_fork(struct ksegrp *kg)
1151 {
1152         int ratio;
1153         int sum;
1154
1155         sum = kg->kg_runtime + kg->kg_slptime;
1156         if (sum > SCHED_SLP_RUN_FORK) {
1157                 ratio = sum / SCHED_SLP_RUN_FORK;
1158                 kg->kg_runtime /= ratio;
1159                 kg->kg_slptime /= ratio;
1160         }
1161 }
1162
1163 static int
1164 sched_interact_score(struct ksegrp *kg)
1165 {
1166         int div;
1167
1168         if (kg->kg_runtime > kg->kg_slptime) {
1169                 div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
1170                 return (SCHED_INTERACT_HALF +
1171                     (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
1172         } if (kg->kg_slptime > kg->kg_runtime) {
1173                 div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
1174                 return (kg->kg_runtime / div);
1175         }
1176
1177         /*
1178          * This can happen if slptime and runtime are 0.
1179          */
1180         return (0);
1181
1182 }
1183
1184 /*
1185  * Very early in the boot some setup of scheduler-specific
1186  * parts of proc0 and of soem scheduler resources needs to be done.
1187  * Called from:
1188  *  proc0_init()
1189  */
1190 void
1191 schedinit(void)
1192 {
1193         /*
1194          * Set up the scheduler specific parts of proc0.
1195          */
1196         proc0.p_sched = NULL; /* XXX */
1197         ksegrp0.kg_sched = &kg_sched0;
1198         thread0.td_sched = &kse0;
1199         kse0.ke_thread = &thread0;
1200         kse0.ke_state = KES_THREAD;
1201         kg_sched0.skg_concurrency = 1;
1202         kg_sched0.skg_avail_opennings = 0; /* we are already running */
1203 }
1204
1205 /*
1206  * This is only somewhat accurate since given many processes of the same
1207  * priority they will switch when their slices run out, which will be
1208  * at most SCHED_SLICE_MAX.
1209  */
1210 int
1211 sched_rr_interval(void)
1212 {
1213         return (SCHED_SLICE_MAX);
1214 }
1215
1216 static void
1217 sched_pctcpu_update(struct kse *ke)
1218 {
1219         /*
1220          * Adjust counters and watermark for pctcpu calc.
1221          */
1222         if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
1223                 /*
1224                  * Shift the tick count out so that the divide doesn't
1225                  * round away our results.
1226                  */
1227                 ke->ke_ticks <<= 10;
1228                 ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
1229                             SCHED_CPU_TICKS;
1230                 ke->ke_ticks >>= 10;
1231         } else
1232                 ke->ke_ticks = 0;
1233         ke->ke_ltick = ticks;
1234         ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
1235 }
1236
1237 void
1238 sched_thread_priority(struct thread *td, u_char prio)
1239 {
1240         struct kse *ke;
1241
1242         CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
1243             td, td->td_proc->p_comm, td->td_priority, prio, curthread,
1244             curthread->td_proc->p_comm);
1245         ke = td->td_kse;
1246         mtx_assert(&sched_lock, MA_OWNED);
1247         if (td->td_priority == prio)
1248                 return;
1249         if (TD_ON_RUNQ(td)) {
1250                 /*
1251                  * If the priority has been elevated due to priority
1252                  * propagation, we may have to move ourselves to a new
1253                  * queue.  We still call adjustrunqueue below in case kse
1254                  * needs to fix things up.
1255                  */
1256                 if (prio < td->td_priority && ke->ke_runq != NULL &&
1257                     (ke->ke_flags & KEF_ASSIGNED) == 0 &&
1258                     ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
1259                         runq_remove(ke->ke_runq, ke);
1260                         ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
1261                         runq_add(ke->ke_runq, ke, 0);
1262                 }
1263                 /*
1264                  * Hold this kse on this cpu so that sched_prio() doesn't
1265                  * cause excessive migration.  We only want migration to
1266                  * happen as the result of a wakeup.
1267                  */
1268                 ke->ke_flags |= KEF_HOLD;
1269                 adjustrunqueue(td, prio);
1270                 ke->ke_flags &= ~KEF_HOLD;
1271         } else
1272                 td->td_priority = prio;
1273 }
1274
1275 /*
1276  * Update a thread's priority when it is lent another thread's
1277  * priority.
1278  */
1279 void
1280 sched_lend_prio(struct thread *td, u_char prio)
1281 {
1282
1283         td->td_flags |= TDF_BORROWING;
1284         sched_thread_priority(td, prio);
1285 }
1286
1287 /*
1288  * Restore a thread's priority when priority propagation is
1289  * over.  The prio argument is the minimum priority the thread
1290  * needs to have to satisfy other possible priority lending
1291  * requests.  If the thread's regular priority is less
1292  * important than prio, the thread will keep a priority boost
1293  * of prio.
1294  */
1295 void
1296 sched_unlend_prio(struct thread *td, u_char prio)
1297 {
1298         u_char base_pri;
1299
1300         if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
1301             td->td_base_pri <= PRI_MAX_TIMESHARE)
1302                 base_pri = td->td_ksegrp->kg_user_pri;
1303         else
1304                 base_pri = td->td_base_pri;
1305         if (prio >= base_pri) {
1306                 td->td_flags &= ~TDF_BORROWING;
1307                 sched_thread_priority(td, base_pri);
1308         } else
1309                 sched_lend_prio(td, prio);
1310 }
1311
1312 void
1313 sched_prio(struct thread *td, u_char prio)
1314 {
1315         u_char oldprio;
1316
1317         /* First, update the base priority. */
1318         td->td_base_pri = prio;
1319
1320         /*
1321          * If the thread is borrowing another thread's priority, don't
1322          * ever lower the priority.
1323          */
1324         if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
1325                 return;
1326
1327         /* Change the real priority. */
1328         oldprio = td->td_priority;
1329         sched_thread_priority(td, prio);
1330
1331         /*
1332          * If the thread is on a turnstile, then let the turnstile update
1333          * its state.
1334          */
1335         if (TD_ON_LOCK(td) && oldprio != prio)
1336                 turnstile_adjust(td, oldprio);
1337 }
1338
1339 void
1340 sched_switch(struct thread *td, struct thread *newtd, int flags)
1341 {
1342         struct kseq *ksq;
1343         struct kse *ke;
1344
1345         mtx_assert(&sched_lock, MA_OWNED);
1346
1347         ke = td->td_kse;
1348         ksq = KSEQ_SELF();
1349
1350         td->td_lastcpu = td->td_oncpu;
1351         td->td_oncpu = NOCPU;
1352         td->td_flags &= ~TDF_NEEDRESCHED;
1353         td->td_owepreempt = 0;
1354
1355         /*
1356          * If the KSE has been assigned it may be in the process of switching
1357          * to the new cpu.  This is the case in sched_bind().
1358          */
1359         if (td == PCPU_GET(idlethread)) {
1360                 TD_SET_CAN_RUN(td);
1361         } else if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
1362                 /* We are ending our run so make our slot available again */
1363                 SLOT_RELEASE(td->td_ksegrp);
1364                 kseq_load_rem(ksq, ke);
1365                 if (TD_IS_RUNNING(td)) {
1366                         /*
1367                          * Don't allow the thread to migrate
1368                          * from a preemption.
1369                          */
1370                         ke->ke_flags |= KEF_HOLD;
1371                         setrunqueue(td, (flags & SW_PREEMPT) ?
1372                             SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
1373                             SRQ_OURSELF|SRQ_YIELDING);
1374                         ke->ke_flags &= ~KEF_HOLD;
1375                 } else if ((td->td_proc->p_flag & P_HADTHREADS) &&
1376                     (newtd == NULL || newtd->td_ksegrp != td->td_ksegrp))
1377                         /*
1378                          * We will not be on the run queue.
1379                          * So we must be sleeping or similar.
1380                          * Don't use the slot if we will need it
1381                          * for newtd.
1382                          */
1383                         slot_fill(td->td_ksegrp);
1384         }
1385         if (newtd != NULL) {
1386                 /*
1387                  * If we bring in a thread,
1388                  * then account for it as if it had been added to the
1389                  * run queue and then chosen.
1390                  */
1391                 newtd->td_kse->ke_flags |= KEF_DIDRUN;
1392                 newtd->td_kse->ke_runq = ksq->ksq_curr;
1393                 SLOT_USE(newtd->td_ksegrp);
1394                 TD_SET_RUNNING(newtd);
1395                 kseq_load_add(KSEQ_SELF(), newtd->td_kse);
1396         } else
1397                 newtd = choosethread();
1398         if (td != newtd) {
1399 #ifdef  HWPMC_HOOKS
1400                 if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1401                         PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1402 #endif
1403                 cpu_switch(td, newtd);
1404 #ifdef  HWPMC_HOOKS
1405                 if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1406                         PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1407 #endif
1408         }
1409
1410         sched_lock.mtx_lock = (uintptr_t)td;
1411
1412         td->td_oncpu = PCPU_GET(cpuid);
1413 }
1414
1415 void
1416 sched_nice(struct proc *p, int nice)
1417 {
1418         struct ksegrp *kg;
1419         struct kse *ke;
1420         struct thread *td;
1421         struct kseq *kseq;
1422
1423         PROC_LOCK_ASSERT(p, MA_OWNED);
1424         mtx_assert(&sched_lock, MA_OWNED);
1425         /*
1426          * We need to adjust the nice counts for running KSEs.
1427          */
1428         FOREACH_KSEGRP_IN_PROC(p, kg) {
1429                 if (kg->kg_pri_class == PRI_TIMESHARE) {
1430                         FOREACH_THREAD_IN_GROUP(kg, td) {
1431                                 ke = td->td_kse;
1432                                 if (ke->ke_runq == NULL)
1433                                         continue;
1434                                 kseq = KSEQ_CPU(ke->ke_cpu);
1435                                 kseq_nice_rem(kseq, p->p_nice);
1436                                 kseq_nice_add(kseq, nice);
1437                         }
1438                 }
1439         }
1440         p->p_nice = nice;
1441         FOREACH_KSEGRP_IN_PROC(p, kg) {
1442                 sched_priority(kg);
1443                 FOREACH_THREAD_IN_GROUP(kg, td)
1444                         td->td_flags |= TDF_NEEDRESCHED;
1445         }
1446 }
1447
1448 void
1449 sched_sleep(struct thread *td)
1450 {
1451         mtx_assert(&sched_lock, MA_OWNED);
1452
1453         td->td_slptime = ticks;
1454 }
1455
1456 void
1457 sched_wakeup(struct thread *td)
1458 {
1459         mtx_assert(&sched_lock, MA_OWNED);
1460
1461         /*
1462          * Let the kseg know how long we slept for.  This is because process
1463          * interactivity behavior is modeled in the kseg.
1464          */
1465         if (td->td_slptime) {
1466                 struct ksegrp *kg;
1467                 int hzticks;
1468
1469                 kg = td->td_ksegrp;
1470                 hzticks = (ticks - td->td_slptime) << 10;
1471                 if (hzticks >= SCHED_SLP_RUN_MAX) {
1472                         kg->kg_slptime = SCHED_SLP_RUN_MAX;
1473                         kg->kg_runtime = 1;
1474                 } else {
1475                         kg->kg_slptime += hzticks;
1476                         sched_interact_update(kg);
1477                 }
1478                 sched_priority(kg);
1479                 sched_slice(td->td_kse);
1480                 td->td_slptime = 0;
1481         }
1482         setrunqueue(td, SRQ_BORING);
1483 }
1484
1485 /*
1486  * Penalize the parent for creating a new child and initialize the child's
1487  * priority.
1488  */
1489 void
1490 sched_fork(struct thread *td, struct thread *childtd)
1491 {
1492
1493         mtx_assert(&sched_lock, MA_OWNED);
1494
1495         sched_fork_ksegrp(td, childtd->td_ksegrp);
1496         sched_fork_thread(td, childtd);
1497 }
1498
1499 void
1500 sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
1501 {
1502         struct ksegrp *kg = td->td_ksegrp;
1503         mtx_assert(&sched_lock, MA_OWNED);
1504
1505         child->kg_slptime = kg->kg_slptime;
1506         child->kg_runtime = kg->kg_runtime;
1507         child->kg_user_pri = kg->kg_user_pri;
1508         sched_interact_fork(child);
1509         kg->kg_runtime += tickincr << 10;
1510         sched_interact_update(kg);
1511 }
1512
1513 void
1514 sched_fork_thread(struct thread *td, struct thread *child)
1515 {
1516         struct kse *ke;
1517         struct kse *ke2;
1518
1519         sched_newthread(child);
1520         ke = td->td_kse;
1521         ke2 = child->td_kse;
1522         ke2->ke_slice = 1;      /* Attempt to quickly learn interactivity. */
1523         ke2->ke_cpu = ke->ke_cpu;
1524         ke2->ke_runq = NULL;
1525
1526         /* Grab our parents cpu estimation information. */
1527         ke2->ke_ticks = ke->ke_ticks;
1528         ke2->ke_ltick = ke->ke_ltick;
1529         ke2->ke_ftick = ke->ke_ftick;
1530 }
1531
1532 void
1533 sched_class(struct ksegrp *kg, int class)
1534 {
1535         struct kseq *kseq;
1536         struct kse *ke;
1537         struct thread *td;
1538         int nclass;
1539         int oclass;
1540
1541         mtx_assert(&sched_lock, MA_OWNED);
1542         if (kg->kg_pri_class == class)
1543                 return;
1544
1545         nclass = PRI_BASE(class);
1546         oclass = PRI_BASE(kg->kg_pri_class);
1547         FOREACH_THREAD_IN_GROUP(kg, td) {
1548                 ke = td->td_kse;
1549                 if ((ke->ke_state != KES_ONRUNQ &&
1550                     ke->ke_state != KES_THREAD) || ke->ke_runq == NULL)
1551                         continue;
1552                 kseq = KSEQ_CPU(ke->ke_cpu);
1553
1554 #ifdef SMP
1555                 /*
1556                  * On SMP if we're on the RUNQ we must adjust the transferable
1557                  * count because could be changing to or from an interrupt
1558                  * class.
1559                  */
1560                 if (ke->ke_state == KES_ONRUNQ) {
1561                         if (KSE_CAN_MIGRATE(ke)) {
1562                                 kseq->ksq_transferable--;
1563                                 kseq->ksq_group->ksg_transferable--;
1564                         }
1565                         if (KSE_CAN_MIGRATE(ke)) {
1566                                 kseq->ksq_transferable++;
1567                                 kseq->ksq_group->ksg_transferable++;
1568                         }
1569                 }
1570 #endif
1571                 if (oclass == PRI_TIMESHARE) {
1572                         kseq->ksq_load_timeshare--;
1573                         kseq_nice_rem(kseq, kg->kg_proc->p_nice);
1574                 }
1575                 if (nclass == PRI_TIMESHARE) {
1576                         kseq->ksq_load_timeshare++;
1577                         kseq_nice_add(kseq, kg->kg_proc->p_nice);
1578                 }
1579         }
1580
1581         kg->kg_pri_class = class;
1582 }
1583
1584 /*
1585  * Return some of the child's priority and interactivity to the parent.
1586  */
1587 void
1588 sched_exit(struct proc *p, struct thread *childtd)
1589 {
1590         mtx_assert(&sched_lock, MA_OWNED);
1591         sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), childtd);
1592         sched_exit_thread(NULL, childtd);
1593 }
1594
1595 void
1596 sched_exit_ksegrp(struct ksegrp *kg, struct thread *td)
1597 {
1598         /* kg->kg_slptime += td->td_ksegrp->kg_slptime; */
1599         kg->kg_runtime += td->td_ksegrp->kg_runtime;
1600         sched_interact_update(kg);
1601 }
1602
1603 void
1604 sched_exit_thread(struct thread *td, struct thread *childtd)
1605 {
1606         CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
1607             childtd, childtd->td_proc->p_comm, childtd->td_priority);
1608         kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse);
1609 }
1610
1611 void
1612 sched_clock(struct thread *td)
1613 {
1614         struct kseq *kseq;
1615         struct ksegrp *kg;
1616         struct kse *ke;
1617
1618         mtx_assert(&sched_lock, MA_OWNED);
1619         kseq = KSEQ_SELF();
1620 #ifdef SMP
1621         if (ticks >= bal_tick)
1622                 sched_balance();
1623         if (ticks >= gbal_tick && balance_groups)
1624                 sched_balance_groups();
1625         /*
1626          * We could have been assigned a non real-time thread without an
1627          * IPI.
1628          */
1629         if (kseq->ksq_assigned)
1630                 kseq_assign(kseq);      /* Potentially sets NEEDRESCHED */
1631 #endif
1632         /*
1633          * sched_setup() apparently happens prior to stathz being set.  We
1634          * need to resolve the timers earlier in the boot so we can avoid
1635          * calculating this here.
1636          */
1637         if (realstathz == 0) {
1638                 realstathz = stathz ? stathz : hz;
1639                 tickincr = hz / realstathz;
1640                 /*
1641                  * XXX This does not work for values of stathz that are much
1642                  * larger than hz.
1643                  */
1644                 if (tickincr == 0)
1645                         tickincr = 1;
1646         }
1647
1648         ke = td->td_kse;
1649         kg = ke->ke_ksegrp;
1650
1651         /* Adjust ticks for pctcpu */
1652         ke->ke_ticks++;
1653         ke->ke_ltick = ticks;
1654
1655         /* Go up to one second beyond our max and then trim back down */
1656         if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
1657                 sched_pctcpu_update(ke);
1658
1659         if (td->td_flags & TDF_IDLETD)
1660                 return;
1661         /*
1662          * We only do slicing code for TIMESHARE ksegrps.
1663          */
1664         if (kg->kg_pri_class != PRI_TIMESHARE)
1665                 return;
1666         /*
1667          * We used a tick charge it to the ksegrp so that we can compute our
1668          * interactivity.
1669          */
1670         kg->kg_runtime += tickincr << 10;
1671         sched_interact_update(kg);
1672
1673         /*
1674          * We used up one time slice.
1675          */
1676         if (--ke->ke_slice > 0)
1677                 return;
1678         /*
1679          * We're out of time, recompute priorities and requeue.
1680          */
1681         kseq_load_rem(kseq, ke);
1682         sched_priority(kg);
1683         sched_slice(ke);
1684         if (SCHED_CURR(kg, ke))
1685                 ke->ke_runq = kseq->ksq_curr;
1686         else
1687                 ke->ke_runq = kseq->ksq_next;
1688         kseq_load_add(kseq, ke);
1689         td->td_flags |= TDF_NEEDRESCHED;
1690 }
1691
1692 int
1693 sched_runnable(void)
1694 {
1695         struct kseq *kseq;
1696         int load;
1697
1698         load = 1;
1699
1700         kseq = KSEQ_SELF();
1701 #ifdef SMP
1702         if (kseq->ksq_assigned) {
1703                 mtx_lock_spin(&sched_lock);
1704                 kseq_assign(kseq);
1705                 mtx_unlock_spin(&sched_lock);
1706         }
1707 #endif
1708         if ((curthread->td_flags & TDF_IDLETD) != 0) {
1709                 if (kseq->ksq_load > 0)
1710                         goto out;
1711         } else
1712                 if (kseq->ksq_load - 1 > 0)
1713                         goto out;
1714         load = 0;
1715 out:
1716         return (load);
1717 }
1718
1719 void
1720 sched_userret(struct thread *td)
1721 {
1722         struct ksegrp *kg;
1723
1724         KASSERT((td->td_flags & TDF_BORROWING) == 0,
1725             ("thread with borrowed priority returning to userland"));
1726         kg = td->td_ksegrp;
1727         if (td->td_priority != kg->kg_user_pri) {
1728                 mtx_lock_spin(&sched_lock);
1729                 td->td_priority = kg->kg_user_pri;
1730                 td->td_base_pri = kg->kg_user_pri;
1731                 mtx_unlock_spin(&sched_lock);
1732         }
1733 }
1734
1735 struct kse *
1736 sched_choose(void)
1737 {
1738         struct kseq *kseq;
1739         struct kse *ke;
1740
1741         mtx_assert(&sched_lock, MA_OWNED);
1742         kseq = KSEQ_SELF();
1743 #ifdef SMP
1744 restart:
1745         if (kseq->ksq_assigned)
1746                 kseq_assign(kseq);
1747 #endif
1748         ke = kseq_choose(kseq);
1749         if (ke) {
1750 #ifdef SMP
1751                 if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
1752                         if (kseq_idled(kseq) == 0)
1753                                 goto restart;
1754 #endif
1755                 kseq_runq_rem(kseq, ke);
1756                 ke->ke_state = KES_THREAD;
1757                 return (ke);
1758         }
1759 #ifdef SMP
1760         if (kseq_idled(kseq) == 0)
1761                 goto restart;
1762 #endif
1763         return (NULL);
1764 }
1765
1766 void
1767 sched_add(struct thread *td, int flags)
1768 {
1769         struct kseq *kseq;
1770         struct ksegrp *kg;
1771         struct kse *ke;
1772         int preemptive;
1773         int canmigrate;
1774         int class;
1775
1776         CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
1777             td, td->td_proc->p_comm, td->td_priority, curthread,
1778             curthread->td_proc->p_comm);
1779         mtx_assert(&sched_lock, MA_OWNED);
1780         ke = td->td_kse;
1781         kg = td->td_ksegrp;
1782         canmigrate = 1;
1783         preemptive = !(flags & SRQ_YIELDING);
1784         class = PRI_BASE(kg->kg_pri_class);
1785         kseq = KSEQ_SELF();
1786         if ((ke->ke_flags & KEF_INTERNAL) == 0)
1787                 SLOT_USE(td->td_ksegrp);
1788         ke->ke_flags &= ~KEF_INTERNAL;
1789 #ifdef SMP
1790         if (ke->ke_flags & KEF_ASSIGNED) {
1791                 if (ke->ke_flags & KEF_REMOVED)
1792                         ke->ke_flags &= ~KEF_REMOVED;
1793                 return;
1794         }
1795         canmigrate = KSE_CAN_MIGRATE(ke);
1796 #endif
1797         KASSERT(ke->ke_state != KES_ONRUNQ,
1798             ("sched_add: kse %p (%s) already in run queue", ke,
1799             ke->ke_proc->p_comm));
1800         KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1801             ("sched_add: process swapped out"));
1802         KASSERT(ke->ke_runq == NULL,
1803             ("sched_add: KSE %p is still assigned to a run queue", ke));
1804         switch (class) {
1805         case PRI_ITHD:
1806         case PRI_REALTIME:
1807                 ke->ke_runq = kseq->ksq_curr;
1808                 ke->ke_slice = SCHED_SLICE_MAX;
1809                 if (canmigrate)
1810                         ke->ke_cpu = PCPU_GET(cpuid);
1811                 break;
1812         case PRI_TIMESHARE:
1813                 if (SCHED_CURR(kg, ke))
1814                         ke->ke_runq = kseq->ksq_curr;
1815                 else
1816                         ke->ke_runq = kseq->ksq_next;
1817                 break;
1818         case PRI_IDLE:
1819                 /*
1820                  * This is for priority prop.
1821                  */
1822                 if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
1823                         ke->ke_runq = kseq->ksq_curr;
1824                 else
1825                         ke->ke_runq = &kseq->ksq_idle;
1826                 ke->ke_slice = SCHED_SLICE_MIN;
1827                 break;
1828         default:
1829                 panic("Unknown pri class.");
1830                 break;
1831         }
1832 #ifdef SMP
1833         /*
1834          * Don't migrate running threads here.  Force the long term balancer
1835          * to do it.
1836          */
1837         if (ke->ke_flags & KEF_HOLD) {
1838                 ke->ke_flags &= ~KEF_HOLD;
1839                 canmigrate = 0;
1840         }
1841         /*
1842          * If this thread is pinned or bound, notify the target cpu.
1843          */
1844         if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid) ) {
1845                 ke->ke_runq = NULL;
1846                 kseq_notify(ke, ke->ke_cpu);
1847                 return;
1848         }
1849         /*
1850          * If we had been idle, clear our bit in the group and potentially
1851          * the global bitmap.  If not, see if we should transfer this thread.
1852          */
1853         if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
1854             (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
1855                 /*
1856                  * Check to see if our group is unidling, and if so, remove it
1857                  * from the global idle mask.
1858                  */
1859                 if (kseq->ksq_group->ksg_idlemask ==
1860                     kseq->ksq_group->ksg_cpumask)
1861                         atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
1862                 /*
1863                  * Now remove ourselves from the group specific idle mask.
1864                  */
1865                 kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
1866         } else if (canmigrate && kseq->ksq_load > 1 && class != PRI_ITHD)
1867                 if (kseq_transfer(kseq, ke, class))
1868                         return;
1869         ke->ke_cpu = PCPU_GET(cpuid);
1870 #endif
1871         if (td->td_priority < curthread->td_priority &&
1872             ke->ke_runq == kseq->ksq_curr)
1873                 curthread->td_flags |= TDF_NEEDRESCHED;
1874         if (preemptive && maybe_preempt(td))
1875                 return;
1876         ke->ke_state = KES_ONRUNQ;
1877
1878         kseq_runq_add(kseq, ke, flags);
1879         kseq_load_add(kseq, ke);
1880 }
1881
1882 void
1883 sched_rem(struct thread *td)
1884 {
1885         struct kseq *kseq;
1886         struct kse *ke;
1887
1888         CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
1889             td, td->td_proc->p_comm, td->td_priority, curthread,
1890             curthread->td_proc->p_comm);
1891         mtx_assert(&sched_lock, MA_OWNED);
1892         ke = td->td_kse;
1893         SLOT_RELEASE(td->td_ksegrp);
1894         if (ke->ke_flags & KEF_ASSIGNED) {
1895                 ke->ke_flags |= KEF_REMOVED;
1896                 return;
1897         }
1898         KASSERT((ke->ke_state == KES_ONRUNQ),
1899             ("sched_rem: KSE not on run queue"));
1900
1901         ke->ke_state = KES_THREAD;
1902         kseq = KSEQ_CPU(ke->ke_cpu);
1903         kseq_runq_rem(kseq, ke);
1904         kseq_load_rem(kseq, ke);
1905 }
1906
1907 fixpt_t
1908 sched_pctcpu(struct thread *td)
1909 {
1910         fixpt_t pctcpu;
1911         struct kse *ke;
1912
1913         pctcpu = 0;
1914         ke = td->td_kse;
1915         if (ke == NULL)
1916                 return (0);
1917
1918         mtx_lock_spin(&sched_lock);
1919         if (ke->ke_ticks) {
1920                 int rtick;
1921
1922                 /*
1923                  * Don't update more frequently than twice a second.  Allowing
1924                  * this causes the cpu usage to decay away too quickly due to
1925                  * rounding errors.
1926                  */
1927                 if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
1928                     ke->ke_ltick < (ticks - (hz / 2)))
1929                         sched_pctcpu_update(ke);
1930                 /* How many rtick per second ? */
1931                 rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
1932                 pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
1933         }
1934
1935         ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
1936         mtx_unlock_spin(&sched_lock);
1937
1938         return (pctcpu);
1939 }
1940
1941 void
1942 sched_bind(struct thread *td, int cpu)
1943 {
1944         struct kse *ke;
1945
1946         mtx_assert(&sched_lock, MA_OWNED);
1947         ke = td->td_kse;
1948         ke->ke_flags |= KEF_BOUND;
1949 #ifdef SMP
1950         if (PCPU_GET(cpuid) == cpu)
1951                 return;
1952         /* sched_rem without the runq_remove */
1953         ke->ke_state = KES_THREAD;
1954         kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
1955         kseq_notify(ke, cpu);
1956         /* When we return from mi_switch we'll be on the correct cpu. */
1957         mi_switch(SW_VOL, NULL);
1958 #endif
1959 }
1960
1961 void
1962 sched_unbind(struct thread *td)
1963 {
1964         mtx_assert(&sched_lock, MA_OWNED);
1965         td->td_kse->ke_flags &= ~KEF_BOUND;
1966 }
1967
1968 int
1969 sched_is_bound(struct thread *td)
1970 {
1971         mtx_assert(&sched_lock, MA_OWNED);
1972         return (td->td_kse->ke_flags & KEF_BOUND);
1973 }
1974
1975 int
1976 sched_load(void)
1977 {
1978 #ifdef SMP
1979         int total;
1980         int i;
1981
1982         total = 0;
1983         for (i = 0; i <= ksg_maxid; i++)
1984                 total += KSEQ_GROUP(i)->ksg_load;
1985         return (total);
1986 #else
1987         return (KSEQ_SELF()->ksq_sysload);
1988 #endif
1989 }
1990
1991 int
1992 sched_sizeof_ksegrp(void)
1993 {
1994         return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1995 }
1996
1997 int
1998 sched_sizeof_proc(void)
1999 {
2000         return (sizeof(struct proc));
2001 }
2002
2003 int
2004 sched_sizeof_thread(void)
2005 {
2006         return (sizeof(struct thread) + sizeof(struct td_sched));
2007 }
2008 #define KERN_SWITCH_INCLUDE 1
2009 #include "kern/kern_switch.c"