sys/kern/sched_core.c

   1 /*-
   2  * Copyright (c) 2005-2006, David Xu <yfxu@corp.netease.com>
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice unmodified, this list of conditions, and the following
  10  *    disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  18  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25  */
  26
  27 #include <sys/cdefs.h>
  28 __FBSDID("$FreeBSD$");
  29
  30 #include "opt_hwpmc_hooks.h"
  31 #include "opt_sched.h"
  32
  33 #define kse td_sched
  34
  35 #include <sys/param.h>
  36 #include <sys/systm.h>
  37 #include <sys/kdb.h>
  38 #include <sys/kernel.h>
  39 #include <sys/kthread.h>
  40 #include <sys/ktr.h>
  41 #include <sys/lock.h>
  42 #include <sys/mutex.h>
  43 #include <sys/proc.h>
  44 #include <sys/resource.h>
  45 #include <sys/resourcevar.h>
  46 #include <sys/sched.h>
  47 #include <sys/smp.h>
  48 #include <sys/sx.h>
  49 #include <sys/sysctl.h>
  50 #include <sys/sysproto.h>
  51 #include <sys/turnstile.h>
  52 #include <sys/umtx.h>
  53 #include <sys/unistd.h>
  54 #include <sys/vmmeter.h>
  55 #ifdef KTRACE
  56 #include <sys/uio.h>
  57 #include <sys/ktrace.h>
  58 #endif
  59
  60 #ifdef HWPMC_HOOKS
  61 #include <sys/pmckern.h>
  62 #endif
  63
  64 #include <machine/cpu.h>
  65 #include <machine/smp.h>
  66
  67 /* get process's nice value, skip value 20 which is not supported */
  68 #define PROC_NICE(p)            MIN((p)->p_nice, 19)
  69
  70 /* convert nice to kernel thread priority */
  71 #define NICE_TO_PRI(nice)       (PUSER + 20 + (nice))
  72
  73 /* get process's static priority */
  74 #define PROC_PRI(p)             NICE_TO_PRI(PROC_NICE(p))
  75
  76 /* convert kernel thread priority to user priority */
  77 #define USER_PRI(pri)           MIN((pri) - PUSER, 39)
  78
  79 /* convert nice value to user priority */
  80 #define PROC_USER_PRI(p)        (PROC_NICE(p) + 20)
  81
  82 /* maximum user priority, highest prio + 1 */
  83 #define MAX_USER_PRI            40
  84
  85 /* maximum kernel priority its nice is 19 */
  86 #define PUSER_MAX               (PUSER + 39)
  87
  88 /* ticks and nanosecond converters */
  89 #define NS_TO_HZ(n)             ((n) / (1000000000 / hz))
  90 #define HZ_TO_NS(h)             ((h) * (1000000000 / hz))
  91
  92 /* ticks and microsecond converters */
  93 #define MS_TO_HZ(m)             ((m) / (1000000 / hz))
  94
  95 #define PRI_SCORE_RATIO         25
  96 #define MAX_SCORE               (MAX_USER_PRI * PRI_SCORE_RATIO / 100)
  97 #define MAX_SLEEP_TIME          (def_timeslice * MAX_SCORE)
  98 #define NS_MAX_SLEEP_TIME       (HZ_TO_NS(MAX_SLEEP_TIME))
  99 #define STARVATION_TIME         (MAX_SLEEP_TIME)
 100
 101 #define CURRENT_SCORE(kg)       \
 102    (MAX_SCORE * NS_TO_HZ((kg)->kg_slptime) / MAX_SLEEP_TIME)
 103
 104 #define SCALE_USER_PRI(x, upri) \
 105     MAX(x * (upri + 1) / (MAX_USER_PRI/2), min_timeslice)
 106
 107 /*
 108  * For a thread whose nice is zero, the score is used to determine
 109  * if it is an interactive thread.
 110  */
 111 #define INTERACTIVE_BASE_SCORE  (MAX_SCORE * 20)/100
 112
 113 /*
 114  * Calculate a score which a thread must have to prove itself is
 115  * an interactive thread.
 116  */
 117 #define INTERACTIVE_SCORE(ke)           \
 118     (PROC_NICE((ke)->ke_proc) * MAX_SCORE / 40 + INTERACTIVE_BASE_SCORE)
 119
 120 /* Test if a thread is an interactive thread */
 121 #define THREAD_IS_INTERACTIVE(ke)       \
 122     ((ke)->ke_ksegrp->kg_user_pri <=    \
 123         PROC_PRI((ke)->ke_proc) - INTERACTIVE_SCORE(ke))
 124
 125 /*
 126  * Calculate how long a thread must sleep to prove itself is an
 127  * interactive sleep.
 128  */
 129 #define INTERACTIVE_SLEEP_TIME(ke)      \
 130     (HZ_TO_NS(MAX_SLEEP_TIME *          \
 131         (MAX_SCORE / 2 + INTERACTIVE_SCORE((ke)) + 1) / MAX_SCORE - 1))
 132
 133 #define CHILD_WEIGHT    90
 134 #define PARENT_WEIGHT   90
 135 #define EXIT_WEIGHT     3
 136
 137 #define SCHED_LOAD_SCALE        128UL
 138
 139 #define IDLE            0
 140 #define IDLE_IDLE       1
 141 #define NOT_IDLE        2
 142
 143 #define KQB_LEN         (8)             /* Number of priority status words. */
 144 #define KQB_L2BPW       (5)             /* Log2(sizeof(rqb_word_t) * NBBY)). */
 145 #define KQB_BPW         (1<<KQB_L2BPW)  /* Bits in an rqb_word_t. */
 146
 147 #define KQB_BIT(pri)    (1 << ((pri) & (KQB_BPW - 1)))
 148 #define KQB_WORD(pri)   ((pri) >> KQB_L2BPW)
 149 #define KQB_FFS(word)   (ffs(word) - 1)
 150
 151 #define KQ_NQS          256
 152
 153 /*
 154  * Type of run queue status word.
 155  */
 156 typedef u_int32_t       kqb_word_t;
 157
 158 /*
 159  * Head of run queues.
 160  */
 161 TAILQ_HEAD(krqhead, kse);
 162
 163 /*
 164  * Bit array which maintains the status of a run queue.  When a queue is
 165  * non-empty the bit corresponding to the queue number will be set.
 166  */
 167 struct krqbits {
 168         kqb_word_t      rqb_bits[KQB_LEN];
 169 };
 170
 171 /*
 172  * Run queue structure. Contains an array of run queues on which processes
 173  * are placed, and a structure to maintain the status of each queue.
 174  */
 175 struct krunq {
 176         struct krqbits  rq_status;
 177         struct krqhead  rq_queues[KQ_NQS];
 178 };
 179
 180 /*
 181  * The following datastructures are allocated within their parent structure
 182  * but are scheduler specific.
 183  */
 184 /*
 185  * The schedulable entity that can be given a context to run.  A process may
 186  * have several of these.
 187  */
 188 struct kse {
 189         struct thread   *ke_thread;     /* (*) Active associated thread. */
 190         TAILQ_ENTRY(kse) ke_procq;      /* (j/z) Run queue. */
 191         int             ke_flags;       /* (j) KEF_* flags. */
 192         fixpt_t         ke_pctcpu;      /* (j) %cpu during p_swtime. */
 193         u_char          ke_rqindex;     /* (j) Run queue index. */
 194         enum {
 195                 KES_THREAD = 0x0,       /* slaved to thread state */
 196                 KES_ONRUNQ
 197         } ke_state;                     /* (j) thread sched specific status. */
 198         int             ke_slice;       /* Time slice in ticks */
 199         struct kseq     *ke_kseq;       /* Kseq the thread belongs to */
 200         struct krunq    *ke_runq;       /* Assiociated runqueue */
 201 #ifdef SMP
 202         int             ke_cpu;         /* CPU that we have affinity for. */
 203         int             ke_wakeup_cpu;  /* CPU that has activated us. */
 204 #endif
 205         int             ke_activated;   /* How is the thread activated. */
 206         uint64_t        ke_timestamp;   /* Last timestamp dependent on state.*/
 207         unsigned        ke_lastran;     /* Last timestamp the thread ran. */
 208
 209         /* The following variables are only used for pctcpu calculation */
 210         int             ke_ltick;       /* Last tick that we were running on */
 211         int             ke_ftick;       /* First tick that we were running on */
 212         int             ke_ticks;       /* Tick count */
 213 };
 214
 215 #define td_kse                  td_sched
 216 #define ke_proc                 ke_thread->td_proc
 217 #define ke_ksegrp               ke_thread->td_ksegrp
 218
 219 /* flags kept in ke_flags */
 220 #define KEF_BOUND       0x0001          /* Thread can not migrate. */
 221 #define KEF_PREEMPTED   0x0002          /* Thread was preempted. */
 222 #define KEF_MIGRATING   0x0004          /* Thread is migrating. */
 223 #define KEF_SLEEP       0x0008          /* Thread did sleep. */
 224 #define KEF_DIDRUN      0x0010          /* Thread actually ran. */
 225 #define KEF_EXIT        0x0020          /* Thread is being killed. */
 226 #define KEF_NEXTRQ      0x0400          /* Thread should be in next queue. */
 227 #define KEF_FIRST_SLICE 0x0800          /* Thread has first time slice left. */
 228
 229 struct kg_sched {
 230         struct thread   *skg_last_assigned; /* (j) Last thread assigned to */
 231                                             /* the system scheduler */
 232         u_long  skg_slptime;            /* (j) Number of ticks we vol. slept */
 233         u_long  skg_runtime;            /* (j) Temp total run time. */
 234         int     skg_avail_opennings;    /* (j) Num unfilled slots in group.*/
 235         int     skg_concurrency;        /* (j) Num threads requested in group.*/
 236 };
 237 #define kg_last_assigned        kg_sched->skg_last_assigned
 238 #define kg_avail_opennings      kg_sched->skg_avail_opennings
 239 #define kg_concurrency          kg_sched->skg_concurrency
 240 #define kg_slptime              kg_sched->skg_slptime
 241 #define kg_runtime              kg_sched->skg_runtime
 242
 243 #define SLOT_RELEASE(kg)        (kg)->kg_avail_opennings++
 244 #define SLOT_USE(kg)            (kg)->kg_avail_opennings--
 245
 246 /*
 247  * Cpu percentage computation macros and defines.
 248  *
 249  * SCHED_CPU_TIME:      Number of seconds to average the cpu usage across.
 250  * SCHED_CPU_TICKS:     Number of hz ticks to average the cpu usage across.
 251  */
 252
 253 #define SCHED_CPU_TIME          10
 254 #define SCHED_CPU_TICKS         (hz * SCHED_CPU_TIME)
 255
 256 /*
 257  * kseq - per processor runqs and statistics.
 258  */
 259 struct kseq {
 260         struct krunq    *ksq_curr;              /* Current queue. */
 261         struct krunq    *ksq_next;              /* Next timeshare queue. */
 262         struct krunq    ksq_timeshare[2];       /* Run queues for !IDLE. */
 263         struct krunq    ksq_idle;               /* Queue of IDLE threads. */
 264         int             ksq_load;
 265         uint64_t        ksq_last_timestamp;     /* Per-cpu last clock tick */
 266         unsigned        ksq_expired_tick;       /* First expired tick */
 267         signed char     ksq_expired_nice;       /* Lowest nice in nextq */
 268 };
 269
 270 static struct kse kse0;
 271 static struct kg_sched kg_sched0;
 272
 273 static int min_timeslice = 5;
 274 static int def_timeslice = 100;
 275 static int granularity = 10;
 276 static int realstathz;
 277 static int sched_tdcnt;
 278 static struct kseq kseq_global;
 279
 280 /*
 281  * One kse queue per processor.
 282  */
 283 #ifdef SMP
 284 static struct kseq      kseq_cpu[MAXCPU];
 285
 286 #define KSEQ_SELF()     (&kseq_cpu[PCPU_GET(cpuid)])
 287 #define KSEQ_CPU(x)     (&kseq_cpu[(x)])
 288 #define KSEQ_ID(x)      ((x) - kseq_cpu)
 289
 290 static cpumask_t        cpu_sibling[MAXCPU];
 291
 292 #else   /* !SMP */
 293
 294 #define KSEQ_SELF()     (&kseq_global)
 295 #define KSEQ_CPU(x)     (&kseq_global)
 296 #endif
 297
 298 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
 299 static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
 300 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
 301
 302 static void sched_setup(void *dummy);
 303 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
 304
 305 static void sched_initticks(void *dummy);
 306 SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)
 307
 308 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
 309
 310 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "CORE", 0,
 311     "Scheduler name");
 312
 313 #ifdef SMP
 314 /* Enable forwarding of wakeups to all other cpus */
 315 SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD, NULL, "Kernel SMP");
 316
 317 static int runq_fuzz = 0;
 318 SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
 319
 320 static int forward_wakeup_enabled = 1;
 321 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
 322            &forward_wakeup_enabled, 0,
 323            "Forwarding of wakeup to idle CPUs");
 324
 325 static int forward_wakeups_requested = 0;
 326 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
 327            &forward_wakeups_requested, 0,
 328            "Requests for Forwarding of wakeup to idle CPUs");
 329
 330 static int forward_wakeups_delivered = 0;
 331 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
 332            &forward_wakeups_delivered, 0,
 333            "Completed Forwarding of wakeup to idle CPUs");
 334
 335 static int forward_wakeup_use_mask = 1;
 336 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
 337            &forward_wakeup_use_mask, 0,
 338            "Use the mask of idle cpus");
 339
 340 static int forward_wakeup_use_loop = 0;
 341 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
 342            &forward_wakeup_use_loop, 0,
 343            "Use a loop to find idle cpus");
 344
 345 static int forward_wakeup_use_single = 0;
 346 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, onecpu, CTLFLAG_RW,
 347            &forward_wakeup_use_single, 0,
 348            "Only signal one idle cpu");
 349
 350 static int forward_wakeup_use_htt = 0;
 351 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, htt2, CTLFLAG_RW,
 352            &forward_wakeup_use_htt, 0,
 353            "account for htt");
 354 #endif
 355
 356 static void slot_fill(struct ksegrp *);
 357
 358 static void krunq_add(struct krunq *, struct kse *);
 359 static struct kse *krunq_choose(struct krunq *);
 360 static void krunq_clrbit(struct krunq *rq, int pri);
 361 static int krunq_findbit(struct krunq *rq);
 362 static void krunq_init(struct krunq *);
 363 static void krunq_remove(struct krunq *, struct kse *);
 364
 365 static struct kse * kseq_choose(struct kseq *);
 366 static void kseq_load_add(struct kseq *, struct kse *);
 367 static void kseq_load_rem(struct kseq *, struct kse *);
 368 static void kseq_runq_add(struct kseq *, struct kse *);
 369 static void kseq_runq_rem(struct kseq *, struct kse *);
 370 static void kseq_setup(struct kseq *);
 371
 372 static int sched_is_timeshare(struct ksegrp *kg);
 373 static struct kse *sched_choose(void);
 374 static int sched_calc_pri(struct ksegrp *kg);
 375 static int sched_starving(struct kseq *, unsigned, struct kse *);
 376 static void sched_pctcpu_update(struct kse *);
 377 static void sched_thread_priority(struct thread *, u_char);
 378 static uint64_t sched_timestamp(void);
 379 static int sched_recalc_pri(struct kse *ke, uint64_t now);
 380 static int sched_timeslice(struct kse *ke);
 381 static void sched_update_runtime(struct kse *ke, uint64_t now);
 382 static void sched_commit_runtime(struct kse *ke);
 383
 384 /*
 385  * Initialize a run structure.
 386  */
 387 static void
 388 krunq_init(struct krunq *rq)
 389 {
 390         int i;
 391
 392         bzero(rq, sizeof *rq);
 393         for (i = 0; i < KQ_NQS; i++)
 394                 TAILQ_INIT(&rq->rq_queues[i]);
 395 }
 396
 397 /*
 398  * Clear the status bit of the queue corresponding to priority level pri,
 399  * indicating that it is empty.
 400  */
 401 static inline void
 402 krunq_clrbit(struct krunq *rq, int pri)
 403 {
 404         struct krqbits *rqb;
 405
 406         rqb = &rq->rq_status;
 407         rqb->rqb_bits[KQB_WORD(pri)] &= ~KQB_BIT(pri);
 408 }
 409
 410 /*
 411  * Find the index of the first non-empty run queue.  This is done by
 412  * scanning the status bits, a set bit indicates a non-empty queue.
 413  */
 414 static int
 415 krunq_findbit(struct krunq *rq)
 416 {
 417         struct krqbits *rqb;
 418         int pri;
 419         int i;
 420
 421         rqb = &rq->rq_status;
 422         for (i = 0; i < KQB_LEN; i++) {
 423                 if (rqb->rqb_bits[i]) {
 424                         pri = KQB_FFS(rqb->rqb_bits[i]) + (i << KQB_L2BPW);
 425                         return (pri);
 426                 }
 427         }
 428         return (-1);
 429 }
 430
 431 static int
 432 krunq_check(struct krunq *rq)
 433 {
 434         struct krqbits *rqb;
 435         int i;
 436
 437         rqb = &rq->rq_status;
 438         for (i = 0; i < KQB_LEN; i++) {
 439                 if (rqb->rqb_bits[i])
 440                         return (1);
 441         }
 442         return (0);
 443 }
 444
 445 /*
 446  * Set the status bit of the queue corresponding to priority level pri,
 447  * indicating that it is non-empty.
 448  */
 449 static inline void
 450 krunq_setbit(struct krunq *rq, int pri)
 451 {
 452         struct krqbits *rqb;
 453
 454         rqb = &rq->rq_status;
 455         rqb->rqb_bits[KQB_WORD(pri)] |= KQB_BIT(pri);
 456 }
 457
 458 /*
 459  * Add the KSE to the queue specified by its priority, and set the
 460  * corresponding status bit.
 461  */
 462 static void
 463 krunq_add(struct krunq *rq, struct kse *ke)
 464 {
 465         struct krqhead *rqh;
 466         int pri;
 467
 468         pri = ke->ke_thread->td_priority;
 469         ke->ke_rqindex = pri;
 470         krunq_setbit(rq, pri);
 471         rqh = &rq->rq_queues[pri];
 472         if (ke->ke_flags & KEF_PREEMPTED)
 473                 TAILQ_INSERT_HEAD(rqh, ke, ke_procq);
 474         else
 475                 TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
 476 }
 477
 478 /*
 479  * Find the highest priority process on the run queue.
 480  */
 481 static struct kse *
 482 krunq_choose(struct krunq *rq)
 483 {
 484         struct krqhead *rqh;
 485         struct kse *ke;
 486         int pri;
 487
 488         mtx_assert(&sched_lock, MA_OWNED);
 489         if ((pri = krunq_findbit(rq)) != -1) {
 490                 rqh = &rq->rq_queues[pri];
 491                 ke = TAILQ_FIRST(rqh);
 492                 KASSERT(ke != NULL, ("krunq_choose: no thread on busy queue"));
 493 #ifdef SMP
 494                 if (pri <= PRI_MAX_ITHD || runq_fuzz <= 0)
 495                         return (ke);
 496
 497                 /*
 498                  * In the first couple of entries, check if
 499                  * there is one for our CPU as a preference.
 500                  */
 501                 struct kse *ke2 = ke;
 502                 const int mycpu = PCPU_GET(cpuid);
 503                 const int mymask = 1 << mycpu;
 504                 int count = runq_fuzz;
 505
 506                 while (count-- && ke2) {
 507                         const int cpu = ke2->ke_wakeup_cpu;
 508                         if (cpu_sibling[cpu] & mymask) {
 509                                 ke = ke2;
 510                                 break;
 511                         }
 512                         ke2 = TAILQ_NEXT(ke2, ke_procq);
 513                 }
 514 #endif
 515                 return (ke);
 516         }
 517
 518         return (NULL);
 519 }
 520
 521 /*
 522  * Remove the KSE from the queue specified by its priority, and clear the
 523  * corresponding status bit if the queue becomes empty.
 524  * Caller must set ke->ke_state afterwards.
 525  */
 526 static void
 527 krunq_remove(struct krunq *rq, struct kse *ke)
 528 {
 529         struct krqhead *rqh;
 530         int pri;
 531
 532         KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
 533                 ("runq_remove: process swapped out"));
 534         pri = ke->ke_rqindex;
 535         rqh = &rq->rq_queues[pri];
 536         KASSERT(ke != NULL, ("krunq_remove: no proc on busy queue"));
 537         TAILQ_REMOVE(rqh, ke, ke_procq);
 538         if (TAILQ_EMPTY(rqh))
 539                 krunq_clrbit(rq, pri);
 540 }
 541
 542 static inline void
 543 kseq_runq_add(struct kseq *kseq, struct kse *ke)
 544 {
 545         krunq_add(ke->ke_runq, ke);
 546         ke->ke_kseq = kseq;
 547 }
 548
 549 static inline void
 550 kseq_runq_rem(struct kseq *kseq, struct kse *ke)
 551 {
 552         krunq_remove(ke->ke_runq, ke);
 553         ke->ke_kseq = NULL;
 554         ke->ke_runq = NULL;
 555 }
 556
 557 static inline void
 558 kseq_load_add(struct kseq *kseq, struct kse *ke)
 559 {
 560         kseq->ksq_load++;
 561         if ((ke->ke_proc->p_flag & P_NOLOAD) == 0)
 562                 sched_tdcnt++;
 563 }
 564
 565 static inline void
 566 kseq_load_rem(struct kseq *kseq, struct kse *ke)
 567 {
 568         kseq->ksq_load--;
 569         if ((ke->ke_proc->p_flag & P_NOLOAD) == 0)
 570                 sched_tdcnt--;
 571 }
 572
 573 /*
 574  * Pick the highest priority task we have and return it.
 575  */
 576 static struct kse *
 577 kseq_choose(struct kseq *kseq)
 578 {
 579         struct krunq *swap;
 580         struct kse *ke;
 581
 582         mtx_assert(&sched_lock, MA_OWNED);
 583         ke = krunq_choose(kseq->ksq_curr);
 584         if (ke != NULL)
 585                 return (ke);
 586
 587         kseq->ksq_expired_nice = PRIO_MAX + 1;
 588         kseq->ksq_expired_tick = 0;
 589         swap = kseq->ksq_curr;
 590         kseq->ksq_curr = kseq->ksq_next;
 591         kseq->ksq_next = swap;
 592         ke = krunq_choose(kseq->ksq_curr);
 593         if (ke != NULL)
 594                 return (ke);
 595
 596         return krunq_choose(&kseq->ksq_idle);
 597 }
 598
 599 static inline uint64_t
 600 sched_timestamp(void)
 601 {
 602         uint64_t now = cputick2usec(cpu_ticks()) * 1000;
 603         return (now);
 604 }
 605
 606 static inline int
 607 sched_timeslice(struct kse *ke)
 608 {
 609         struct proc *p = ke->ke_proc;
 610
 611         if (ke->ke_proc->p_nice < 0)
 612                 return SCALE_USER_PRI(def_timeslice*4, PROC_USER_PRI(p));
 613         else
 614                 return SCALE_USER_PRI(def_timeslice, PROC_USER_PRI(p));
 615 }
 616
 617 static inline int
 618 sched_is_timeshare(struct ksegrp *kg)
 619 {
 620         return (kg->kg_pri_class == PRI_TIMESHARE);
 621 }
 622
 623 static int
 624 sched_calc_pri(struct ksegrp *kg)
 625 {
 626         int score, pri;
 627
 628         if (sched_is_timeshare(kg)) {
 629                 score = CURRENT_SCORE(kg) - MAX_SCORE / 2;
 630                 pri = PROC_PRI(kg->kg_proc) - score;
 631                 if (pri < PUSER)
 632                         pri = PUSER;
 633                 else if (pri > PUSER_MAX)
 634                         pri = PUSER_MAX;
 635                 return (pri);
 636         }
 637         return (kg->kg_base_user_pri);
 638 }
 639
 640 static int
 641 sched_recalc_pri(struct kse *ke, uint64_t now)
 642 {
 643         uint64_t        delta;
 644         unsigned int    sleep_time;
 645         struct ksegrp   *kg;
 646
 647         kg = ke->ke_ksegrp;
 648         delta = now - ke->ke_timestamp;
 649         if (__predict_false(!sched_is_timeshare(kg)))
 650                 return (kg->kg_base_user_pri);
 651
 652         if (delta > NS_MAX_SLEEP_TIME)
 653                 sleep_time = NS_MAX_SLEEP_TIME;
 654         else
 655                 sleep_time = (unsigned int)delta;
 656         if (__predict_false(sleep_time == 0))
 657                 goto out;
 658
 659         if (ke->ke_activated != -1 &&
 660             sleep_time > INTERACTIVE_SLEEP_TIME(ke)) {
 661                 kg->kg_slptime = HZ_TO_NS(MAX_SLEEP_TIME - def_timeslice);
 662         } else {
 663                 sleep_time *= (MAX_SCORE - CURRENT_SCORE(kg)) ? : 1;
 664
 665                 /*
 666                  * If thread is waking from uninterruptible sleep, it is
 667                  * unlikely an interactive sleep, limit its sleep time to
 668                  * prevent it from being an interactive thread.
 669                  */
 670                 if (ke->ke_activated == -1) {
 671                         if (kg->kg_slptime >= INTERACTIVE_SLEEP_TIME(ke))
 672                                 sleep_time = 0;
 673                         else if (kg->kg_slptime + sleep_time >=
 674                                 INTERACTIVE_SLEEP_TIME(ke)) {
 675                                 kg->kg_slptime = INTERACTIVE_SLEEP_TIME(ke);
 676                                 sleep_time = 0;
 677                         }
 678                 }
 679
 680                 /*
 681                  * Thread gets priority boost here.
 682                  */
 683                 kg->kg_slptime += sleep_time;
 684
 685                 /* Sleep time should never be larger than maximum */
 686                 if (kg->kg_slptime > NS_MAX_SLEEP_TIME)
 687                         kg->kg_slptime = NS_MAX_SLEEP_TIME;
 688         }
 689
 690 out:
 691         return (sched_calc_pri(kg));
 692 }
 693
 694 static void
 695 sched_update_runtime(struct kse *ke, uint64_t now)
 696 {
 697         uint64_t runtime;
 698         struct ksegrp *kg = ke->ke_ksegrp;
 699
 700         if (sched_is_timeshare(kg)) {
 701                 if ((int64_t)(now - ke->ke_timestamp) < NS_MAX_SLEEP_TIME) {
 702                         runtime = now - ke->ke_timestamp;
 703                         if ((int64_t)(now - ke->ke_timestamp) < 0)
 704                                 runtime = 0;
 705                 } else {
 706                         runtime = NS_MAX_SLEEP_TIME;
 707                 }
 708                 runtime /= (CURRENT_SCORE(kg) ? : 1);
 709                 kg->kg_runtime += runtime;
 710                 ke->ke_timestamp = now;
 711         }
 712 }
 713
 714 static void
 715 sched_commit_runtime(struct kse *ke)
 716 {
 717         struct ksegrp *kg = ke->ke_ksegrp;
 718
 719         if (kg->kg_runtime > kg->kg_slptime)
 720                 kg->kg_slptime = 0;
 721         else
 722                 kg->kg_slptime -= kg->kg_runtime;
 723         kg->kg_runtime = 0;
 724 }
 725
 726 static void
 727 kseq_setup(struct kseq *kseq)
 728 {
 729         krunq_init(&kseq->ksq_timeshare[0]);
 730         krunq_init(&kseq->ksq_timeshare[1]);
 731         krunq_init(&kseq->ksq_idle);
 732         kseq->ksq_curr = &kseq->ksq_timeshare[0];
 733         kseq->ksq_next = &kseq->ksq_timeshare[1];
 734         kseq->ksq_expired_nice = PRIO_MAX + 1;
 735         kseq->ksq_expired_tick = 0;
 736 }
 737
 738 static void
 739 sched_setup(void *dummy)
 740 {
 741 #ifdef SMP
 742         int i;
 743 #endif
 744
 745         /*
 746          * To avoid divide-by-zero, we set realstathz a dummy value
 747          * in case which sched_clock() called before sched_initticks().
 748          */
 749         realstathz      = hz;
 750         min_timeslice   = MAX(5 * hz / 1000, 1);
 751         def_timeslice   = MAX(100 * hz / 1000, 1);
 752         granularity     = MAX(10 * hz / 1000, 1);
 753
 754         kseq_setup(&kseq_global);
 755 #ifdef SMP
 756         runq_fuzz = MIN(mp_ncpus * 2, 8);
 757         /*
 758          * Initialize the kseqs.
 759          */
 760         for (i = 0; i < MAXCPU; i++) {
 761                 struct kseq *ksq;
 762
 763                 ksq = &kseq_cpu[i];
 764                 kseq_setup(&kseq_cpu[i]);
 765                 cpu_sibling[i] = 1 << i;
 766         }
 767         if (smp_topology != NULL) {
 768                 int i, j;
 769                 cpumask_t visited;
 770                 struct cpu_group *cg;
 771
 772                 visited = 0;
 773                 for (i = 0; i < smp_topology->ct_count; i++) {
 774                         cg = &smp_topology->ct_group[i];
 775                         if (cg->cg_mask & visited)
 776                                 panic("duplicated cpumask in ct_group.");
 777                         if (cg->cg_mask == 0)
 778                                 continue;
 779                         visited |= cg->cg_mask;
 780                         for (j = 0; j < MAXCPU; j++) {
 781                                 if ((cg->cg_mask & (1 << j)) != 0)
 782                                         cpu_sibling[j] |= cg->cg_mask;
 783                         }
 784                 }
 785         }
 786 #endif
 787
 788         mtx_lock_spin(&sched_lock);
 789         kseq_load_add(KSEQ_SELF(), &kse0);
 790         mtx_unlock_spin(&sched_lock);
 791 }
 792
 793 /* ARGSUSED */
 794 static void
 795 sched_initticks(void *dummy)
 796 {
 797         mtx_lock_spin(&sched_lock);
 798         realstathz = stathz ? stathz : hz;
 799         mtx_unlock_spin(&sched_lock);
 800 }
 801
 802 /*
 803  * Very early in the boot some setup of scheduler-specific
 804  * parts of proc0 and of soem scheduler resources needs to be done.
 805  * Called from:
 806  *  proc0_init()
 807  */
 808 void
 809 schedinit(void)
 810 {
 811         /*
 812          * Set up the scheduler specific parts of proc0.
 813          */
 814         proc0.p_sched = NULL; /* XXX */
 815         ksegrp0.kg_sched = &kg_sched0;
 816         thread0.td_sched = &kse0;
 817         kse0.ke_thread = &thread0;
 818         kse0.ke_state = KES_THREAD;
 819         kse0.ke_slice = 100;
 820         kg_sched0.skg_concurrency = 1;
 821         kg_sched0.skg_avail_opennings = 0; /* we are already running */
 822 }
 823
 824 /*
 825  * This is only somewhat accurate since given many processes of the same
 826  * priority they will switch when their slices run out, which will be
 827  * at most SCHED_SLICE_MAX.
 828  */
 829 int
 830 sched_rr_interval(void)
 831 {
 832         return (def_timeslice);
 833 }
 834
 835 static void
 836 sched_pctcpu_update(struct kse *ke)
 837 {
 838         /*
 839          * Adjust counters and watermark for pctcpu calc.
 840          */
 841         if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
 842                 /*
 843                  * Shift the tick count out so that the divide doesn't
 844                  * round away our results.
 845                  */
 846                 ke->ke_ticks <<= 10;
 847                 ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
 848                     SCHED_CPU_TICKS;
 849                 ke->ke_ticks >>= 10;
 850         } else
 851                 ke->ke_ticks = 0;
 852         ke->ke_ltick = ticks;
 853         ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
 854 }
 855
 856 static void
 857 sched_thread_priority(struct thread *td, u_char prio)
 858 {
 859         struct kse *ke;
 860
 861         ke = td->td_kse;
 862         mtx_assert(&sched_lock, MA_OWNED);
 863         if (__predict_false(td->td_priority == prio))
 864                 return;
 865
 866         if (TD_ON_RUNQ(td)) {
 867                 /*
 868                  * If the priority has been elevated due to priority
 869                  * propagation, we may have to move ourselves to a new
 870                  * queue.  We still call adjustrunqueue below in case kse
 871                  * needs to fix things up.
 872                  */
 873                 if (prio < td->td_priority && ke->ke_runq != NULL &&
 874                     ke->ke_runq != ke->ke_kseq->ksq_curr) {
 875                         krunq_remove(ke->ke_runq, ke);
 876                         ke->ke_runq = ke->ke_kseq->ksq_curr;
 877                         krunq_add(ke->ke_runq, ke);
 878                 }
 879                 adjustrunqueue(td, prio);
 880         } else
 881                 td->td_priority = prio;
 882 }
 883
 884 /*
 885  * Update a thread's priority when it is lent another thread's
 886  * priority.
 887  */
 888 void
 889 sched_lend_prio(struct thread *td, u_char prio)
 890 {
 891
 892         td->td_flags |= TDF_BORROWING;
 893         sched_thread_priority(td, prio);
 894 }
 895
 896 /*
 897  * Restore a thread's priority when priority propagation is
 898  * over.  The prio argument is the minimum priority the thread
 899  * needs to have to satisfy other possible priority lending
 900  * requests.  If the thread's regular priority is less
 901  * important than prio, the thread will keep a priority boost
 902  * of prio.
 903  */
 904 void
 905 sched_unlend_prio(struct thread *td, u_char prio)
 906 {
 907         u_char base_pri;
 908
 909         if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
 910             td->td_base_pri <= PRI_MAX_TIMESHARE)
 911                 base_pri = td->td_ksegrp->kg_user_pri;
 912         else
 913                 base_pri = td->td_base_pri;
 914         if (prio >= base_pri) {
 915                 td->td_flags &= ~TDF_BORROWING;
 916                 sched_thread_priority(td, base_pri);
 917         } else
 918                 sched_lend_prio(td, prio);
 919 }
 920
 921 void
 922 sched_prio(struct thread *td, u_char prio)
 923 {
 924         u_char oldprio;
 925
 926         if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE)
 927                 prio = MIN(prio, PUSER_MAX);
 928
 929         /* First, update the base priority. */
 930         td->td_base_pri = prio;
 931
 932         /*
 933          * If the thread is borrowing another thread's priority, don't
 934          * ever lower the priority.
 935          */
 936         if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
 937                 return;
 938
 939         /* Change the real priority. */
 940         oldprio = td->td_priority;
 941         sched_thread_priority(td, prio);
 942
 943         /*
 944          * If the thread is on a turnstile, then let the turnstile update
 945          * its state.
 946          */
 947         if (TD_ON_LOCK(td) && oldprio != prio)
 948                 turnstile_adjust(td, oldprio);
 949 }
 950
 951 void
 952 sched_user_prio(struct ksegrp *kg, u_char prio)
 953 {
 954         struct thread *td;
 955         u_char oldprio;
 956
 957         kg->kg_base_user_pri = prio;
 958
 959         /* XXXKSE only for 1:1 */
 960
 961         td = TAILQ_FIRST(&kg->kg_threads);
 962         if (td == NULL) {
 963                 kg->kg_user_pri = prio;
 964                 return;
 965         }
 966
 967         if (td->td_flags & TDF_UBORROWING && kg->kg_user_pri <= prio)
 968                 return;
 969
 970         oldprio = kg->kg_user_pri;
 971         kg->kg_user_pri = prio;
 972
 973         if (TD_ON_UPILOCK(td) && oldprio != prio)
 974                 umtx_pi_adjust(td, oldprio);
 975 }
 976
 977 void
 978 sched_lend_user_prio(struct thread *td, u_char prio)
 979 {
 980         u_char oldprio;
 981
 982         td->td_flags |= TDF_UBORROWING;
 983
 984         oldprio = td->td_ksegrp->kg_user_pri;
 985         td->td_ksegrp->kg_user_pri = prio;
 986
 987         if (TD_ON_UPILOCK(td) && oldprio != prio)
 988                 umtx_pi_adjust(td, oldprio);
 989 }
 990
 991 void
 992 sched_unlend_user_prio(struct thread *td, u_char prio)
 993 {
 994         struct ksegrp *kg = td->td_ksegrp;
 995         u_char base_pri;
 996
 997         base_pri = kg->kg_base_user_pri;
 998         if (prio >= base_pri) {
 999                 td->td_flags &= ~TDF_UBORROWING;
1000                 sched_user_prio(kg, base_pri);
1001         } else
1002                 sched_lend_user_prio(td, prio);
1003 }
1004
1005 void
1006 sched_switch(struct thread *td, struct thread *newtd, int flags)
1007 {
1008         struct kseq *ksq;
1009         struct kse *ke;
1010         struct ksegrp *kg;
1011         uint64_t now;
1012
1013         mtx_assert(&sched_lock, MA_OWNED);
1014
1015         now = sched_timestamp();
1016         ke = td->td_kse;
1017         kg = td->td_ksegrp;
1018         ksq = KSEQ_SELF();
1019
1020         td->td_lastcpu = td->td_oncpu;
1021         td->td_oncpu = NOCPU;
1022         td->td_flags &= ~TDF_NEEDRESCHED;
1023         td->td_owepreempt = 0;
1024
1025         if (td == PCPU_GET(idlethread)) {
1026                 TD_SET_CAN_RUN(td);
1027         } else {
1028                 sched_update_runtime(ke, now);
1029                 /* We are ending our run so make our slot available again */
1030                 SLOT_RELEASE(td->td_ksegrp);
1031                 kseq_load_rem(ksq, ke);
1032                 if (TD_IS_RUNNING(td)) {
1033                         setrunqueue(td, (flags & SW_PREEMPT) ?
1034                             SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
1035                             SRQ_OURSELF|SRQ_YIELDING);
1036                 } else {
1037                         if ((td->td_proc->p_flag & P_HADTHREADS) &&
1038                             (newtd == NULL ||
1039                              newtd->td_ksegrp != td->td_ksegrp)) {
1040                                 /*
1041                                  * We will not be on the run queue.
1042                                  * So we must be sleeping or similar.
1043                                  * Don't use the slot if we will need it
1044                                  * for newtd.
1045                                  */
1046                                 slot_fill(td->td_ksegrp);
1047                         }
1048                         ke->ke_flags &= ~KEF_NEXTRQ;
1049                 }
1050         }
1051
1052         if (newtd != NULL) {
1053                 /*
1054                  * If we bring in a thread account for it as if it had been
1055                  * added to the run queue and then chosen.
1056                  */
1057                 SLOT_USE(newtd->td_ksegrp);
1058                 newtd->td_kse->ke_flags |= KEF_DIDRUN;
1059                 newtd->td_kse->ke_timestamp = now;
1060                 TD_SET_RUNNING(newtd);
1061                 kseq_load_add(ksq, newtd->td_kse);
1062         } else {
1063                 newtd = choosethread();
1064                 /* sched_choose sets ke_timestamp, just reuse it */
1065         }
1066         if (td != newtd) {
1067                 ke->ke_lastran = tick;
1068
1069 #ifdef  HWPMC_HOOKS
1070                 if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1071                         PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1072 #endif
1073                 cpu_switch(td, newtd);
1074 #ifdef  HWPMC_HOOKS
1075                 if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1076                         PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1077 #endif
1078         }
1079
1080         sched_lock.mtx_lock = (uintptr_t)td;
1081
1082         td->td_oncpu = PCPU_GET(cpuid);
1083 }
1084
1085 void
1086 sched_nice(struct proc *p, int nice)
1087 {
1088         struct ksegrp *kg;
1089         struct thread *td;
1090
1091         PROC_LOCK_ASSERT(p, MA_OWNED);
1092         mtx_assert(&sched_lock, MA_OWNED);
1093         p->p_nice = nice;
1094         FOREACH_KSEGRP_IN_PROC(p, kg) {
1095                 if (kg->kg_pri_class == PRI_TIMESHARE) {
1096                         sched_user_prio(kg, sched_calc_pri(kg));
1097                         FOREACH_THREAD_IN_GROUP(kg, td)
1098                                 td->td_flags |= TDF_NEEDRESCHED;
1099                 }
1100         }
1101 }
1102
1103 void
1104 sched_sleep(struct thread *td)
1105 {
1106         struct kse *ke;
1107
1108         mtx_assert(&sched_lock, MA_OWNED);
1109         ke = td->td_kse;
1110         if (td->td_flags & TDF_SINTR)
1111                 ke->ke_activated = 0;
1112         else
1113                 ke->ke_activated = -1;
1114         ke->ke_flags |= KEF_SLEEP;
1115 }
1116
1117 void
1118 sched_wakeup(struct thread *td)
1119 {
1120         struct kse *ke;
1121         struct ksegrp *kg;
1122         struct kseq *kseq, *mykseq;
1123         uint64_t now;
1124
1125         mtx_assert(&sched_lock, MA_OWNED);
1126         ke = td->td_kse;
1127         kg = td->td_ksegrp;
1128         mykseq = KSEQ_SELF();
1129         if (ke->ke_flags & KEF_SLEEP) {
1130                 ke->ke_flags &= ~KEF_SLEEP;
1131                 if (sched_is_timeshare(kg)) {
1132                         sched_commit_runtime(ke);
1133                         now = sched_timestamp();
1134                         kseq = KSEQ_CPU(td->td_lastcpu);
1135 #ifdef SMP
1136                         if (kseq != mykseq)
1137                                 now = now - mykseq->ksq_last_timestamp +
1138                                     kseq->ksq_last_timestamp;
1139 #endif
1140                         sched_user_prio(kg, sched_recalc_pri(ke, now));
1141                 }
1142         }
1143         setrunqueue(td, SRQ_BORING);
1144 }
1145
1146 /*
1147  * Penalize the parent for creating a new child and initialize the child's
1148  * priority.
1149  */
1150 void
1151 sched_fork(struct thread *td, struct thread *childtd)
1152 {
1153
1154         mtx_assert(&sched_lock, MA_OWNED);
1155         sched_fork_ksegrp(td, childtd->td_ksegrp);
1156         sched_fork_thread(td, childtd);
1157 }
1158
1159 void
1160 sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
1161 {
1162         struct ksegrp *kg = td->td_ksegrp;
1163
1164         mtx_assert(&sched_lock, MA_OWNED);
1165         child->kg_slptime = kg->kg_slptime * CHILD_WEIGHT / 100;
1166         if (child->kg_pri_class == PRI_TIMESHARE)
1167                 sched_user_prio(child, sched_calc_pri(child));
1168         kg->kg_slptime = kg->kg_slptime * PARENT_WEIGHT / 100;
1169 }
1170
1171 void
1172 sched_fork_thread(struct thread *td, struct thread *child)
1173 {
1174         struct kse *ke;
1175         struct kse *ke2;
1176
1177         sched_newthread(child);
1178
1179         ke = td->td_kse;
1180         ke2 = child->td_kse;
1181         ke2->ke_slice = (ke->ke_slice + 1) >> 1;
1182         ke2->ke_flags |= KEF_FIRST_SLICE | (ke->ke_flags & KEF_NEXTRQ);
1183         ke2->ke_activated = 0;
1184         ke->ke_slice >>= 1;
1185         if (ke->ke_slice == 0) {
1186                 ke->ke_slice = 1;
1187                 sched_tick();
1188         }
1189
1190         /* Grab our parents cpu estimation information. */
1191         ke2->ke_ticks = ke->ke_ticks;
1192         ke2->ke_ltick = ke->ke_ltick;
1193         ke2->ke_ftick = ke->ke_ftick;
1194 }
1195
1196 void
1197 sched_class(struct ksegrp *kg, int class)
1198 {
1199         mtx_assert(&sched_lock, MA_OWNED);
1200         kg->kg_pri_class = class;
1201 }
1202
1203 /*
1204  * Return some of the child's priority and interactivity to the parent.
1205  */
1206 void
1207 sched_exit(struct proc *p, struct thread *childtd)
1208 {
1209         mtx_assert(&sched_lock, MA_OWNED);
1210         sched_exit_thread(FIRST_THREAD_IN_PROC(p), childtd);
1211         sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), childtd);
1212 }
1213
1214 void
1215 sched_exit_ksegrp(struct ksegrp *parentkg, struct thread *td)
1216 {
1217         if (td->td_ksegrp->kg_slptime < parentkg->kg_slptime) {
1218                 parentkg->kg_slptime = parentkg->kg_slptime /
1219                         (EXIT_WEIGHT) * (EXIT_WEIGHT - 1) +
1220                         td->td_ksegrp->kg_slptime / EXIT_WEIGHT;
1221         }
1222 }
1223
1224 void
1225 sched_exit_thread(struct thread *td, struct thread *childtd)
1226 {
1227         struct kse *childke  = childtd->td_kse;
1228         struct kse *parentke = td->td_kse;
1229
1230         kseq_load_rem(KSEQ_SELF(), childke);
1231         sched_update_runtime(childke, sched_timestamp());
1232         sched_commit_runtime(childke);
1233         if ((childke->ke_flags & KEF_FIRST_SLICE) &&
1234             td->td_proc == childtd->td_proc->p_pptr) {
1235                 parentke->ke_slice += childke->ke_slice;
1236                 if (parentke->ke_slice > sched_timeslice(parentke))
1237                         parentke->ke_slice = sched_timeslice(parentke);
1238         }
1239 }
1240
1241 static int
1242 sched_starving(struct kseq *ksq, unsigned now, struct kse *ke)
1243 {
1244         uint64_t delta;
1245
1246         if (ke->ke_proc->p_nice > ksq->ksq_expired_nice)
1247                 return (1);
1248         if (ksq->ksq_expired_tick == 0)
1249                 return (0);
1250         delta = HZ_TO_NS((uint64_t)now - ksq->ksq_expired_tick);
1251         if (delta > STARVATION_TIME * ksq->ksq_load)
1252                 return (1);
1253         return (0);
1254 }
1255
1256 /*
1257  * An interactive thread has smaller time slice granularity,
1258  * a cpu hog can have larger granularity.
1259  */
1260 static inline int
1261 sched_timeslice_split(struct kse *ke)
1262 {
1263         int score, g;
1264
1265         score = (int)(MAX_SCORE - CURRENT_SCORE(ke->ke_ksegrp));
1266         if (score == 0)
1267                 score = 1;
1268 #ifdef SMP
1269         g = granularity * ((1 << score) - 1) * smp_cpus;
1270 #else
1271         g = granularity * ((1 << score) - 1);
1272 #endif
1273         return (ke->ke_slice >= g && ke->ke_slice % g == 0);
1274 }
1275
1276 void
1277 sched_tick(void)
1278 {
1279         struct thread   *td;
1280         struct proc     *p;
1281         struct kse      *ke;
1282         struct ksegrp   *kg;
1283         struct kseq     *kseq;
1284         uint64_t        now;
1285         int             cpuid;
1286         int             class;
1287
1288         mtx_assert(&sched_lock, MA_OWNED);
1289
1290         td = curthread;
1291         ke = td->td_kse;
1292         kg = td->td_ksegrp;
1293         p = td->td_proc;
1294         class = PRI_BASE(kg->kg_pri_class);
1295         now = sched_timestamp();
1296         cpuid = PCPU_GET(cpuid);
1297         kseq = KSEQ_CPU(cpuid);
1298         kseq->ksq_last_timestamp = now;
1299
1300         if (class == PRI_IDLE) {
1301                 /*
1302                  * Processes of equal idle priority are run round-robin.
1303                  */
1304                 if (td != PCPU_GET(idlethread) && --ke->ke_slice <= 0) {
1305                         ke->ke_slice = def_timeslice;
1306                         td->td_flags |= TDF_NEEDRESCHED;
1307                 }
1308                 return;
1309         }
1310
1311         if (class == PRI_REALTIME) {
1312                 /*
1313                  * Realtime scheduling, do round robin for RR class, FIFO
1314                  * is not affected.
1315                  */
1316                 if (PRI_NEED_RR(kg->kg_pri_class) && --ke->ke_slice <= 0) {
1317                         ke->ke_slice = def_timeslice;
1318                         td->td_flags |= TDF_NEEDRESCHED;
1319                 }
1320                 return;
1321         }
1322
1323         /*
1324          * We skip kernel thread, though it may be classified as TIMESHARE.
1325          */
1326         if (class != PRI_TIMESHARE || (p->p_flag & P_KTHREAD) != 0)
1327                 return;
1328
1329         if (--ke->ke_slice <= 0) {
1330                 td->td_flags |= TDF_NEEDRESCHED;
1331                 sched_update_runtime(ke, now);
1332                 sched_commit_runtime(ke);
1333                 sched_user_prio(kg, sched_calc_pri(kg));
1334                 ke->ke_slice = sched_timeslice(ke);
1335                 ke->ke_flags &= ~KEF_FIRST_SLICE;
1336                 if (ke->ke_flags & KEF_BOUND || td->td_pinned) {
1337                         if (kseq->ksq_expired_tick == 0)
1338                                 kseq->ksq_expired_tick = tick;
1339                 } else {
1340                         if (kseq_global.ksq_expired_tick == 0)
1341                                 kseq_global.ksq_expired_tick = tick;
1342                 }
1343                 if (!THREAD_IS_INTERACTIVE(ke) ||
1344                     sched_starving(kseq, tick, ke) ||
1345                     sched_starving(&kseq_global, tick, ke)) {
1346                         /* The thead becomes cpu hog, schedule it off. */
1347                         ke->ke_flags |= KEF_NEXTRQ;
1348                         if (ke->ke_flags & KEF_BOUND || td->td_pinned) {
1349                                 if (p->p_nice < kseq->ksq_expired_nice)
1350                                         kseq->ksq_expired_nice = p->p_nice;
1351                         } else {
1352                                 if (p->p_nice < kseq_global.ksq_expired_nice)
1353                                         kseq_global.ksq_expired_nice =
1354                                                 p->p_nice;
1355                         }
1356                 }
1357         } else {
1358                 /*
1359                  * Don't allow an interactive thread which has long timeslice
1360                  * to monopolize CPU, split the long timeslice into small
1361                  * chunks. This essentially does round-robin between
1362                  * interactive threads.
1363                  */
1364                 if (THREAD_IS_INTERACTIVE(ke) && sched_timeslice_split(ke))
1365                         td->td_flags |= TDF_NEEDRESCHED;
1366         }
1367 }
1368
1369 void
1370 sched_clock(struct thread *td)
1371 {
1372         struct ksegrp *kg;
1373         struct kse *ke;
1374
1375         mtx_assert(&sched_lock, MA_OWNED);
1376         ke = td->td_kse;
1377         kg = ke->ke_ksegrp;
1378
1379         /* Adjust ticks for pctcpu */
1380         ke->ke_ticks++;
1381         ke->ke_ltick = ticks;
1382
1383         /* Go up to one second beyond our max and then trim back down */
1384         if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
1385                 sched_pctcpu_update(ke);
1386 }
1387
1388 static int
1389 kseq_runnable(struct kseq *kseq)
1390 {
1391         return (krunq_check(kseq->ksq_curr) ||
1392                 krunq_check(kseq->ksq_next) ||
1393                 krunq_check(&kseq->ksq_idle));
1394 }
1395
1396 int
1397 sched_runnable(void)
1398 {
1399 #ifdef SMP
1400         return (kseq_runnable(&kseq_global) || kseq_runnable(KSEQ_SELF()));
1401 #else
1402         return (kseq_runnable(&kseq_global));
1403 #endif
1404 }
1405
1406 void
1407 sched_userret(struct thread *td)
1408 {
1409         struct ksegrp *kg;
1410
1411         KASSERT((td->td_flags & TDF_BORROWING) == 0,
1412             ("thread with borrowed priority returning to userland"));
1413         kg = td->td_ksegrp;
1414         if (td->td_priority != kg->kg_user_pri) {
1415                 mtx_lock_spin(&sched_lock);
1416                 td->td_priority = kg->kg_user_pri;
1417                 td->td_base_pri = kg->kg_user_pri;
1418                 mtx_unlock_spin(&sched_lock);
1419         }
1420 }
1421
1422 struct kse *
1423 sched_choose(void)
1424 {
1425         struct kse  *ke;
1426         struct kseq *kseq;
1427
1428 #ifdef SMP
1429         struct kse *kecpu;
1430
1431         mtx_assert(&sched_lock, MA_OWNED);
1432         kseq = &kseq_global;
1433         ke = kseq_choose(&kseq_global);
1434         kecpu = kseq_choose(KSEQ_SELF());
1435
1436         if (ke == NULL ||
1437             (kecpu != NULL &&
1438              kecpu->ke_thread->td_priority < ke->ke_thread->td_priority)) {
1439                 ke = kecpu;
1440                 kseq = KSEQ_SELF();
1441         }
1442 #else
1443         kseq = &kseq_global;
1444         ke = kseq_choose(kseq);
1445 #endif
1446
1447         if (ke != NULL) {
1448                 kseq_runq_rem(kseq, ke);
1449                 ke->ke_state = KES_THREAD;
1450                 ke->ke_flags &= ~KEF_PREEMPTED;
1451                 ke->ke_timestamp = sched_timestamp();
1452         }
1453
1454         return (ke);
1455 }
1456
1457 #ifdef SMP
1458 static int
1459 forward_wakeup(int cpunum, cpumask_t me)
1460 {
1461         cpumask_t map, dontuse;
1462         cpumask_t map2;
1463         struct pcpu *pc;
1464         cpumask_t id, map3;
1465
1466         mtx_assert(&sched_lock, MA_OWNED);
1467
1468         CTR0(KTR_RUNQ, "forward_wakeup()");
1469
1470         if ((!forward_wakeup_enabled) ||
1471              (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
1472                 return (0);
1473         if (!smp_started || cold || panicstr)
1474                 return (0);
1475
1476         forward_wakeups_requested++;
1477
1478         /*
1479          * check the idle mask we received against what we calculated before
1480          * in the old version.
1481          */
1482         /*
1483          * don't bother if we should be doing it ourself..
1484          */
1485         if ((me & idle_cpus_mask) && (cpunum == NOCPU || me == (1 << cpunum)))
1486                 return (0);
1487
1488         dontuse = me | stopped_cpus | hlt_cpus_mask;
1489         map3 = 0;
1490         if (forward_wakeup_use_loop) {
1491                 SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
1492                         id = pc->pc_cpumask;
1493                         if ( (id & dontuse) == 0 &&
1494                             pc->pc_curthread == pc->pc_idlethread) {
1495                                 map3 |= id;
1496                         }
1497                 }
1498         }
1499
1500         if (forward_wakeup_use_mask) {
1501                 map = 0;
1502                 map = idle_cpus_mask & ~dontuse;
1503
1504                 /* If they are both on, compare and use loop if different */
1505                 if (forward_wakeup_use_loop) {
1506                         if (map != map3) {
1507                                 printf("map (%02X) != map3 (%02X)\n",
1508                                                 map, map3);
1509                                 map = map3;
1510                         }
1511                 }
1512         } else {
1513                 map = map3;
1514         }
1515         /* If we only allow a specific CPU, then mask off all the others */
1516         if (cpunum != NOCPU) {
1517                 KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
1518                 map &= (1 << cpunum);
1519         } else {
1520                 /* Try choose an idle die. */
1521                 if (forward_wakeup_use_htt) {
1522                         map2 =  (map & (map >> 1)) & 0x5555;
1523                         if (map2) {
1524                                 map = map2;
1525                         }
1526                 }
1527
1528                 /* set only one bit */
1529                 if (forward_wakeup_use_single) {
1530                         map = map & ((~map) + 1);
1531                 }
1532         }
1533         if (map) {
1534                 forward_wakeups_delivered++;
1535                 ipi_selected(map, IPI_AST);
1536                 return (1);
1537         }
1538         return (0);
1539 }
1540 #endif
1541
1542 void
1543 sched_add(struct thread *td, int flags)
1544 {
1545         struct kseq *ksq;
1546         struct ksegrp *kg;
1547         struct kse *ke;
1548         struct thread *mytd;
1549         int class;
1550         int nextrq;
1551         int need_resched = 0;
1552 #ifdef SMP
1553         int cpu;
1554         int mycpu;
1555         int pinned;
1556         struct kseq *myksq;
1557 #endif
1558
1559         mtx_assert(&sched_lock, MA_OWNED);
1560         mytd = curthread;
1561         ke = td->td_kse;
1562         kg = td->td_ksegrp;
1563         KASSERT(ke->ke_state != KES_ONRUNQ,
1564             ("sched_add: kse %p (%s) already in run queue", ke,
1565             ke->ke_proc->p_comm));
1566         KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
1567             ("sched_add: process swapped out"));
1568         KASSERT(ke->ke_runq == NULL,
1569             ("sched_add: KSE %p is still assigned to a run queue", ke));
1570
1571         class = PRI_BASE(kg->kg_pri_class);
1572 #ifdef SMP
1573         mycpu = PCPU_GET(cpuid);
1574         myksq = KSEQ_CPU(mycpu);
1575         ke->ke_wakeup_cpu = mycpu;
1576 #endif
1577         nextrq = (ke->ke_flags & KEF_NEXTRQ);
1578         ke->ke_flags &= ~KEF_NEXTRQ;
1579         if (flags & SRQ_PREEMPTED)
1580                 ke->ke_flags |= KEF_PREEMPTED;
1581         ksq = &kseq_global;
1582 #ifdef SMP
1583         if (td->td_pinned != 0) {
1584                 cpu = td->td_lastcpu;
1585                 ksq = KSEQ_CPU(cpu);
1586                 pinned = 1;
1587         } else if ((ke)->ke_flags & KEF_BOUND) {
1588                 cpu = ke->ke_cpu;
1589                 ksq = KSEQ_CPU(cpu);
1590                 pinned = 1;
1591         } else {
1592                 pinned = 0;
1593                 cpu = NOCPU;
1594         }
1595 #endif
1596         switch (class) {
1597         case PRI_ITHD:
1598         case PRI_REALTIME:
1599                 ke->ke_runq = ksq->ksq_curr;
1600                 break;
1601         case PRI_TIMESHARE:
1602                 if ((td->td_flags & TDF_BORROWING) == 0 && nextrq)
1603                         ke->ke_runq = ksq->ksq_next;
1604                 else
1605                         ke->ke_runq = ksq->ksq_curr;
1606                 break;
1607         case PRI_IDLE:
1608                 /*
1609                  * This is for priority prop.
1610                  */
1611                 if (td->td_priority < PRI_MIN_IDLE)
1612                         ke->ke_runq = ksq->ksq_curr;
1613                 else
1614                         ke->ke_runq = &ksq->ksq_idle;
1615                 break;
1616         default:
1617                 panic("Unknown pri class.");
1618                 break;
1619         }
1620
1621 #ifdef SMP
1622         if ((ke->ke_runq == kseq_global.ksq_curr ||
1623              ke->ke_runq == myksq->ksq_curr) &&
1624              td->td_priority < mytd->td_priority) {
1625 #else
1626         if (ke->ke_runq == kseq_global.ksq_curr &&
1627             td->td_priority < mytd->td_priority) {
1628 #endif
1629                 struct krunq *rq;
1630
1631                 rq = ke->ke_runq;
1632                 ke->ke_runq = NULL;
1633                 if ((flags & SRQ_YIELDING) == 0 && maybe_preempt(td))
1634                         return;
1635                 ke->ke_runq = rq;
1636                 need_resched = TDF_NEEDRESCHED;
1637         }
1638
1639         SLOT_USE(kg);
1640         ke->ke_state = KES_ONRUNQ;
1641         kseq_runq_add(ksq, ke);
1642         kseq_load_add(ksq, ke);
1643
1644 #ifdef SMP
1645         if (pinned) {
1646                 if (cpu != mycpu) {
1647                         struct thread *running = pcpu_find(cpu)->pc_curthread;
1648                         if (ksq->ksq_curr == ke->ke_runq &&
1649                             running->td_priority < td->td_priority) {
1650                                 if (td->td_priority <= PRI_MAX_ITHD)
1651                                         ipi_selected(1 << cpu, IPI_PREEMPT);
1652                                 else {
1653                                         running->td_flags |= TDF_NEEDRESCHED;
1654                                         ipi_selected(1 << cpu, IPI_AST);
1655                                 }
1656                         }
1657                 } else
1658                         curthread->td_flags |= need_resched;
1659         } else {
1660                 cpumask_t me = 1 << mycpu;
1661                 cpumask_t idle = idle_cpus_mask & me;
1662                 int forwarded = 0;
1663
1664                 if (!idle && ((flags & SRQ_INTR) == 0) &&
1665                     (idle_cpus_mask & ~(hlt_cpus_mask | me)))
1666                         forwarded = forward_wakeup(cpu, me);
1667                 if (forwarded == 0)
1668                         curthread->td_flags |= need_resched;
1669         }
1670 #else
1671         mytd->td_flags |= need_resched;
1672 #endif
1673 }
1674
1675 void
1676 sched_rem(struct thread *td)
1677 {
1678         struct kseq *kseq;
1679         struct kse *ke;
1680
1681         mtx_assert(&sched_lock, MA_OWNED);
1682         ke = td->td_kse;
1683         KASSERT((ke->ke_state == KES_ONRUNQ),
1684             ("sched_rem: KSE not on run queue"));
1685
1686         kseq = ke->ke_kseq;
1687         SLOT_RELEASE(td->td_ksegrp);
1688         kseq_runq_rem(kseq, ke);
1689         kseq_load_rem(kseq, ke);
1690         ke->ke_state = KES_THREAD;
1691 }
1692
1693 fixpt_t
1694 sched_pctcpu(struct thread *td)
1695 {
1696         fixpt_t pctcpu;
1697         struct kse *ke;
1698
1699         pctcpu = 0;
1700         ke = td->td_kse;
1701         if (ke == NULL)
1702                 return (0);
1703
1704         mtx_lock_spin(&sched_lock);
1705         if (ke->ke_ticks) {
1706                 int rtick;
1707
1708                 /*
1709                  * Don't update more frequently than twice a second.  Allowing
1710                  * this causes the cpu usage to decay away too quickly due to
1711                  * rounding errors.
1712                  */
1713                 if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
1714                     ke->ke_ltick < (ticks - (hz / 2)))
1715                         sched_pctcpu_update(ke);
1716                 /* How many rtick per second ? */
1717                 rtick = MIN(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
1718                 pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
1719         }
1720
1721         ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
1722         mtx_unlock_spin(&sched_lock);
1723
1724         return (pctcpu);
1725 }
1726
1727 void
1728 sched_bind(struct thread *td, int cpu)
1729 {
1730         struct kse *ke;
1731
1732         mtx_assert(&sched_lock, MA_OWNED);
1733         ke = td->td_kse;
1734         ke->ke_flags |= KEF_BOUND;
1735 #ifdef SMP
1736         ke->ke_cpu = cpu;
1737         if (PCPU_GET(cpuid) == cpu)
1738                 return;
1739         mi_switch(SW_VOL, NULL);
1740 #endif
1741 }
1742
1743 void
1744 sched_unbind(struct thread *td)
1745 {
1746         mtx_assert(&sched_lock, MA_OWNED);
1747         td->td_kse->ke_flags &= ~KEF_BOUND;
1748 }
1749
1750 int
1751 sched_is_bound(struct thread *td)
1752 {
1753         mtx_assert(&sched_lock, MA_OWNED);
1754         return (td->td_kse->ke_flags & KEF_BOUND);
1755 }
1756
1757 int
1758 sched_load(void)
1759 {
1760         return (sched_tdcnt);
1761 }
1762
1763 void
1764 sched_relinquish(struct thread *td)
1765 {
1766         struct ksegrp *kg;
1767
1768         kg = td->td_ksegrp;
1769         mtx_lock_spin(&sched_lock);
1770         if (sched_is_timeshare(kg)) {
1771                 sched_prio(td, PRI_MAX_TIMESHARE);
1772                 td->td_kse->ke_flags |= KEF_NEXTRQ;
1773         }
1774         mi_switch(SW_VOL, NULL);
1775         mtx_unlock_spin(&sched_lock);
1776 }
1777
1778 int
1779 sched_sizeof_ksegrp(void)
1780 {
1781         return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
1782 }
1783
1784 int
1785 sched_sizeof_proc(void)
1786 {
1787         return (sizeof(struct proc));
1788 }
1789
1790 int
1791 sched_sizeof_thread(void)
1792 {
1793         return (sizeof(struct thread) + sizeof(struct td_sched));
1794 }
1795 #define KERN_SWITCH_INCLUDE 1
1796 #include "kern/kern_switch.c"