sys/kern/kern_synch.c

   1 /*-
   2  * Copyright (c) 1982, 1986, 1990, 1991, 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  * (c) UNIX System Laboratories, Inc.
   5  * All or some portions of this file are derived from material licensed
   6  * to the University of California by American Telephone and Telegraph
   7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   8  * the permission of UNIX System Laboratories, Inc.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  * 3. All advertising materials mentioning features or use of this software
  19  *    must display the following acknowledgement:
  20  *      This product includes software developed by the University of
  21  *      California, Berkeley and its contributors.
  22  * 4. Neither the name of the University nor the names of its contributors
  23  *    may be used to endorse or promote products derived from this software
  24  *    without specific prior written permission.
  25  *
  26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  36  * SUCH DAMAGE.
  37  *
  38  *      @(#)kern_synch.c        8.9 (Berkeley) 5/19/95
  39  * $FreeBSD$
  40  */
  41
  42 #include "opt_ktrace.h"
  43
  44 #include <sys/param.h>
  45 #include <sys/systm.h>
  46 #include <sys/condvar.h>
  47 #include <sys/kernel.h>
  48 #include <sys/ktr.h>
  49 #include <sys/lock.h>
  50 #include <sys/mutex.h>
  51 #include <sys/proc.h>
  52 #include <sys/resourcevar.h>
  53 #include <sys/signalvar.h>
  54 #include <sys/smp.h>
  55 #include <sys/sx.h>
  56 #include <sys/sysctl.h>
  57 #include <sys/sysproto.h>
  58 #include <sys/vmmeter.h>
  59 #include <vm/vm.h>
  60 #include <vm/vm_extern.h>
  61 #ifdef KTRACE
  62 #include <sys/uio.h>
  63 #include <sys/ktrace.h>
  64 #endif
  65
  66 #include <machine/cpu.h>
  67
  68 static void sched_setup __P((void *dummy));
  69 SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
  70
     /*
      * hogticks is kept at twice sched_quantum by sleepinit() and by the
      * kern.quantum sysctl handler.  The address of lbolt is used as a
      * wakeup channel by schedcpu().
      */
  71 int     hogticks;
  72 int     lbolt;
  73 int     sched_quantum;          /* Roundrobin scheduling quantum in ticks. */
  74
     /* Re-armed by schedcpu() for hz ticks each time it runs. */
  75 static struct callout schedcpu_callout;
     /* Re-armed by roundrobin() for sched_quantum ticks each time it runs. */
  76 static struct callout roundrobin_callout;
  77
     /* Forward declarations of the file-local callout handlers. */
  78 static void     endtsleep __P((void *));
  79 static void     roundrobin __P((void *arg));
  80 static void     schedcpu __P((void *arg));
  81
  82 static int
  83 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
  84 {
  85         int error, new_val;
  86
  87         new_val = sched_quantum * tick;
  88         error = sysctl_handle_int(oidp, &new_val, 0, req);
  89         if (error != 0 || req->newptr == NULL)
  90                 return (error);
  91         if (new_val < tick)
  92                 return (EINVAL);
  93         sched_quantum = new_val / tick;
  94         hogticks = 2 * sched_quantum;
  95         return (0);
  96 }
  97
  98 SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
  99         0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");
 100
 101 /*
 102  * Arrange to reschedule if necessary, taking the priorities and
 103  * schedulers into account.
 104  */
 105 void
 106 maybe_resched(p)
 107         struct proc *p;
 108 {
 109
 110         mtx_assert(&sched_lock, MA_OWNED);
 111         if (p->p_pri.pri_level < curproc->p_pri.pri_level)
 112                 need_resched(curproc);
 113 }
 114
 115 int
 116 roundrobin_interval(void)
 117 {
 118         return (sched_quantum);
 119 }
 120
 121 /*
 122  * Force switch among equal priority processes every 100ms.
 123  */
 124 /* ARGSUSED */
 125 static void
 126 roundrobin(arg)
 127         void *arg;
 128 {
 129
 130         mtx_lock_spin(&sched_lock);
 131         need_resched(curproc);
 132 #ifdef SMP
 133         forward_roundrobin();
 134 #endif
 135         mtx_unlock_spin(&sched_lock);
 136
 137         callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
 138 }
 139
 140 /*
 141  * Constants for digital decay and forget:
 142  *      90% of (p_estcpu) usage in 5 * loadav time
 143  *      95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 144  *          Note that, as ps(1) mentions, this can let percentages
 145  *          total over 100% (I've seen 137.9% for 3 processes).
 146  *
 147  * Note that schedclock() updates p_estcpu and p_cpticks asynchronously.
 148  *
 149  * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 150  * That is, the system wants to compute a value of decay such
 151  * that the following for loop:
 152  *      for (i = 0; i < (5 * loadavg); i++)
 153  *              p_estcpu *= decay;
 154  * will compute
 155  *      p_estcpu *= 0.1;
 156  * for all values of loadavg:
 157  *
 158  * Mathematically this loop can be expressed by saying:
 159  *      decay ** (5 * loadavg) ~= .1
 160  *
 161  * The system computes decay as:
 162  *      decay = (2 * loadavg) / (2 * loadavg + 1)
 163  *
 164  * We wish to prove that the system's computation of decay
 165  * will always fulfill the equation:
 166  *      decay ** (5 * loadavg) ~= .1
 167  *
 168  * If we compute b as:
 169  *      b = 2 * loadavg
 170  * then
 171  *      decay = b / (b + 1)
 172  *
 173  * We now need to prove two things:
 174  *      1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 175  *      2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 176  *
 177  * Facts:
 178  *         For x close to zero, exp(x) =~ 1 + x, since
 179  *              exp(x) = x**0/0! + x**1/1! + x**2/2! + ... .
 180  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 181  *         For x close to zero, ln(1+x) =~ x, since
 182  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 183  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 184  *         ln(.1) =~ -2.30
 185  *
 186  * Proof of (1):
 187  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 188  *      solving for factor,
 189  *      ln(factor) =~ -2.30/(5*loadav), or
 190  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 191  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 192  *
 193  * Proof of (2):
 194  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 195  *      solving for power,
 196  *      power*ln(b/(b+1)) =~ -2.30, or
 197  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 198  *
 199  * Actual power values for the implemented algorithm are as follows:
 200  *      loadav: 1       2       3       4
 201  *      power:  5.68    10.32   14.94   19.55
 202  */
 203
 204 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
     /*
      * loadfactor(loadav) is twice its argument.  decay_cpu(loadfac, cpu)
      * scales cpu by loadfac / (loadfac + FSCALE).  Note that decay_cpu()
      * expands `loadfac' twice, so pass it an expression without side
      * effects.
      */
 205 #define loadfactor(loadav)      (2 * (loadav))
 206 #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE))
 207
 208 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
 209 static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
 210 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
 211
 212 /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
     /*
      * The kern.fscale node below is registered with a null pointer and the
      * constant FSCALE as its value argument, so the `fscale' variable is
      * not referenced by it (hence __unused).
      */
 213 static int      fscale __unused = FSCALE;
 214 SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
 215
 216 /*
 217  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 218  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 219  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 220  *
 221  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 222  *      1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 223  *
 224  * If you don't want to bother with the faster/more-accurate formula, you
 225  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 226  * (more general) method of calculating the %age of CPU used by a process.
      *
      * schedcpu() picks between the two formulas at compile time with
      * `#if (FSHIFT >= CCPU_SHIFT)'.
 227  */
 228 #define CCPU_SHIFT      11
 229
 230 /*
 231  * Recompute process priorities, every hz ticks.
 232  * MP-safe, called without the Giant mutex.
      *
      * Walks allproc under a shared allproc_lock and, for each process with
      * sched_lock held: ages p_swtime/p_slptime, decays p_pctcpu by ccpu,
      * and -- unless p_slptime is greater than 1 -- folds p_cpticks into
      * p_pctcpu, decays p_estcpu by the current load factor and recomputes
      * the priority through resetpriority().  Afterwards it calls
      * vmmeter(), wakes up sleepers on &lbolt and re-arms schedcpu_callout
      * for hz ticks from now.
      *
      * arg is unused.
 233  */
 234 /* ARGSUSED */
 235 static void
 236 schedcpu(arg)
 237         void *arg;
 238 {
 239         register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
 240         register struct proc *p;
 241         register int realstathz, s;
 242
             /* Fall back to hz when stathz is zero. */
 243         realstathz = stathz ? stathz : hz;
 244         sx_slock(&allproc_lock);
 245         LIST_FOREACH(p, &allproc, p_list) {
 246                 /*
 247                  * Increment time in/out of memory and sleep time
 248                  * (if sleeping).  We ignore overflow; with 16-bit int's
 249                  * (remember them?) overflow takes 45 days.
 250                  * Disabled check:  if (p->p_stat == SWAIT)
 251                  *                          continue;
 252                  */
 253                 mtx_lock_spin(&sched_lock);
 254                 p->p_swtime++;
 255                 if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
 256                         p->p_slptime++;
                     /* Decay the CPU percentage; ccpu is scaled by FSCALE. */
 257                 p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
 258                 /*
 259                  * If the process has slept the entire second,
 260                  * stop recalculating its priority until it wakes up.
 261                  */
 262                 if (p->p_slptime > 1) {
 263                         mtx_unlock_spin(&sched_lock);
 264                         continue;
 265                 }
 266
 267                 /*
 268                  * prevent state changes and protect run queue
                      *
                      * NOTE(review): splhigh() is entered after sched_lock is
                      * taken but splx() below runs after sched_lock has been
                      * released, so the two are not strictly nested --
                      * confirm whether that matters on this branch.
 269                  */
 270                 s = splhigh();
 271
 272                 /*
 273                  * p_pctcpu is only for ps.
 274                  */
 275 #if     (FSHIFT >= CCPU_SHIFT)
 276                 p->p_pctcpu += (realstathz == 100)?
 277                         ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
 278                         100 * (((fixpt_t) p->p_cpticks)
 279                                 << (FSHIFT - CCPU_SHIFT)) / realstathz;
 280 #else
 281                 p->p_pctcpu += ((FSCALE - ccpu) *
 282                         (p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
 283 #endif
                     /* Start a fresh tick count for the next period. */
 284                 p->p_cpticks = 0;
 285                 p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
 286                 resetpriority(p);
                     /*
                      * For processes whose level is PUSER or above, adopt
                      * pri_user as the effective level.  A runnable,
                      * in-memory process other than curproc (and, on SMP,
                      * one not currently on a CPU) whose new level falls
                      * into a different run queue (level / RQ_PPQ) is taken
                      * off its queue and re-inserted at the new level.
                      */
 287                 if (p->p_pri.pri_level >= PUSER) {
 288                         if ((p != curproc) &&
 289 #ifdef SMP
 290                             p->p_oncpu == NOCPU &&      /* idle */
 291 #endif
 292                             p->p_stat == SRUN &&
 293                             (p->p_sflag & PS_INMEM) &&
 294                             (p->p_pri.pri_level / RQ_PPQ) !=
 295                             (p->p_pri.pri_user / RQ_PPQ)) {
 296                                 remrunqueue(p);
 297                                 p->p_pri.pri_level = p->p_pri.pri_user;
 298                                 setrunqueue(p);
 299                         } else
 300                                 p->p_pri.pri_level = p->p_pri.pri_user;
 301                 }
 302                 mtx_unlock_spin(&sched_lock);
 303                 splx(s);
 304         }
 305         sx_sunlock(&allproc_lock);
 306         vmmeter();
 307         wakeup((caddr_t)&lbolt);
             /* Run again in hz ticks. */
 308         callout_reset(&schedcpu_callout, hz, schedcpu, NULL);
 309 }
 310
 311 /*
 312  * Recalculate the priority of a process after it has slept for a while.
 313  * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 314  * least six times the loadfactor will decay p_estcpu to zero.
 315  */
 316 void
 317 updatepri(p)
 318         register struct proc *p;
 319 {
 320         register unsigned int newcpu = p->p_estcpu;
 321         register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
 322
 323         if (p->p_slptime > 5 * loadfac)
 324                 p->p_estcpu = 0;
 325         else {
 326                 p->p_slptime--; /* the first time was done in schedcpu */
 327                 while (newcpu && --p->p_slptime)
 328                         newcpu = decay_cpu(loadfac, newcpu);
 329                 p->p_estcpu = newcpu;
 330         }
 331         resetpriority(p);
 332 }
 333
 334 /*
 335  * We're only looking at 7 bits of the address; everything is
 336  * aligned to 4, lots of things are aligned to greater powers
 337  * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
      *
      * slpque[] holds one tail queue of processes per hash bucket.
      * LOOKUP() turns a wait channel address into a bucket index by
      * discarding its low 8 bits and masking with TABLESIZE - 1.
 338  */
 339 #define TABLESIZE       128
 340 static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
 341 #define LOOKUP(x)       (((intptr_t)(x) >> 8) & (TABLESIZE - 1))
 342
 343 void
 344 sleepinit(void)
 345 {
 346         int i;
 347
 348         sched_quantum = hz/10;
 349         hogticks = 2 * sched_quantum;
 350         for (i = 0; i < TABLESIZE; i++)
 351                 TAILQ_INIT(&slpque[i]);
 352 }
 353
 354 /*
 355  * General sleep call.  Suspends the current process until a wakeup is
 356  * performed on the specified identifier.  The process will then be made
 357  * runnable with the specified priority.  Sleeps at most timo/hz seconds
 358  * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
 359  * before and after sleeping, else signals are not checked.  Returns 0 if
 360  * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 361  * signal needs to be delivered, ERESTART is returned if the current system
 362  * call should be restarted if possible, and EINTR is returned if the system
 363  * call should be interrupted by the signal (return EINTR).
 364  *
 365  * The mutex argument is exited before the caller is suspended, and
 366  * entered before msleep returns.  If priority includes the PDROP
 367  * flag the mutex is not entered before returning.
 368  */
 369 int
 370 msleep(ident, mtx, priority, wmesg, timo)
 371         void *ident;
 372         struct mtx *mtx;
 373         int priority, timo;
 374         const char *wmesg;
 375 {
 376         struct proc *p = curproc;
 377         int sig, catch = priority & PCATCH;
 378         int rval = 0;
 379         WITNESS_SAVE_DECL(mtx);
 380
 381 #ifdef KTRACE
 382         if (p && KTRPOINT(p, KTR_CSW))
 383                 ktrcsw(p->p_tracep, 1, 0);
 384 #endif
 385         WITNESS_SLEEP(0, &mtx->mtx_object);
 386         mtx_lock_spin(&sched_lock);
 387         if (cold || panicstr) {
 388                 /*
 389                  * After a panic, or during autoconfiguration,
 390                  * just give interrupts a chance, then just return;
 391                  * don't run any other procs or panic below,
 392                  * in case this is the idle process and already asleep.
 393                  */
 394                 if (mtx != NULL && priority & PDROP)
 395                         mtx_unlock_flags(mtx, MTX_NOSWITCH);
 396                 mtx_unlock_spin(&sched_lock);
 397                 return (0);
 398         }
 399
 400         DROP_GIANT_NOSWITCH();
 401
 402         if (mtx != NULL) {
 403                 mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
 404                 WITNESS_SAVE(&mtx->mtx_object, mtx);
 405                 mtx_unlock_flags(mtx, MTX_NOSWITCH);
 406                 if (priority & PDROP)
 407                         mtx = NULL;
 408         }
 409
 410         KASSERT(p != NULL, ("msleep1"));
 411         KASSERT(ident != NULL && p->p_stat == SRUN, ("msleep"));
 412         /*
 413          * Process may be sitting on a slpque if asleep() was called, remove
 414          * it before re-adding.
 415          */
 416         if (p->p_wchan != NULL)
 417                 unsleep(p);
 418
 419         p->p_wchan = ident;
 420         p->p_wmesg = wmesg;
 421         p->p_slptime = 0;
 422         p->p_pri.pri_level = priority & PRIMASK;
 423         CTR4(KTR_PROC, "msleep: proc %p (pid %d, %s), schedlock %p",
 424                 p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
 425         TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
 426         if (timo)
 427                 callout_reset(&p->p_slpcallout, timo, endtsleep, p);
 428         /*
 429          * We put ourselves on the sleep queue and start our timeout
 430          * before calling CURSIG, as we could stop there, and a wakeup
 431          * or a SIGCONT (or both) could occur while we were stopped.
 432          * A SIGCONT would cause us to be marked as SSLEEP
 433          * without resuming us, thus we must be ready for sleep
 434          * when CURSIG is called.  If the wakeup happens while we're
 435          * stopped, p->p_wchan will be 0 upon return from CURSIG.
 436          */
 437         if (catch) {
 438                 CTR4(KTR_PROC,
 439                         "msleep caught: proc %p (pid %d, %s), schedlock %p",
 440                         p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
 441                 p->p_sflag |= PS_SINTR;
 442                 mtx_unlock_spin(&sched_lock);
 443                 if ((sig = CURSIG(p))) {
 444                         mtx_lock_spin(&sched_lock);
 445                         if (p->p_wchan)
 446                                 unsleep(p);
 447                         p->p_stat = SRUN;
 448                         goto resume;
 449                 }
 450                 mtx_lock_spin(&sched_lock);
 451                 if (p->p_wchan == NULL) {
 452                         catch = 0;
 453                         goto resume;
 454                 }
 455         } else
 456                 sig = 0;
 457         p->p_stat = SSLEEP;
 458         p->p_stats->p_ru.ru_nvcsw++;
 459         mi_switch();
 460         CTR4(KTR_PROC,
 461                 "msleep resume: proc %p (pid %d, %s), schedlock %p",
 462                 p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
 463 resume:
 464         p->p_sflag &= ~PS_SINTR;
 465         if (p->p_sflag & PS_TIMEOUT) {
 466                 p->p_sflag &= ~PS_TIMEOUT;
 467                 if (sig == 0) {
 468 #ifdef KTRACE
 469                         if (KTRPOINT(p, KTR_CSW))
 470                                 ktrcsw(p->p_tracep, 0, 0);
 471 #endif
 472                         rval = EWOULDBLOCK;
 473                         mtx_unlock_spin(&sched_lock);
 474                         goto out;
 475                 }
 476         } else if (timo)
 477                 callout_stop(&p->p_slpcallout);
 478         mtx_unlock_spin(&sched_lock);
 479
 480         if (catch && (sig != 0 || (sig = CURSIG(p)))) {
 481 #ifdef KTRACE
 482                 if (KTRPOINT(p, KTR_CSW))
 483                         ktrcsw(p->p_tracep, 0, 0);
 484 #endif
 485                 PROC_LOCK(p);
 486                 if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
 487                         rval = EINTR;
 488                 else
 489                         rval = ERESTART;
 490                 PROC_UNLOCK(p);
 491                 goto out;
 492         }
 493 out:
 494 #ifdef KTRACE
 495         if (KTRPOINT(p, KTR_CSW))
 496                 ktrcsw(p->p_tracep, 0, 0);
 497 #endif
 498         PICKUP_GIANT();
 499         if (mtx != NULL) {
 500                 mtx_lock(mtx);
 501                 WITNESS_RESTORE(&mtx->mtx_object, mtx);
 502         }
 503         return (rval);
 504 }
 505
 506 /*
 507  * asleep() - async sleep call.  Place process on wait queue and return
 508  * immediately without blocking.  The process stays runnable until mawait()
 509  * is called.  If ident is NULL, remove process from wait queue if it is still
 510  * on one.
 511  *
 512  * Only the most recent sleep condition is effective when making successive
 513  * calls to asleep() or when calling msleep().
 514  *
 515  * The timeout, if any, is not initiated until mawait() is called.  The sleep
 516  * priority, signal, and timeout is specified in the asleep() call but may be
 517  * overriden in the mawait() call.
 518  *
 519  * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
 520  */
 521
 522 int
 523 asleep(void *ident, int priority, const char *wmesg, int timo)
 524 {
 525         struct proc *p = curproc;
 526         int s;
 527
 528         /*
 529          * obtain sched_lock while manipulating sleep structures and slpque.
 530          *
 531          * Remove preexisting wait condition (if any) and place process
 532          * on appropriate slpque, but do not put process to sleep.
 533          */
 534
 535         s = splhigh();
 536         mtx_lock_spin(&sched_lock);
 537
 538         if (p->p_wchan != NULL)
 539                 unsleep(p);
 540
 541         if (ident) {
 542                 p->p_wchan = ident;
 543                 p->p_wmesg = wmesg;
 544                 p->p_slptime = 0;
 545                 p->p_asleep.as_priority = priority;
 546                 p->p_asleep.as_timo = timo;
 547                 TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
 548         }
 549
 550         mtx_unlock_spin(&sched_lock);
 551         splx(s);
 552
 553         return(0);
 554 }
 555
 556 /*
 557  * mawait() - wait for async condition to occur.   The process blocks until
 558  * wakeup() is called on the most recent asleep() address.  If wakeup is called
 559  * prior to mawait(), mawait() winds up being a NOP.
 560  *
 561  * If mawait() is called more than once (without an intervening asleep() call),
 562  * mawait() is still effectively a NOP but it calls mi_switch() to give other
 563  * processes some cpu before returning.  The process is left runnable.
      *
      * mtx, if not NULL, is released before sleeping and re-acquired before
      * returning unless priority contains PDROP.  A negative priority or
      * timo means "use the value recorded by asleep()".  Returns 0,
      * EWOULDBLOCK on timeout, or EINTR/ERESTART when PCATCH is set and a
      * signal is pending.
 564  *
 565  * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>>
 566  */
 567
 568 int
 569 mawait(struct mtx *mtx, int priority, int timo)
 570 {
 571         struct proc *p = curproc;
 572         int rval = 0;
 573         int s;
 574         WITNESS_SAVE_DECL(mtx);
 575
             /*
              * NOTE(review): &mtx->mtx_object is formed here before the
              * mtx != NULL test below; whether that is harmless depends on
              * how WITNESS_SLEEP expands -- confirm.
              */
 576         WITNESS_SLEEP(0, &mtx->mtx_object);
 577         mtx_lock_spin(&sched_lock);
 578         DROP_GIANT_NOSWITCH();
 579         if (mtx != NULL) {
 580                 mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
 581                 WITNESS_SAVE(&mtx->mtx_object, mtx);
 582                 mtx_unlock_flags(mtx, MTX_NOSWITCH);
                     /* With PDROP the mutex is not re-acquired on return. */
 583                 if (priority & PDROP)
 584                         mtx = NULL;
 585         }
 586
 587         s = splhigh();
 588
             /* Still queued by asleep(): go through with the sleep. */
 589         if (p->p_wchan != NULL) {
 590                 int sig;
 591                 int catch;
 592
 593                 /*
 594                  * The call to mawait() can override defaults specified in
 595                  * the original asleep().
 596                  */
 597                 if (priority < 0)
 598                         priority = p->p_asleep.as_priority;
 599                 if (timo < 0)
 600                         timo = p->p_asleep.as_timo;
 601
 602                 /*
 603                  * Install timeout
 604                  */
 605
 606                 if (timo)
 607                         callout_reset(&p->p_slpcallout, timo, endtsleep, p);
 608
 609                 sig = 0;
 610                 catch = priority & PCATCH;
 611
 612                 if (catch) {
 613                         p->p_sflag |= PS_SINTR;
                             /* sched_lock is dropped around the CURSIG() call. */
 614                         mtx_unlock_spin(&sched_lock);
 615                         if ((sig = CURSIG(p))) {
 616                                 mtx_lock_spin(&sched_lock);
 617                                 if (p->p_wchan)
 618                                         unsleep(p);
 619                                 p->p_stat = SRUN;
 620                                 goto resume;
 621                         }
 622                         mtx_lock_spin(&sched_lock);
                             /* Woken up in the meantime: do not sleep. */
 623                         if (p->p_wchan == NULL) {
 624                                 catch = 0;
 625                                 goto resume;
 626                         }
 627                 }
 628                 p->p_stat = SSLEEP;
 629                 p->p_stats->p_ru.ru_nvcsw++;
 630                 mi_switch();
 631 resume:
 632
 633                 splx(s);
 634                 p->p_sflag &= ~PS_SINTR;
 635                 if (p->p_sflag & PS_TIMEOUT) {
 636                         p->p_sflag &= ~PS_TIMEOUT;
 637                         if (sig == 0) {
                                     /*
                                      * NOTE(review): ktrcsw() is called here
                                      * while sched_lock is still held (it is
                                      * released two statements below) --
                                      * confirm that this is permitted.
                                      */
 638 #ifdef KTRACE
 639                                 if (KTRPOINT(p, KTR_CSW))
 640                                         ktrcsw(p->p_tracep, 0, 0);
 641 #endif
 642                                 rval = EWOULDBLOCK;
 643                                 mtx_unlock_spin(&sched_lock);
 644                                 goto out;
 645                         }
 646                 } else if (timo)
 647                         callout_stop(&p->p_slpcallout);
 648                 mtx_unlock_spin(&sched_lock);
 649
 650                 if (catch && (sig != 0 || (sig = CURSIG(p)))) {
 651 #ifdef KTRACE
 652                         if (KTRPOINT(p, KTR_CSW))
 653                                 ktrcsw(p->p_tracep, 0, 0);
 654 #endif
 655                         PROC_LOCK(p);
 656                         if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
 657                                 rval = EINTR;
 658                         else
 659                                 rval = ERESTART;
 660                         PROC_UNLOCK(p);
 661                         goto out;
 662                 }
 663 #ifdef KTRACE
 664                 if (KTRPOINT(p, KTR_CSW))
 665                         ktrcsw(p->p_tracep, 0, 0);
 666 #endif
 667         } else {
 668                 /*
 669                  * If as_priority is 0, mawait() has been called without an
 670                  * intervening asleep().  We are still effectively a NOP,
 671                  * but we call mi_switch() for safety.
 672                  */
 673
 674                 if (p->p_asleep.as_priority == 0) {
 675                         p->p_stats->p_ru.ru_nvcsw++;
 676                         mi_switch();
 677                 }
 678                 mtx_unlock_spin(&sched_lock);
 679                 splx(s);
 680         }
 681
 682         /*
 683          * clear p_asleep.as_priority as an indication that mawait() has been
 684          * called.  If mawait() is called again without an intervening asleep(),
 685          * mawait() is still effectively a NOP but the above mi_switch() code
 686          * is triggered as a safety.
              *
              * NOTE(review): the timeout and signal paths above jump straight
              * to `out' and therefore skip this reset -- confirm whether that
              * is intended.
 687          */
 688         p->p_asleep.as_priority = 0;
 689
 690 out:
 691         PICKUP_GIANT();
 692         if (mtx != NULL) {
 693                 mtx_lock(mtx);
 694                 WITNESS_RESTORE(&mtx->mtx_object, mtx);
 695         }
 696         return (rval);
 697 }
 698
 699 /*
 700  * Implement timeout for msleep or asleep()/mawait()
 701  *
 702  * If process hasn't been awakened (wchan non-zero),
 703  * set timeout flag and undo the sleep.  If proc
 704  * is stopped, just unsleep so it will remain stopped.
 705  * MP-safe, called without the Giant mutex.
 706  */
 707 static void
 708 endtsleep(arg)
 709         void *arg;
 710 {
 711         register struct proc *p;
 712         int s;
 713
 714         p = (struct proc *)arg;
 715         CTR4(KTR_PROC,
 716                 "endtsleep: proc %p (pid %d, %s), schedlock %p",
 717                 p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
 718         s = splhigh();
 719         mtx_lock_spin(&sched_lock);
 720         if (p->p_wchan) {
 721                 if (p->p_stat == SSLEEP)
 722                         setrunnable(p);
 723                 else
 724                         unsleep(p);
 725                 p->p_sflag |= PS_TIMEOUT;
 726         }
 727         mtx_unlock_spin(&sched_lock);
 728         splx(s);
 729 }
 730
 731 /*
 732  * Remove a process from its wait queue
 733  */
 734 void
 735 unsleep(p)
 736         register struct proc *p;
 737 {
 738         int s;
 739
 740         s = splhigh();
 741         mtx_lock_spin(&sched_lock);
 742         if (p->p_wchan) {
 743                 TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_slpq);
 744                 p->p_wchan = NULL;
 745         }
 746         mtx_unlock_spin(&sched_lock);
 747         splx(s);
 748 }
 749
 750 /*
 751  * Make all processes sleeping on the specified identifier runnable.
      *
      * Scans, with sched_lock held, the sleep queue bucket that ident hashes
      * to.  Every process whose p_wchan equals ident is unlinked and has its
      * p_wchan cleared.  If it is in state SSLEEP it is also marked SRUN and
      * either placed on a run queue (PS_INMEM set) or flagged PS_SWAPINREQ
      * followed by a wakeup on &proc0; the scan then starts over from the
      * head of the bucket.
 752  */
 753 void
 754 wakeup(ident)
 755         register void *ident;
 756 {
 757         register struct slpquehead *qp;
 758         register struct proc *p;
 759         int s;
 760
 761         s = splhigh();
 762         mtx_lock_spin(&sched_lock);
 763         qp = &slpque[LOOKUP(ident)];
 764 restart:
 765         TAILQ_FOREACH(p, qp, p_slpq) {
                     /* Several wait channels can share one bucket. */
 766                 if (p->p_wchan == ident) {
 767                         TAILQ_REMOVE(qp, p, p_slpq);
 768                         p->p_wchan = NULL;
 769                         if (p->p_stat == SSLEEP) {
 770                                 /* OPTIMIZED EXPANSION OF setrunnable(p); */
 771                                 CTR4(KTR_PROC,
 772                                         "wakeup: proc %p (pid %d, %s), schedlock %p",
 773                                         p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
 774                                 if (p->p_slptime > 1)
 775                                         updatepri(p);
 776                                 p->p_slptime = 0;
 777                                 p->p_stat = SRUN;
 778                                 if (p->p_sflag & PS_INMEM) {
 779                                         setrunqueue(p);
 780                                         maybe_resched(p);
 781                                 } else {
                                             /* Recursive call, made with sched_lock held. */
 782                                         p->p_sflag |= PS_SWAPINREQ;
 783                                         wakeup((caddr_t)&proc0);
 784                                 }
 785                                 /* END INLINE EXPANSION */
                                     /*
                                      * p is no longer on the queue, so rescan
                                      * from the head rather than follow its
                                      * link.
                                      */
 786                                 goto restart;
 787                         }
                             /*
                              * NOTE(review): when p was not in SSLEEP the loop
                              * advances through the link of the element that
                              * was just removed; this presumably relies on
                              * TAILQ_REMOVE leaving that link intact --
                              * confirm against <sys/queue.h>.
                              */
 788                 }
 789         }
 790         mtx_unlock_spin(&sched_lock);
 791         splx(s);
 792 }
 793
 794 /*
 795  * Make a process sleeping on the specified identifier runnable.
 796  * May wake more than one process if a target process is currently
 797  * swapped out.
      *
      * Scans, with sched_lock held, the sleep queue bucket that ident hashes
      * to and stops at the first matching process that is in state SSLEEP
      * and in memory, after putting it on a run queue.  Matching processes
      * met before that are unlinked and have p_wchan cleared as well; those
      * in SSLEEP but swapped out are marked SRUN and flagged PS_SWAPINREQ,
      * with a wakeup on &proc0.
 798  */
 799 void
 800 wakeup_one(ident)
 801         register void *ident;
 802 {
 803         register struct slpquehead *qp;
 804         register struct proc *p;
 805         int s;
 806
 807         s = splhigh();
 808         mtx_lock_spin(&sched_lock);
 809         qp = &slpque[LOOKUP(ident)];
 810
             /*
              * NOTE(review): unlike wakeup(), this loop never restarts; after
              * a removal it advances through the link of the removed element,
              * which presumably relies on TAILQ_REMOVE leaving that link
              * intact -- confirm against <sys/queue.h>.
              */
 811         TAILQ_FOREACH(p, qp, p_slpq) {
 812                 if (p->p_wchan == ident) {
 813                         TAILQ_REMOVE(qp, p, p_slpq);
 814                         p->p_wchan = NULL;
 815                         if (p->p_stat == SSLEEP) {
 816                                 /* OPTIMIZED EXPANSION OF setrunnable(p); */
 817                                 CTR4(KTR_PROC,
 818                                         "wakeup1: proc %p (pid %d, %s), schedlock %p",
 819                                         p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
 820                                 if (p->p_slptime > 1)
 821                                         updatepri(p);
 822                                 p->p_slptime = 0;
 823                                 p->p_stat = SRUN;
 824                                 if (p->p_sflag & PS_INMEM) {
 825                                         setrunqueue(p);
 826                                         maybe_resched(p);
                                             /* One in-memory process is runnable: done. */
 827                                         break;
 828                                 } else {
                                             /* Swapped out: request swap-in, keep looking. */
 829                                         p->p_sflag |= PS_SWAPINREQ;
 830                                         wakeup((caddr_t)&proc0);
 831                                 }
 832                                 /* END INLINE EXPANSION */
 833                         }
 834                 }
 835         }
 836         mtx_unlock_spin(&sched_lock);
 837         splx(s);
 838 }
 839
 840 /*
 841  * The machine independent parts of mi_switch().
 842  * Must be called at splstatclock() or higher.
 843  */
 844 void
 845 mi_switch()
 846 {
 847         struct timeval new_switchtime;
 848         register struct proc *p = curproc;      /* XXX */
 849 #if 0
 850         register struct rlimit *rlim;
 851 #endif
 852         u_int sched_nest;
 853
 854         mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
 855
 856         /*
 857          * Compute the amount of time during which the current
 858          * process was running, and add that to its total so far.
 859          */
 860         microuptime(&new_switchtime);
 861         if (timevalcmp(&new_switchtime, PCPU_PTR(switchtime), <)) {
 862 #if 0
 863                 /* XXX: This doesn't play well with sched_lock right now. */
 864                 printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n",
 865                     PCPU_GET(switchtime.tv_sec), PCPU_GET(switchtime.tv_usec),
 866                     new_switchtime.tv_sec, new_switchtime.tv_usec);
 867 #endif
 868                 new_switchtime = PCPU_GET(switchtime);
 869         } else {
 870                 p->p_runtime += (new_switchtime.tv_usec - PCPU_GET(switchtime.tv_usec)) +
 871                     (new_switchtime.tv_sec - PCPU_GET(switchtime.tv_sec)) *
 872                     (int64_t)1000000;
 873         }
 874
 875 #if 0
 876         /*
 877          * Check if the process exceeds its cpu resource allocation.
 878          * If over max, kill it.
 879          *
 880          * XXX drop sched_lock, pickup Giant
 881          */
 882         if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
 883             p->p_runtime > p->p_limit->p_cpulimit) {
 884                 rlim = &p->p_rlimit[RLIMIT_CPU];
 885                 if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) {
 886                         mtx_unlock_spin(&sched_lock);
 887                         PROC_LOCK(p);
 888                         killproc(p, "exceeded maximum CPU limit");
 889                         mtx_lock_spin(&sched_lock);
 890                         PROC_UNLOCK_NOSWITCH(p);
 891                 } else {
 892                         mtx_unlock_spin(&sched_lock);
 893                         PROC_LOCK(p);
 894                         psignal(p, SIGXCPU);
 895                         mtx_lock_spin(&sched_lock);
 896                         PROC_UNLOCK_NOSWITCH(p);
 897                         if (rlim->rlim_cur < rlim->rlim_max) {
 898                                 /* XXX: we should make a private copy */
 899                                 rlim->rlim_cur += 5;
 900                         }
 901                 }
 902         }
 903 #endif
 904
 905         /*
 906          * Pick a new current process and record its start time.
 907          */
 908         cnt.v_swtch++;
 909         PCPU_SET(switchtime, new_switchtime);
 910         CTR4(KTR_PROC, "mi_switch: old proc %p (pid %d, %s), schedlock %p",
 911                 p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
 912         sched_nest = sched_lock.mtx_recurse;
 913         curproc->p_lastcpu = curproc->p_oncpu;
 914         curproc->p_oncpu = NOCPU;
 915         clear_resched(curproc);
 916         cpu_switch();
 917         curproc->p_oncpu = PCPU_GET(cpuid);
 918         sched_lock.mtx_recurse = sched_nest;
 919         sched_lock.mtx_lock = (uintptr_t)curproc;
 920         CTR4(KTR_PROC, "mi_switch: new proc %p (pid %d, %s), schedlock %p",
 921                 p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock);
 922         if (PCPU_GET(switchtime.tv_sec) == 0)
 923                 microuptime(PCPU_PTR(switchtime));
 924         PCPU_SET(switchticks, ticks);
 925 }
 926
 927 /*
 928  * Change process state to be runnable,
 929  * placing it on the run queue if it is in memory,
 930  * and awakening the swapper if it isn't in memory.
 931  */
 932 void
 933 setrunnable(p)
 934         register struct proc *p;
 935 {
 936         register int s;
 937
 938         s = splhigh();
 939         mtx_lock_spin(&sched_lock);
 940         switch (p->p_stat) {
 941         case 0:
 942         case SRUN:
 943         case SZOMB:
 944         case SWAIT:
 945         default:
 946                 panic("setrunnable");
 947         case SSTOP:
 948         case SSLEEP:                    /* e.g. when sending signals */
 949                 if (p->p_sflag & PS_CVWAITQ)
 950                         cv_waitq_remove(p);
 951                 else
 952                         unsleep(p);
 953                 break;
 954
 955         case SIDL:
 956                 break;
 957         }
 958         p->p_stat = SRUN;
 959         if (p->p_sflag & PS_INMEM)
 960                 setrunqueue(p);
 961         splx(s);
 962         if (p->p_slptime > 1)
 963                 updatepri(p);
 964         p->p_slptime = 0;
 965         if ((p->p_sflag & PS_INMEM) == 0) {
 966                 p->p_sflag |= PS_SWAPINREQ;
 967                 wakeup((caddr_t)&proc0);
 968         }
 969         else
 970                 maybe_resched(p);
 971         mtx_unlock_spin(&sched_lock);
 972 }
 973
 974 /*
 975  * Compute the priority of a process when running in user mode.
 976  * Arrange to reschedule if the resulting priority is better
 977  * than that of the current process.
 978  */
 979 void
 980 resetpriority(p)
 981         register struct proc *p;
 982 {
 983         register unsigned int newpriority;
 984
 985         mtx_lock_spin(&sched_lock);
 986         if (p->p_pri.pri_class == PRI_TIMESHARE) {
 987                 newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT +
 988                     NICE_WEIGHT * (p->p_nice - PRIO_MIN);
 989                 newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
 990                     PRI_MAX_TIMESHARE);
 991                 p->p_pri.pri_user = newpriority;
 992         }
 993         maybe_resched(p);
 994         mtx_unlock_spin(&sched_lock);
 995 }
 996
 997 /* ARGSUSED */
 998 static void
 999 sched_setup(dummy)
1000         void *dummy;
1001 {
1002
1003         callout_init(&schedcpu_callout, 1);
1004         callout_init(&roundrobin_callout, 0);
1005
1006         /* Kick off timeout driven events by calling first time. */
1007         roundrobin(NULL);
1008         schedcpu(NULL);
1009 }
1010
1011 /*
1012  * We adjust the priority of the current process.  The priority of
1013  * a process gets worse as it accumulates CPU time.  The cpu usage
1014  * estimator (p_estcpu) is increased here.  resetpriority() will
1015  * compute a different priority each time p_estcpu increases by
1016  * INVERSE_ESTCPU_WEIGHT
1017  * (until MAXPRI is reached).  The cpu usage estimator ramps up
1018  * quite quickly when the process is running (linearly), and decays
1019  * away exponentially, at a rate which is proportionally slower when
1020  * the system is busy.  The basic principle is that the system will
1021  * 90% forget that the process used a lot of CPU time in 5 * loadav
1022  * seconds.  This causes the system to favor processes which haven't
1023  * run much recently, and to round-robin among other processes.
1024  */
1025 void
1026 schedclock(p)
1027         struct proc *p;
1028 {
1029
1030         p->p_cpticks++;
1031         p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
1032         if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
1033                 resetpriority(p);
1034                 if (p->p_pri.pri_level >= PUSER)
1035                         p->p_pri.pri_level = p->p_pri.pri_user;
1036         }
1037 }
1038
1039 /*
1040  * General purpose yield system call
1041  */
1042 int
1043 yield(struct proc *p, struct yield_args *uap)
1044 {
1045         int s;
1046
1047         p->p_retval[0] = 0;
1048
1049         s = splhigh();
1050         mtx_lock_spin(&sched_lock);
1051         DROP_GIANT_NOSWITCH();
1052         p->p_pri.pri_level = PRI_MAX_TIMESHARE;
1053         setrunqueue(p);
1054         p->p_stats->p_ru.ru_nvcsw++;
1055         mi_switch();
1056         mtx_unlock_spin(&sched_lock);
1057         PICKUP_GIANT();
1058         splx(s);
1059
1060         return (0);
1061 }