/*-
 * Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org>
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
 * $Id: kern_clock.c,v 1.66 1998/04/06 08:26:03 phk Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/dkstat.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/timex.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <sys/sysctl.h>

#include <machine/cpu.h>
#include <machine/limits.h>

#ifdef GPROF
#include <sys/gmon.h>
#endif

#if defined(SMP) && defined(BETTER_CLOCK)
#include <machine/smp.h>
#endif

static void initclocks __P((void *dummy));
SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)

static void tco_forward __P((void));
static void tco_setscales __P((struct timecounter *tc));

/* Some of these don't belong here, but it's easiest to concentrate them. */
#if defined(SMP) && defined(BETTER_CLOCK)
long cp_time[CPUSTATES];
#else
static long cp_time[CPUSTATES];
#endif

long dk_seek[DK_NDRIVE];
static long dk_time[DK_NDRIVE];	/* time busy (in statclock ticks) */
long dk_wds[DK_NDRIVE];
long dk_wpms[DK_NDRIVE];
long dk_xfer[DK_NDRIVE];

char dk_names[DK_NDRIVE][DK_NAMELEN];

struct timecounter *timecounter;

/*
 * Clock handling routines.
 *
 * This code is written to operate with two timers that run independently of
 * each other.
 *
 * The main timer, running hz times per second, is used to trigger interval
 * timers, timeouts and rescheduling as needed.
 *
 * The second timer handles kernel and user profiling,
 * and does resource use estimation.  If the second timer is programmable,
 * it is randomized to avoid aliasing between the two clocks.  For example,
 * the randomization prevents an adversary from always giving up the cpu
 * just before its quantum expires.  Otherwise, it would never accumulate
 * cpu ticks.  The mean frequency of the second timer is stathz.
 *
 * If no second timer exists, stathz will be zero; in this case we drive
 * profiling and statistics off the main clock.  This WILL NOT be accurate;
 * do not do it unless absolutely necessary.
 *
 * The statistics clock may (or may not) be run at a higher rate while
 * profiling.  This profile clock runs at profhz.  We require that profhz
 * be an integral multiple of stathz.
 *
 * If the statistics clock is running fast, it must be divided by the ratio
 * profhz/stathz for statistics.  (For profiling, every tick counts.)
 *
 * Time-of-day is maintained using a "timecounter", which may or may
 * not be related to the hardware generating the above mentioned
 * interrupts.
 */
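
/*
 * For example (an illustrative sketch, not values required by this code):
 * with stathz = 128 and profhz = 1024, initclocks() below computes
 * psratio = 1024 / 128 = 8, so while any process is being profiled the
 * statistics code only acts on every 8th statclock() invocation (the
 * pscnt/psdiv machinery), while the profiling hooks see every tick.
 */
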
static int	profprocs;

static int	psdiv, pscnt;		/* prof => stat divider */
int	psratio;			/* ratio: prof / stat */

/*
 * Initialize clock frequencies and start both clocks running.
 */

	/*
	 * Set divisors to 1 (normal case) and let the machine-specific
	 * code do its bit.
	 */

	/*
	 * Compute profhz/stathz, and fix profhz if needed.
	 */
	i = stathz ? stathz : hz;
	if (profhz == 0)
		profhz = i;
	psratio = profhz / i;

/*
 * The real-time timer, interrupting hz times per second.
 */
	register struct clockframe *frame;
	register struct proc *p;
	register struct pstats *pstats;

	/*
	 * Run current process's virtual and profile time, as needed.
	 */
	if (CLKF_USERMODE(frame) &&
	    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
	    itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
		psignal(p, SIGVTALRM);
	if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
	    itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
		psignal(p, SIGPROF);

#if defined(SMP) && defined(BETTER_CLOCK)
	forward_hardclock(pscnt);
#endif

	/*
	 * If no separate statistics clock is available, run it from here.
	 */

	/*
	 * Process callouts at a very low cpu priority, so we don't keep the
	 * relatively high clock interrupt priority any longer than necessary.
	 */
	if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) {
		if (CLKF_BASEPRI(frame)) {
			/*
			 * Save the overhead of a software interrupt;
			 * it will happen as soon as we return, so do it now.
			 */
			(void)splsoftclock();
	} else if (softticks + 1 == ticks)

/*
 * Compute number of ticks in the specified amount of time.
 */
	register unsigned long ticks;
	register long sec, usec;

	/*
	 * If the number of usecs in the whole seconds part of the time
	 * difference fits in a long, then the total number of usecs will
	 * fit in an unsigned long.  Compute the total and convert it to
	 * ticks, rounding up and adding 1 to allow for the current tick
	 * to expire.  Rounding also depends on unsigned long arithmetic
	 * to avoid overflow.
	 *
	 * Otherwise, if the number of ticks in the whole seconds part of
	 * the time difference fits in a long, then convert the parts to
	 * ticks separately and add, using similar rounding methods and
	 * overflow avoidance.  This method would work in the previous
	 * case but it is slightly slower and assumes that hz is integral.
	 *
	 * Otherwise, round the time difference down to the maximum
	 * representable value.
	 *
	 * If ints have 32 bits, then the maximum value for any timeout in
	 * 10ms ticks is 248 days.
	 */
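
	/*
	 * A worked example (an illustrative sketch, assuming hz = 100 so
	 * tick = 10000 usec): a 2.5 second interval arrives as sec = 2,
	 * usec = 500000, so the first case below computes
	 * (2 * 1000000 + 500000 + 9999) / 10000 + 1 = 251 ticks; the
	 * round-up plus the extra tick cover the partially elapsed
	 * current tick.
	 */
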
		printf("tvtohz: negative time difference %ld sec %ld usec\n",
		       sec, usec);
	} else if (sec <= LONG_MAX / 1000000)
		ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
			/ tick + 1;
	else if (sec <= LONG_MAX / hz)
		ticks = sec * hz
			+ ((unsigned long)usec + (tick - 1)) / tick + 1;

/*
 * Compute number of hz until specified time.  Used to
 * compute third argument to timeout() from an absolute time.
 */
	t2.tv_sec = tv->tv_sec - t2.tv_sec;
	t2.tv_usec = tv->tv_usec - t2.tv_usec;
	return (tvtohz(&t2));

/*
 * Start profiling on a process.
 *
 * Kernel profiling passes proc0 which never exits and hence
 * keeps the profile clock running constantly.
 */
	register struct proc *p;

	if ((p->p_flag & P_PROFIL) == 0) {
		p->p_flag |= P_PROFIL;
		if (++profprocs == 1 && stathz != 0) {
			psdiv = pscnt = psratio;
			setstatclockrate(profhz);

/*
 * Stop profiling on a process.
 */
	register struct proc *p;

	if (p->p_flag & P_PROFIL) {
		p->p_flag &= ~P_PROFIL;
		if (--profprocs == 0 && stathz != 0) {
			psdiv = pscnt = 1;
			setstatclockrate(stathz);

/*
 * Statistics clock.  Grab profile sample, and if divider reaches 0,
 * do process and kernel statistics.
 */
	register struct clockframe *frame;
	register struct gmonparam *g;
	register struct proc *p;
	struct pstats *pstats;

	if (CLKF_USERMODE(frame)) {
		if (p->p_flag & P_PROFIL)
			addupc_intr(p, CLKF_PC(frame), 1);
#if defined(SMP) && defined(BETTER_CLOCK)
		forward_statclock(pscnt);
#endif
		/*
		 * Came from user mode; CPU was in user state.
		 * If this process is being profiled record the tick.
		 */
		if (p->p_nice > NZERO)

		/*
		 * Kernel statistics are just like addupc_intr, only easier.
		 */
		if (g->state == GMON_PROF_ON) {
			i = CLKF_PC(frame) - g->lowpc;
			if (i < g->textsize) {
				i /= HISTFRACTION * sizeof(*g->kcount);
#if defined(SMP) && defined(BETTER_CLOCK)
		forward_statclock(pscnt);
#endif
		/*
		 * Came from kernel mode, so we were:
		 * - handling an interrupt,
		 * - doing syscall or trap work on behalf of the current
		 *   user process, or
		 * - spinning in the idle loop.
		 * Whichever it is, charge the time as appropriate.
		 * Note that we charge interrupts to the current process,
		 * regardless of whether they are ``for'' that process,
		 * so that we know how much of its real time was spent
		 * in ``non-process'' (i.e., interrupt) work.
		 */
		if (CLKF_INTR(frame)) {
		} else if (p != NULL) {

	/*
	 * We maintain statistics shown by user-level statistics
	 * programs: the amount of time in each cpu state, and
	 * the amount of time each of DK_NDRIVE ``drives'' is busy.
	 *
	 * XXX should either run linked list of drives, or (better)
	 * grab timestamps in the start & done code.
	 */
	for (i = 0; i < DK_NDRIVE; i++)
		if (dk_busy & (1 << i))
			dk_time[i]++;

	/*
	 * We adjust the priority of the current process.  The priority of
	 * a process gets worse as it accumulates CPU time.  The cpu usage
	 * estimator (p_estcpu) is increased here.  The formula for computing
	 * priorities (in kern_synch.c) will compute a different value each
	 * time p_estcpu increases by 4.  The cpu usage estimator ramps up
	 * quite quickly when the process is running (linearly), and decays
	 * away exponentially, at a rate which is proportionally slower when
	 * the system is busy.  The basic principle is that the system will
	 * 90% forget that the process used a lot of CPU time in 5 * loadav
	 * seconds.  This causes the system to favor processes which haven't
	 * run much recently, and to round-robin among other processes.
	 */
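
	/*
	 * For example (an illustrative sketch, not a guarantee of this
	 * code): with stathz = 128, a CPU-bound process has p_estcpu
	 * bumped 128 times per second while it runs, so the "& 3" test
	 * below recomputes its priority roughly 32 times per second,
	 * steadily pushing it below competing processes that have been
	 * sleeping; the exponential decay in kern_synch.c then restores
	 * it over roughly 5 * loadav seconds once it stops running.
	 */
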
		if (++p->p_estcpu == 0)
			p->p_estcpu--;
		if ((p->p_estcpu & 3) == 0) {
			resetpriority(p);
			if (p->p_priority >= PUSER)
				p->p_priority = p->p_usrpri;
		}

		/* Update resource usage integrals and maximums. */
		if ((pstats = p->p_stats) != NULL &&
		    (ru = &pstats->p_ru) != NULL &&
		    (vm = p->p_vmspace) != NULL) {
			ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024;
			ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024;
			ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024;
			rss = vm->vm_pmap.pm_stats.resident_count *
			    PAGE_SIZE / 1024;
			if (ru->ru_maxrss < rss)
				ru->ru_maxrss = rss;
		}

/*
 * Return information about system clocks.
 */
static int
sysctl_kern_clockrate SYSCTL_HANDLER_ARGS
{
	struct clockinfo clkinfo;
	/*
	 * Construct clockinfo structure.
	 */
	clkinfo.hz = hz;
	clkinfo.tick = tick;
	clkinfo.tickadj = tickadj;
	clkinfo.profhz = profhz;
	clkinfo.stathz = stathz ? stathz : hz;
	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
}

SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
	0, 0, sysctl_kern_clockrate, "S,clockinfo", "");
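
/*
 * Userland reads this through the kern.clockrate sysctl (e.g. via
 * sysctl(8) or sysctl(3) with KERN_CLOCKRATE); as an illustrative
 * sketch, a typical i386 of this era would report hz = 100,
 * tick = 10000, profhz = 1024 and stathz = 128 in the returned
 * struct clockinfo.
 */
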
/*
 * We have four functions for looking at the clock: two for microseconds
 * and two for nanoseconds.  For each there is a fast but less precise
 * version, "get{nano|micro}time", which will return a time which is up
 * to 1/HZ previous to the call, whereas the raw version "{nano|micro}time"
 * will return a timestamp which is as precise as possible.
 */
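
/*
 * As an illustrative sketch (not a rule imposed by this file): code that
 * only needs a timestamp accurate to the last clock tick, such as file
 * access-time updates, can call getmicrotime()/getnanotime() and avoid
 * touching the hardware counter, while code that times short intervals
 * should call microtime()/nanotime() twice and subtract, paying the
 * per-call cost measured in init_timecounter() below.
 */
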
void
getmicrotime(struct timeval *tvp)
{
	struct timecounter *tc;

	tc = timecounter;
	*tvp = tc->microtime;
}

void
getnanotime(struct timespec *tsp)
{
	struct timecounter *tc;

	tc = timecounter;
	*tsp = tc->nanotime;
}

void
microtime(struct timeval *tv)
{
	struct timecounter *tc;

	tc = (struct timecounter *)timecounter;
	tv->tv_sec = tc->offset_sec;
	tv->tv_usec = tc->offset_micro;
	tv->tv_usec +=
	    ((u_int64_t)tc->get_timedelta(tc) * tc->scale_micro) >> 32;
	tv->tv_usec += boottime.tv_usec;
	tv->tv_sec += boottime.tv_sec;
	while (tv->tv_usec >= 1000000) {
		tv->tv_usec -= 1000000;
		tv->tv_sec++;
	}
}

void
nanotime(struct timespec *tv)
{
	unsigned count;
	u_int64_t delta;
	struct timecounter *tc;

	tc = (struct timecounter *)timecounter;
	tv->tv_sec = tc->offset_sec;
	count = tc->get_timedelta(tc);
	delta = tc->offset_nano;
	delta += ((u_int64_t)count * tc->scale_nano_f);
	delta >>= 32;
	delta += ((u_int64_t)count * tc->scale_nano_i);
	delta += boottime.tv_usec * 1000;
	tv->tv_sec += boottime.tv_sec;
	while (delta >= 1000000000) {
		delta -= 1000000000;
		tv->tv_sec++;
	}
	tv->tv_nsec = delta;
}

void
getmicroruntime(struct timeval *tvp)
{
	struct timecounter *tc;

	tc = timecounter;
	tvp->tv_sec = tc->offset_sec;
	tvp->tv_usec = tc->offset_micro;
}

void
getnanoruntime(struct timespec *tsp)
{
	struct timecounter *tc;

	tc = timecounter;
	tsp->tv_sec = tc->offset_sec;
	tsp->tv_nsec = tc->offset_nano >> 32;
}

void
microruntime(struct timeval *tv)
{
	struct timecounter *tc;

	tc = (struct timecounter *)timecounter;
	tv->tv_sec = tc->offset_sec;
	tv->tv_usec = tc->offset_micro;
	tv->tv_usec +=
	    ((u_int64_t)tc->get_timedelta(tc) * tc->scale_micro) >> 32;
	if (tv->tv_usec >= 1000000) {
		tv->tv_usec -= 1000000;
		tv->tv_sec++;
	}
}

void
nanoruntime(struct timespec *tv)
{
	unsigned count;
	u_int64_t delta;
	struct timecounter *tc;

	tc = (struct timecounter *)timecounter;
	tv->tv_sec = tc->offset_sec;
	count = tc->get_timedelta(tc);
	delta = tc->offset_nano;
	delta += ((u_int64_t)count * tc->scale_nano_f);
	delta >>= 32;
	delta += ((u_int64_t)count * tc->scale_nano_i);
	if (delta >= 1000000000) {
		delta -= 1000000000;
		tv->tv_sec++;
	}
	tv->tv_nsec = delta;
}

static void
tco_setscales(struct timecounter *tc)
{
	u_int64_t scale;

	scale = 1000000000LL << 32;
	if (tc->adjustment > 0)
		scale += (tc->adjustment * 1000LL) << 10;
	else
		scale -= (-tc->adjustment * 1000LL) << 10;
	scale /= tc->frequency;
	tc->scale_micro = scale / 1000;
	tc->scale_nano_f = scale & 0xffffffff;
	tc->scale_nano_i = scale >> 32;
}
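
/*
 * A worked example (an illustrative sketch, assuming an i8254-style
 * counter at 1193182 Hz and a zero NTP adjustment): scale becomes
 * (10^9 << 32) / 1193182, i.e. about 838.096 ns per count in 32.32
 * fixed point, so scale_nano_i = 838, scale_nano_f holds the
 * fractional ~0.096 ns scaled by 2^32, and scale_micro is roughly
 * 0.838 usec per count, again as a 32.32 fraction.
 */
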
static unsigned
delta_timecounter(struct timecounter *tc)
{
	return ((tc->get_timecount() - tc->offset_count) & tc->counter_mask);
}
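
/*
 * The mask handles hardware counter wraparound; as an illustrative
 * sketch, with a 16-bit counter (counter_mask = 0xffff) an offset_count
 * of 0xfff0 followed by a raw read of 0x0005 yields
 * (0x0005 - 0xfff0) & 0xffff = 0x0015, i.e. 21 counts elapsed.
 */
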
static volatile int print_tci = 1;	/* print timecounter info at boot */

void
init_timecounter(struct timecounter *tc)
{
	struct timespec ts0, ts1;
	int i;

	if (!tc->get_timedelta)
		tc->get_timedelta = delta_timecounter;

	tc->offset_count = tc->get_timecount();
	tc[0].tweak = &tc[0];
	tc[2] = tc[1] = tc[0];
	tc[1].other = &tc[2];
	tc[2].other = &tc[1];
	if (!timecounter || !strcmp(timecounter->name, "dummy"))
		timecounter = &tc[2];

	/*
	 * Figure out the cost of calling this timecounter.
	 * XXX: The 1:15 ratio is a guess at reality.
	 */
	for (i = 0; i < 16; i ++)
	for (i = 0; i < 240; i ++)
		tc->get_timedelta(tc);
	ts1.tv_sec -= ts0.tv_sec;
	tc->cost = ts1.tv_sec * 1000000000 + ts1.tv_nsec - ts0.tv_nsec;
	if (print_tci && strcmp(tc->name, "dummy"))
		printf("Timecounter \"%s\" frequency %lu Hz cost %u ns\n",
		    tc->name, tc->frequency, tc->cost);

	/* XXX: For now always start using the counter. */
	tc->offset_count = tc->get_timecount();
	tc->offset_nano = (u_int64_t)ts1.tv_nsec << 32;
	tc->offset_micro = ts1.tv_nsec / 1000;
	tc->offset_sec = ts1.tv_sec;

void
set_timecounter(struct timespec *ts)
{
	struct timespec ts2;

	nanoruntime(&ts2);
	boottime.tv_sec = ts->tv_sec - ts2.tv_sec;
	boottime.tv_usec = (ts->tv_nsec - ts2.tv_nsec) / 1000;
	if (boottime.tv_usec < 0) {
		boottime.tv_usec += 1000000;
		boottime.tv_sec--;
	}
	/* fiddle all the little crinkly bits around the fiords... */
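
	/*
	 * A sketch of the arithmetic: if the counters say the system has
	 * been running for 100.250000000 s and the time being set is
	 * 1000.100000000 s, boottime becomes 900 s and -150000 usec,
	 * which the carry above folds into 899 s + 850000 usec.
	 */
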
#if 0 /* Currently unused */
void
switch_timecounter(struct timecounter *newtc)
{
	struct timecounter *tc;
	struct timespec ts;

	if (newtc == tc || newtc == tc->other) {
	newtc->offset_sec = ts.tv_sec;
	newtc->offset_nano = (u_int64_t)ts.tv_nsec << 32;
	newtc->offset_micro = ts.tv_nsec / 1000;
	newtc->offset_count = newtc->get_timecount();
#endif

static struct timecounter *
sync_other_counter(void)
{
	struct timecounter *tc, *tco;
	unsigned delta;

	tc = timecounter->other;

	delta = tc->get_timedelta(tc);
	tc->offset_count += delta;
	tc->offset_count &= tc->counter_mask;
	tc->offset_nano += (u_int64_t)delta * tc->scale_nano_f;
	tc->offset_nano += (u_int64_t)delta * tc->scale_nano_i << 32;
	return (tc);
}

static void
tco_forward(void)
{
	struct timecounter *tc;

	tc = sync_other_counter();
	if (timedelta != 0) {
		tc->offset_nano += (u_int64_t)(tickdelta * 1000) << 32;
		timedelta -= tickdelta;
	}

	while (tc->offset_nano >= 1000000000ULL << 32) {
		tc->offset_nano -= 1000000000ULL << 32;
		tc->offset_sec++;
		tc->frequency = tc->tweak->frequency;
		tc->adjustment = tc->tweak->adjustment;
		ntp_update_second(tc);	/* XXX only needed if xntpd runs */
	}

	tc->offset_micro = (tc->offset_nano / 1000) >> 32;
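
	/*
	 * offset_nano holds nanoseconds-into-the-second in 32.32 fixed
	 * point; as an illustrative sketch, the value 123456789ULL << 32
	 * represents 123456789 ns, and the line above turns it into
	 * 123456 usec for offset_micro by dividing by 1000 before
	 * discarding the fractional 32 bits.
	 */
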
	/* Figure out the wall-clock time */
	tc->nanotime.tv_sec = tc->offset_sec + boottime.tv_sec;
	tc->nanotime.tv_nsec = (tc->offset_nano >> 32) + boottime.tv_usec * 1000;
	tc->microtime.tv_usec = tc->offset_micro + boottime.tv_usec;
	if (tc->nanotime.tv_nsec >= 1000000000) {
		tc->nanotime.tv_nsec -= 1000000000;
		tc->microtime.tv_usec -= 1000000;
		tc->nanotime.tv_sec++;
	}
	time_second = tc->microtime.tv_sec = tc->nanotime.tv_sec;

	timecounter = tc;
}

static int
sysctl_kern_timecounter_frequency SYSCTL_HANDLER_ARGS
{
	return (sysctl_handle_opaque(oidp, &timecounter->tweak->frequency,
	    sizeof(timecounter->tweak->frequency), req));
}

static int
sysctl_kern_timecounter_adjustment SYSCTL_HANDLER_ARGS
{
	return (sysctl_handle_opaque(oidp, &timecounter->tweak->adjustment,
	    sizeof(timecounter->tweak->adjustment), req));
}

SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");

SYSCTL_PROC(_kern_timecounter, OID_AUTO, frequency, CTLTYPE_INT | CTLFLAG_RW,
    0, sizeof(u_int), sysctl_kern_timecounter_frequency, "I", "");

SYSCTL_PROC(_kern_timecounter, OID_AUTO, adjustment, CTLTYPE_INT | CTLFLAG_RW,
    0, sizeof(int), sysctl_kern_timecounter_adjustment, "I", "");
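
/*
 * These appear to userland as kern.timecounter.frequency and
 * kern.timecounter.adjustment; as an illustrative sketch, an
 * administrator could nudge a miscalibrated counter with something
 * like "sysctl -w kern.timecounter.frequency=1193200" (the exact
 * value being hardware-specific, and adjustment normally being left
 * to NTP).
 */
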
/*
 * Implement a dummy timecounter which we can use until we get a real one
 * in the air.  This allows the console and other early stuff to use
 * time services.
 */
static unsigned
dummy_get_timecount(void)
{
	static u_int64_t now;

	return (++now);
}

static struct timecounter dummy_timecounter[3] = {

static void
initdummytimecounter(void *dummy)
{
	init_timecounter(dummy_timecounter);
}

SYSINIT(dummytc, SI_SUB_CONSOLE, SI_ORDER_FIRST, initdummytimecounter, NULL)