sys/kern/kern_clock.c

   1 /*-
   2  * Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org>
   3  * Copyright (c) 1982, 1986, 1991, 1993
   4  *      The Regents of the University of California.  All rights reserved.
   5  * (c) UNIX System Laboratories, Inc.
   6  * All or some portions of this file are derived from material licensed
   7  * to the University of California by American Telephone and Telegraph
   8  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   9  * the permission of UNIX System Laboratories, Inc.
  10  *
  11  * Redistribution and use in source and binary forms, with or without
  12  * modification, are permitted provided that the following conditions
  13  * are met:
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in the
  18  *    documentation and/or other materials provided with the distribution.
  19  * 3. All advertising materials mentioning features or use of this software
  20  *    must display the following acknowledgement:
  21  *      This product includes software developed by the University of
  22  *      California, Berkeley and its contributors.
  23  * 4. Neither the name of the University nor the names of its contributors
  24  *    may be used to endorse or promote products derived from this software
  25  *    without specific prior written permission.
  26  *
  27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  37  * SUCH DAMAGE.
  38  *
  39  *      @(#)kern_clock.c        8.5 (Berkeley) 1/21/94
  40  * $Id: kern_clock.c,v 1.87 1999/02/19 14:25:34 luoqi Exp $
  41  */
  42
  43 #include <sys/param.h>
  44 #include <sys/systm.h>
  45 #include <sys/dkstat.h>
  46 #include <sys/callout.h>
  47 #include <sys/kernel.h>
  48 #include <sys/proc.h>
  49 #include <sys/malloc.h>
  50 #include <sys/resourcevar.h>
  51 #include <sys/signalvar.h>
  52 #include <sys/timex.h>
  53 #include <vm/vm.h>
  54 #include <sys/lock.h>
  55 #include <vm/pmap.h>
  56 #include <vm/vm_map.h>
  57 #include <sys/sysctl.h>
  58
  59 #include <machine/cpu.h>
  60 #include <machine/limits.h>
  61
  62 #ifdef GPROF
  63 #include <sys/gmon.h>
  64 #endif
  65
  66 #if defined(SMP) && defined(BETTER_CLOCK)
  67 #include <machine/smp.h>
  68 #endif
  69
  70 /* This is where the NTIMECOUNTER option hangs out */
  71 #include "opt_ntp.h"
  72
  73 /*
  74  * Number of timecounters used to implement stable storage
  75  */
  76 #ifndef NTIMECOUNTER
  77 #define NTIMECOUNTER    5
  78 #endif
  79
  80 static MALLOC_DEFINE(M_TIMECOUNTER, "timecounter",
  81         "Timecounter stable storage");
  82
  83 static void initclocks __P((void *dummy));
  84 SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)
  85
  86 static void tco_forward __P((int force));
  87 static void tco_setscales __P((struct timecounter *tc));
  88 static __inline unsigned tco_delta __P((struct timecounter *tc));
  89
  90 /* Some of these don't belong here, but it's easiest to concentrate them. */
  91 #if defined(SMP) && defined(BETTER_CLOCK)
  92 long cp_time[CPUSTATES];
  93 #else
  94 static long cp_time[CPUSTATES];
  95 #endif
  96
  97 long tk_cancc;
  98 long tk_nin;
  99 long tk_nout;
 100 long tk_rawcc;
 101
 102 time_t time_second;
 103
 104 /*
 105  * Which update policy to use.
 106  *   0 - every tick, bad hardware may fail with "calcru negative..."
 107  *   1 - more resistent to the above hardware, but less efficient.
 108  */
 109 static int tco_method;
 110
 111 /*
 112  * Implement a dummy timecounter which we can use until we get a real one
 113  * in the air.  This allows the console and other early stuff to use
 114  * timeservices.
 115  */
 116
 117 static unsigned
 118 dummy_get_timecount(struct timecounter *tc)
 119 {
 120         static unsigned now;
 121         return (++now);
 122 }
 123
 124 static struct timecounter dummy_timecounter = {
 125         dummy_get_timecount,
 126         0,
 127         ~0u,
 128         1000000,
 129         "dummy"
 130 };
 131
 132 struct timecounter *timecounter = &dummy_timecounter;
 133
 134 /*
 135  * Clock handling routines.
 136  *
 137  * This code is written to operate with two timers that run independently of
 138  * each other.
 139  *
 140  * The main timer, running hz times per second, is used to trigger interval
 141  * timers, timeouts and rescheduling as needed.
 142  *
 143  * The second timer handles kernel and user profiling,
 144  * and does resource use estimation.  If the second timer is programmable,
 145  * it is randomized to avoid aliasing between the two clocks.  For example,
 146  * the randomization prevents an adversary from always giving up the cpu
 147  * just before its quantum expires.  Otherwise, it would never accumulate
 148  * cpu ticks.  The mean frequency of the second timer is stathz.
 149  *
 150  * If no second timer exists, stathz will be zero; in this case we drive
 151  * profiling and statistics off the main clock.  This WILL NOT be accurate;
 152  * do not do it unless absolutely necessary.
 153  *
 154  * The statistics clock may (or may not) be run at a higher rate while
 155  * profiling.  This profile clock runs at profhz.  We require that profhz
 156  * be an integral multiple of stathz.
 157  *
 158  * If the statistics clock is running fast, it must be divided by the ratio
 159  * profhz/stathz for statistics.  (For profiling, every tick counts.)
 160  *
 161  * Time-of-day is maintained using a "timecounter", which may or may
 162  * not be related to the hardware generating the above mentioned
 163  * interrupts.
 164  */
 165
 166 int     stathz;
 167 int     profhz;
 168 static int profprocs;
 169 int     ticks;
 170 static int psdiv, pscnt;                /* prof => stat divider */
 171 int     psratio;                        /* ratio: prof / stat */
 172
 173 /*
 174  * Initialize clock frequencies and start both clocks running.
 175  */
 176 /* ARGSUSED*/
 177 static void
 178 initclocks(dummy)
 179         void *dummy;
 180 {
 181         register int i;
 182
 183         /*
 184          * Set divisors to 1 (normal case) and let the machine-specific
 185          * code do its bit.
 186          */
 187         psdiv = pscnt = 1;
 188         cpu_initclocks();
 189
 190         /*
 191          * Compute profhz/stathz, and fix profhz if needed.
 192          */
 193         i = stathz ? stathz : hz;
 194         if (profhz == 0)
 195                 profhz = i;
 196         psratio = profhz / i;
 197 }
 198
 199 /*
 200  * The real-time timer, interrupting hz times per second.
 201  */
 202 void
 203 hardclock(frame)
 204         register struct clockframe *frame;
 205 {
 206         register struct proc *p;
 207
 208         p = curproc;
 209         if (p) {
 210                 register struct pstats *pstats;
 211
 212                 /*
 213                  * Run current process's virtual and profile time, as needed.
 214                  */
 215                 pstats = p->p_stats;
 216                 if (CLKF_USERMODE(frame) &&
 217                     timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
 218                     itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
 219                         psignal(p, SIGVTALRM);
 220                 if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
 221                     itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
 222                         psignal(p, SIGPROF);
 223         }
 224
 225 #if defined(SMP) && defined(BETTER_CLOCK)
 226         forward_hardclock(pscnt);
 227 #endif
 228
 229         /*
 230          * If no separate statistics clock is available, run it from here.
 231          */
 232         if (stathz == 0)
 233                 statclock(frame);
 234
 235         tco_forward(0);
 236         ticks++;
 237
 238         /*
 239          * Process callouts at a very low cpu priority, so we don't keep the
 240          * relatively high clock interrupt priority any longer than necessary.
 241          */
 242         if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) {
 243                 if (CLKF_BASEPRI(frame)) {
 244                         /*
 245                          * Save the overhead of a software interrupt;
 246                          * it will happen as soon as we return, so do it now.
 247                          */
 248                         (void)splsoftclock();
 249                         softclock();
 250                 } else
 251                         setsoftclock();
 252         } else if (softticks + 1 == ticks)
 253                 ++softticks;
 254 }
 255
 256 /*
 257  * Compute number of ticks in the specified amount of time.
 258  */
 259 int
 260 tvtohz(tv)
 261         struct timeval *tv;
 262 {
 263         register unsigned long ticks;
 264         register long sec, usec;
 265
 266         /*
 267          * If the number of usecs in the whole seconds part of the time
 268          * difference fits in a long, then the total number of usecs will
 269          * fit in an unsigned long.  Compute the total and convert it to
 270          * ticks, rounding up and adding 1 to allow for the current tick
 271          * to expire.  Rounding also depends on unsigned long arithmetic
 272          * to avoid overflow.
 273          *
 274          * Otherwise, if the number of ticks in the whole seconds part of
 275          * the time difference fits in a long, then convert the parts to
 276          * ticks separately and add, using similar rounding methods and
 277          * overflow avoidance.  This method would work in the previous
 278          * case but it is slightly slower and assumes that hz is integral.
 279          *
 280          * Otherwise, round the time difference down to the maximum
 281          * representable value.
 282          *
 283          * If ints have 32 bits, then the maximum value for any timeout in
 284          * 10ms ticks is 248 days.
 285          */
 286         sec = tv->tv_sec;
 287         usec = tv->tv_usec;
 288         if (usec < 0) {
 289                 sec--;
 290                 usec += 1000000;
 291         }
 292         if (sec < 0) {
 293 #ifdef DIAGNOSTIC
 294                 if (usec > 0) {
 295                         sec++;
 296                         usec -= 1000000;
 297                 }
 298                 printf("tvotohz: negative time difference %ld sec %ld usec\n",
 299                        sec, usec);
 300 #endif
 301                 ticks = 1;
 302         } else if (sec <= LONG_MAX / 1000000)
 303                 ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
 304                         / tick + 1;
 305         else if (sec <= LONG_MAX / hz)
 306                 ticks = sec * hz
 307                         + ((unsigned long)usec + (tick - 1)) / tick + 1;
 308         else
 309                 ticks = LONG_MAX;
 310         if (ticks > INT_MAX)
 311                 ticks = INT_MAX;
 312         return ((int)ticks);
 313 }
 314
 315 /*
 316  * Start profiling on a process.
 317  *
 318  * Kernel profiling passes proc0 which never exits and hence
 319  * keeps the profile clock running constantly.
 320  */
 321 void
 322 startprofclock(p)
 323         register struct proc *p;
 324 {
 325         int s;
 326
 327         if ((p->p_flag & P_PROFIL) == 0) {
 328                 p->p_flag |= P_PROFIL;
 329                 if (++profprocs == 1 && stathz != 0) {
 330                         s = splstatclock();
 331                         psdiv = pscnt = psratio;
 332                         setstatclockrate(profhz);
 333                         splx(s);
 334                 }
 335         }
 336 }
 337
 338 /*
 339  * Stop profiling on a process.
 340  */
 341 void
 342 stopprofclock(p)
 343         register struct proc *p;
 344 {
 345         int s;
 346
 347         if (p->p_flag & P_PROFIL) {
 348                 p->p_flag &= ~P_PROFIL;
 349                 if (--profprocs == 0 && stathz != 0) {
 350                         s = splstatclock();
 351                         psdiv = pscnt = 1;
 352                         setstatclockrate(stathz);
 353                         splx(s);
 354                 }
 355         }
 356 }
 357
 358 /*
 359  * Statistics clock.  Grab profile sample, and if divider reaches 0,
 360  * do process and kernel statistics.
 361  */
 362 void
 363 statclock(frame)
 364         register struct clockframe *frame;
 365 {
 366 #ifdef GPROF
 367         register struct gmonparam *g;
 368         int i;
 369 #endif
 370         register struct proc *p;
 371         struct pstats *pstats;
 372         long rss;
 373         struct rusage *ru;
 374         struct vmspace *vm;
 375
 376         if (curproc != NULL && CLKF_USERMODE(frame)) {
 377                 p = curproc;
 378                 if (p->p_flag & P_PROFIL)
 379                         addupc_intr(p, CLKF_PC(frame), 1);
 380 #if defined(SMP) && defined(BETTER_CLOCK)
 381                 if (stathz != 0)
 382                         forward_statclock(pscnt);
 383 #endif
 384                 if (--pscnt > 0)
 385                         return;
 386                 /*
 387                  * Came from user mode; CPU was in user state.
 388                  * If this process is being profiled record the tick.
 389                  */
 390                 p->p_uticks++;
 391                 if (p->p_nice > NZERO)
 392                         cp_time[CP_NICE]++;
 393                 else
 394                         cp_time[CP_USER]++;
 395         } else {
 396 #ifdef GPROF
 397                 /*
 398                  * Kernel statistics are just like addupc_intr, only easier.
 399                  */
 400                 g = &_gmonparam;
 401                 if (g->state == GMON_PROF_ON) {
 402                         i = CLKF_PC(frame) - g->lowpc;
 403                         if (i < g->textsize) {
 404                                 i /= HISTFRACTION * sizeof(*g->kcount);
 405                                 g->kcount[i]++;
 406                         }
 407                 }
 408 #endif
 409 #if defined(SMP) && defined(BETTER_CLOCK)
 410                 if (stathz != 0)
 411                         forward_statclock(pscnt);
 412 #endif
 413                 if (--pscnt > 0)
 414                         return;
 415                 /*
 416                  * Came from kernel mode, so we were:
 417                  * - handling an interrupt,
 418                  * - doing syscall or trap work on behalf of the current
 419                  *   user process, or
 420                  * - spinning in the idle loop.
 421                  * Whichever it is, charge the time as appropriate.
 422                  * Note that we charge interrupts to the current process,
 423                  * regardless of whether they are ``for'' that process,
 424                  * so that we know how much of its real time was spent
 425                  * in ``non-process'' (i.e., interrupt) work.
 426                  */
 427                 p = curproc;
 428                 if (CLKF_INTR(frame)) {
 429                         if (p != NULL)
 430                                 p->p_iticks++;
 431                         cp_time[CP_INTR]++;
 432                 } else if (p != NULL) {
 433                         p->p_sticks++;
 434                         cp_time[CP_SYS]++;
 435                 } else
 436                         cp_time[CP_IDLE]++;
 437         }
 438         pscnt = psdiv;
 439
 440         /*
 441          * We maintain statistics shown by user-level statistics
 442          * programs:  the amount of time in each cpu state.
 443          */
 444
 445         /*
 446          * We adjust the priority of the current process.  The priority of
 447          * a process gets worse as it accumulates CPU time.  The cpu usage
 448          * estimator (p_estcpu) is increased here.  The formula for computing
 449          * priorities (in kern_synch.c) will compute a different value each
 450          * time p_estcpu increases by 4.  The cpu usage estimator ramps up
 451          * quite quickly when the process is running (linearly), and decays
 452          * away exponentially, at a rate which is proportionally slower when
 453          * the system is busy.  The basic principal is that the system will
 454          * 90% forget that the process used a lot of CPU time in 5 * loadav
 455          * seconds.  This causes the system to favor processes which haven't
 456          * run much recently, and to round-robin among other processes.
 457          */
 458         if (p != NULL) {
 459                 p->p_cpticks++;
 460                 if (++p->p_estcpu == 0)
 461                         p->p_estcpu--;
 462                 if ((p->p_estcpu & 3) == 0) {
 463                         resetpriority(p);
 464                         if (p->p_priority >= PUSER)
 465                                 p->p_priority = p->p_usrpri;
 466                 }
 467
 468                 /* Update resource usage integrals and maximums. */
 469                 if ((pstats = p->p_stats) != NULL &&
 470                     (ru = &pstats->p_ru) != NULL &&
 471                     (vm = p->p_vmspace) != NULL) {
 472                         ru->ru_ixrss += pgtok(vm->vm_tsize);
 473                         ru->ru_idrss += pgtok(vm->vm_dsize);
 474                         ru->ru_isrss += pgtok(vm->vm_ssize);
 475                         rss = pgtok(vmspace_resident_count(vm));
 476                         if (ru->ru_maxrss < rss)
 477                                 ru->ru_maxrss = rss;
 478                 }
 479         }
 480 }
 481
 482 /*
 483  * Return information about system clocks.
 484  */
 485 static int
 486 sysctl_kern_clockrate SYSCTL_HANDLER_ARGS
 487 {
 488         struct clockinfo clkinfo;
 489         /*
 490          * Construct clockinfo structure.
 491          */
 492         clkinfo.hz = hz;
 493         clkinfo.tick = tick;
 494         clkinfo.tickadj = tickadj;
 495         clkinfo.profhz = profhz;
 496         clkinfo.stathz = stathz ? stathz : hz;
 497         return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
 498 }
 499
 500 SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
 501         0, 0, sysctl_kern_clockrate, "S,clockinfo","");
 502
 503 static __inline unsigned
 504 tco_delta(struct timecounter *tc)
 505 {
 506
 507         return ((tc->tc_get_timecount(tc) - tc->tc_offset_count) &
 508             tc->tc_counter_mask);
 509 }
 510
 511 /*
 512  * We have four functions for looking at the clock, two for microseconds
 513  * and two for nanoseconds.  For each there is fast but less precise
 514  * version "get{nano|micro}time" which will return a time which is up
 515  * to 1/HZ previous to the call, whereas the raw version "{nano|micro}time"
 516  * will return a timestamp which is as precise as possible.
 517  */
 518
 519 void
 520 getmicrotime(struct timeval *tvp)
 521 {
 522         struct timecounter *tc;
 523
 524         if (!tco_method) {
 525                 tc = timecounter;
 526                 *tvp = tc->tc_microtime;
 527         } else {
 528                 microtime(tvp);
 529         }
 530 }
 531
 532 void
 533 getnanotime(struct timespec *tsp)
 534 {
 535         struct timecounter *tc;
 536
 537         if (!tco_method) {
 538                 tc = timecounter;
 539                 *tsp = tc->tc_nanotime;
 540         } else {
 541                 nanotime(tsp);
 542         }
 543 }
 544
 545 void
 546 microtime(struct timeval *tv)
 547 {
 548         struct timecounter *tc;
 549
 550         tc = (struct timecounter *)timecounter;
 551         tv->tv_sec = tc->tc_offset_sec;
 552         tv->tv_usec = tc->tc_offset_micro;
 553         tv->tv_usec += ((u_int64_t)tco_delta(tc) * tc->tc_scale_micro) >> 32;
 554         tv->tv_usec += boottime.tv_usec;
 555         tv->tv_sec += boottime.tv_sec;
 556         while (tv->tv_usec >= 1000000) {
 557                 tv->tv_usec -= 1000000;
 558                 tv->tv_sec++;
 559         }
 560 }
 561
 562 void
 563 nanotime(struct timespec *ts)
 564 {
 565         unsigned count;
 566         u_int64_t delta;
 567         struct timecounter *tc;
 568
 569         tc = (struct timecounter *)timecounter;
 570         ts->tv_sec = tc->tc_offset_sec;
 571         count = tco_delta(tc);
 572         delta = tc->tc_offset_nano;
 573         delta += ((u_int64_t)count * tc->tc_scale_nano_f);
 574         delta >>= 32;
 575         delta += ((u_int64_t)count * tc->tc_scale_nano_i);
 576         delta += boottime.tv_usec * 1000;
 577         ts->tv_sec += boottime.tv_sec;
 578         while (delta >= 1000000000) {
 579                 delta -= 1000000000;
 580                 ts->tv_sec++;
 581         }
 582         ts->tv_nsec = delta;
 583 }
 584
 585 void
 586 timecounter_timespec(unsigned count, struct timespec *ts)
 587 {
 588         u_int64_t delta;
 589         struct timecounter *tc;
 590
 591         tc = (struct timecounter *)timecounter;
 592         ts->tv_sec = tc->tc_offset_sec;
 593         count -= tc->tc_offset_count;
 594         count &= tc->tc_counter_mask;
 595         delta = tc->tc_offset_nano;
 596         delta += ((u_int64_t)count * tc->tc_scale_nano_f);
 597         delta >>= 32;
 598         delta += ((u_int64_t)count * tc->tc_scale_nano_i);
 599         delta += boottime.tv_usec * 1000;
 600         ts->tv_sec += boottime.tv_sec;
 601         while (delta >= 1000000000) {
 602                 delta -= 1000000000;
 603                 ts->tv_sec++;
 604         }
 605         ts->tv_nsec = delta;
 606 }
 607
 608 void
 609 getmicrouptime(struct timeval *tvp)
 610 {
 611         struct timecounter *tc;
 612
 613         if (!tco_method) {
 614                 tc = timecounter;
 615                 tvp->tv_sec = tc->tc_offset_sec;
 616                 tvp->tv_usec = tc->tc_offset_micro;
 617         } else {
 618                 microuptime(tvp);
 619         }
 620 }
 621
 622 void
 623 getnanouptime(struct timespec *tsp)
 624 {
 625         struct timecounter *tc;
 626
 627         if (!tco_method) {
 628                 tc = timecounter;
 629                 tsp->tv_sec = tc->tc_offset_sec;
 630                 tsp->tv_nsec = tc->tc_offset_nano >> 32;
 631         } else {
 632                 nanouptime(tsp);
 633         }
 634 }
 635
 636 void
 637 microuptime(struct timeval *tv)
 638 {
 639         struct timecounter *tc;
 640
 641         tc = (struct timecounter *)timecounter;
 642         tv->tv_sec = tc->tc_offset_sec;
 643         tv->tv_usec = tc->tc_offset_micro;
 644         tv->tv_usec += ((u_int64_t)tco_delta(tc) * tc->tc_scale_micro) >> 32;
 645         if (tv->tv_usec >= 1000000) {
 646                 tv->tv_usec -= 1000000;
 647                 tv->tv_sec++;
 648         }
 649 }
 650
 651 void
 652 nanouptime(struct timespec *ts)
 653 {
 654         unsigned count;
 655         u_int64_t delta;
 656         struct timecounter *tc;
 657
 658         tc = (struct timecounter *)timecounter;
 659         ts->tv_sec = tc->tc_offset_sec;
 660         count = tco_delta(tc);
 661         delta = tc->tc_offset_nano;
 662         delta += ((u_int64_t)count * tc->tc_scale_nano_f);
 663         delta >>= 32;
 664         delta += ((u_int64_t)count * tc->tc_scale_nano_i);
 665         if (delta >= 1000000000) {
 666                 delta -= 1000000000;
 667                 ts->tv_sec++;
 668         }
 669         ts->tv_nsec = delta;
 670 }
 671
 672 static void
 673 tco_setscales(struct timecounter *tc)
 674 {
 675         u_int64_t scale;
 676
 677         scale = 1000000000LL << 32;
 678         if (tc->tc_adjustment > 0)
 679                 scale += (tc->tc_adjustment * 1000LL) << 10;
 680         else
 681                 scale -= (-tc->tc_adjustment * 1000LL) << 10;
 682         scale /= tc->tc_frequency;
 683         tc->tc_scale_micro = scale / 1000;
 684         tc->tc_scale_nano_f = scale & 0xffffffff;
 685         tc->tc_scale_nano_i = scale >> 32;
 686 }
 687
 688 void
 689 init_timecounter(struct timecounter *tc)
 690 {
 691         struct timespec ts1;
 692         struct timecounter *t1, *t2, *t3;
 693         int i;
 694
 695         tc->tc_adjustment = 0;
 696         tco_setscales(tc);
 697         tc->tc_offset_count = tc->tc_get_timecount(tc);
 698         tc->tc_tweak = tc;
 699         MALLOC(t1, struct timecounter *, sizeof *t1, M_TIMECOUNTER, M_WAITOK);
 700         *t1 = *tc;
 701         t2 = t1;
 702         for (i = 1; i < NTIMECOUNTER; i++) {
 703                 MALLOC(t3, struct timecounter *, sizeof *t3,
 704                     M_TIMECOUNTER, M_WAITOK);
 705                 *t3 = *tc;
 706                 t3->tc_other = t2;
 707                 t2 = t3;
 708         }
 709         t1->tc_other = t3;
 710         tc = t1;
 711
 712         printf("Timecounter \"%s\"  frequency %lu Hz\n",
 713             tc->tc_name, (u_long)tc->tc_frequency);
 714
 715         /* XXX: For now always start using the counter. */
 716         tc->tc_offset_count = tc->tc_get_timecount(tc);
 717         nanouptime(&ts1);
 718         tc->tc_offset_nano = (u_int64_t)ts1.tv_nsec << 32;
 719         tc->tc_offset_micro = ts1.tv_nsec / 1000;
 720         tc->tc_offset_sec = ts1.tv_sec;
 721         timecounter = tc;
 722 }
 723
 724 void
 725 set_timecounter(struct timespec *ts)
 726 {
 727         struct timespec ts2;
 728
 729         nanouptime(&ts2);
 730         boottime.tv_sec = ts->tv_sec - ts2.tv_sec;
 731         boottime.tv_usec = (ts->tv_nsec - ts2.tv_nsec) / 1000;
 732         if (boottime.tv_usec < 0) {
 733                 boottime.tv_usec += 1000000;
 734                 boottime.tv_sec--;
 735         }
 736         /* fiddle all the little crinkly bits around the fiords... */
 737         tco_forward(1);
 738 }
 739
 740
 741 #if 0 /* Currently unused */
 742 void
 743 switch_timecounter(struct timecounter *newtc)
 744 {
 745         int s;
 746         struct timecounter *tc;
 747         struct timespec ts;
 748
 749         s = splclock();
 750         tc = timecounter;
 751         if (newtc == tc || newtc == tc->tc_other) {
 752                 splx(s);
 753                 return;
 754         }
 755         nanouptime(&ts);
 756         newtc->tc_offset_sec = ts.tv_sec;
 757         newtc->tc_offset_nano = (u_int64_t)ts.tv_nsec << 32;
 758         newtc->tc_offset_micro = ts.tv_nsec / 1000;
 759         newtc->tc_offset_count = newtc->tc_get_timecount(newtc);
 760         timecounter = newtc;
 761         splx(s);
 762 }
 763 #endif
 764
 765 static struct timecounter *
 766 sync_other_counter(void)
 767 {
 768         struct timecounter *tc, *tcn, *tco;
 769         unsigned delta;
 770
 771         tco = timecounter;
 772         tc = tco->tc_other;
 773         tcn = tc->tc_other;
 774         *tc = *tco;
 775         tc->tc_other = tcn;
 776         delta = tco_delta(tc);
 777         tc->tc_offset_count += delta;
 778         tc->tc_offset_count &= tc->tc_counter_mask;
 779         tc->tc_offset_nano += (u_int64_t)delta * tc->tc_scale_nano_f;
 780         tc->tc_offset_nano += (u_int64_t)delta * tc->tc_scale_nano_i << 32;
 781         return (tc);
 782 }
 783
 784 static void
 785 tco_forward(int force)
 786 {
 787         struct timecounter *tc, *tco;
 788
 789         tco = timecounter;
 790         tc = sync_other_counter();
 791         /*
 792          * We may be inducing a tiny error here, the tc_poll_pps() may
 793          * process a latched count which happens after the tco_delta()
 794          * in sync_other_counter(), which would extend the previous
 795          * counters parameters into the domain of this new one.
 796          * Since the timewindow is very small for this, the error is
 797          * going to be only a few weenieseconds (as Dave Mills would
 798          * say), so lets just not talk more about it, OK ?
 799          */
 800         if (tco->tc_poll_pps)
 801                 tco->tc_poll_pps(tco);
 802         if (timedelta != 0) {
 803                 tc->tc_offset_nano += (u_int64_t)(tickdelta * 1000) << 32;
 804                 timedelta -= tickdelta;
 805                 force++;
 806         }
 807
 808         while (tc->tc_offset_nano >= 1000000000ULL << 32) {
 809                 tc->tc_offset_nano -= 1000000000ULL << 32;
 810                 tc->tc_offset_sec++;
 811                 tc->tc_frequency = tc->tc_tweak->tc_frequency;
 812                 tc->tc_adjustment = tc->tc_tweak->tc_adjustment;
 813                 ntp_update_second(tc);  /* XXX only needed if xntpd runs */
 814                 tco_setscales(tc);
 815                 force++;
 816         }
 817
 818         if (tco_method && !force)
 819                 return;
 820
 821         tc->tc_offset_micro = (tc->tc_offset_nano / 1000) >> 32;
 822
 823         /* Figure out the wall-clock time */
 824         tc->tc_nanotime.tv_sec = tc->tc_offset_sec + boottime.tv_sec;
 825         tc->tc_nanotime.tv_nsec =
 826             (tc->tc_offset_nano >> 32) + boottime.tv_usec * 1000;
 827         tc->tc_microtime.tv_usec = tc->tc_offset_micro + boottime.tv_usec;
 828         if (tc->tc_nanotime.tv_nsec >= 1000000000) {
 829                 tc->tc_nanotime.tv_nsec -= 1000000000;
 830                 tc->tc_microtime.tv_usec -= 1000000;
 831                 tc->tc_nanotime.tv_sec++;
 832         }
 833         time_second = tc->tc_microtime.tv_sec = tc->tc_nanotime.tv_sec;
 834
 835         timecounter = tc;
 836 }
 837
 838 static int
 839 sysctl_kern_timecounter_frequency SYSCTL_HANDLER_ARGS
 840 {
 841
 842         return (sysctl_handle_opaque(oidp,
 843             &timecounter->tc_tweak->tc_frequency,
 844             sizeof(timecounter->tc_tweak->tc_frequency), req));
 845 }
 846
 847 static int
 848 sysctl_kern_timecounter_adjustment SYSCTL_HANDLER_ARGS
 849 {
 850
 851         return (sysctl_handle_opaque(oidp,
 852             &timecounter->tc_tweak->tc_adjustment,
 853             sizeof(timecounter->tc_tweak->tc_adjustment), req));
 854 }
 855
 856 SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");
 857
 858 SYSCTL_INT(_kern_timecounter, KERN_ARGMAX, method, CTLFLAG_RW, &tco_method, 0,
 859     "This variable determines the method used for updating timecounters. "
 860     "If the default algorithm (0) fails with \"calcru negative...\" messages "
 861     "try the alternate algorithm (1) which handles bad hardware better."
 862
 863 );
 864
 865 SYSCTL_PROC(_kern_timecounter, OID_AUTO, frequency, CTLTYPE_INT | CTLFLAG_RW,
 866     0, sizeof(u_int), sysctl_kern_timecounter_frequency, "I", "");
 867
 868 SYSCTL_PROC(_kern_timecounter, OID_AUTO, adjustment, CTLTYPE_INT | CTLFLAG_RW,
 869     0, sizeof(int), sysctl_kern_timecounter_adjustment, "I", "");