sys/kern/subr_smp.c

   1 /*-
   2  * Copyright (c) 2001, John Baldwin <jhb@FreeBSD.org>.
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26
  27 /*
  28  * This module holds the global variables and machine independent functions
  29  * used for the kernel SMP support.
  30  */
  31
  32 #include <sys/cdefs.h>
  33 __FBSDID("$FreeBSD$");
  34
  35 #include <sys/param.h>
  36 #include <sys/systm.h>
  37 #include <sys/kernel.h>
  38 #include <sys/ktr.h>
  39 #include <sys/proc.h>
  40 #include <sys/bus.h>
  41 #include <sys/lock.h>
  42 #include <sys/malloc.h>
  43 #include <sys/mutex.h>
  44 #include <sys/pcpu.h>
  45 #include <sys/sched.h>
  46 #include <sys/smp.h>
  47 #include <sys/sysctl.h>
  48
  49 #include <machine/cpu.h>
  50 #include <machine/smp.h>
  51
  52 #include "opt_sched.h"
  53
#ifdef SMP
/* Malloc type used for the topo_node allocations made in this file. */
MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data");

/*
 * CPU sets used by the stop/restart protocol in this file:
 * generic_stop_cpus() spins until the requested CPUs show up in
 * stopped_cpus (or suspended_cpus for IPI_SUSPEND), and
 * generic_restart_cpus() publishes the CPUs to release in started_cpus
 * and then spins until they have left the stopped/suspended set.
 */
volatile cpuset_t stopped_cpus;
volatile cpuset_t started_cpus;
volatile cpuset_t suspended_cpus;
/*
 * NOTE(review): the two masks below are not referenced in the visible
 * part of this file; presumably they are maintained by MD code.
 */
cpuset_t hlt_cpus_mask;
cpuset_t logical_cpus_mask;

/*
 * NOTE(review): this hook is not called in the visible part of this
 * file; presumably it is run by the MD stop handler on restart.
 */
void (*cpustop_restartfunc)(void);
#endif
  65
/* Handler for kern.smp.active, defined near the end of the file. */
static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS);

/* This is used in modules that need to work in both SMP and UP. */
cpuset_t all_cpus;

/* Number of CPUs; forced to 1 when SMP is disabled or not probed. */
int mp_ncpus;
/* export this for libkvm consumers. */
int mp_maxcpus = MAXCPU;

/*
 * Non-zero once SMP is running; the IPI based services in this file
 * check it before signalling other CPUs.
 */
volatile int smp_started;
/* Highest CPU ID; loops in this file iterate over 0..mp_maxid. */
u_int mp_maxid;

/* Root of the read-only kern.smp sysctl tree. */
static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL,
    "Kernel SMP");

SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0,
    "Max CPU ID.");

SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus,
    0, "Max number of CPUs that the system was compiled for.");

/* smp_started is volatile, so it is exported through a handler. */
SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD|CTLTYPE_INT|CTLFLAG_MPSAFE,
    NULL, 0, sysctl_kern_smp_active, "I",
    "Indicates system is running in SMP mode");

int smp_disabled = 0;   /* has smp been disabled? */
SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
    &smp_disabled, 0, "SMP has been disabled from the loader");

int smp_cpus = 1;       /* how many cpu's running */
SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0,
    "Number of CPUs online");

/* Selects one of the fake debugging topologies built by smp_topo(). */
int smp_topology = 0;   /* Which topology we're using. */
SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0,
    "Topology override setting; 0 is default provided by hardware.");
 102
#ifdef SMP
/* Enable forwarding of a signal to a process running on a different CPU */
static int forward_signal_enabled = 1;
SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
           &forward_signal_enabled, 0,
           "Forwarding of a signal to a process on a different CPU");

/*
 * Variables needed for SMP rendezvous.
 *
 * They are filled in by smp_rendezvous_cpus() while it holds
 * smp_ipi_mtx and are read by every participant in
 * smp_rendezvous_action().  The smp_rv_waiters[] slots count the CPUs
 * that have reached each stage:
 *   [0] entered the rendezvous
 *   [1] finished the setup function
 *   [2] finished the action function
 *   [3] finished the whole rendezvous
 */
static volatile int smp_rv_ncpus;
static void (*volatile smp_rv_setup_func)(void *arg);
static void (*volatile smp_rv_action_func)(void *arg);
static void (*volatile smp_rv_teardown_func)(void *arg);
static void *volatile smp_rv_func_arg;
static volatile int smp_rv_waiters[4];

/*
 * Shared mutex to restrict busywaits between smp_rendezvous() and
 * smp(_targeted)_tlb_shootdown().  A deadlock occurs if both of these
 * functions trigger at once and cause multiple CPUs to busywait with
 * interrupts disabled.
 */
struct mtx smp_ipi_mtx;
 125
 126 /*
 127  * Let the MD SMP code initialize mp_maxid very early if it can.
 128  */
 129 static void
 130 mp_setmaxid(void *dummy)
 131 {
 132
 133         cpu_mp_setmaxid();
 134
 135         KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__));
 136         KASSERT(mp_ncpus > 1 || mp_maxid == 0,
 137             ("%s: one CPU but mp_maxid is not zero", __func__));
 138         KASSERT(mp_maxid >= mp_ncpus - 1,
 139             ("%s: counters out of sync: max %d, count %d", __func__,
 140                 mp_maxid, mp_ncpus));
 141 }
 142 SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL);
 143
 144 /*
 145  * Call the MD SMP initialization code.
 146  */
 147 static void
 148 mp_start(void *dummy)
 149 {
 150
 151         mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);
 152
 153         /* Probe for MP hardware. */
 154         if (smp_disabled != 0 || cpu_mp_probe() == 0) {
 155                 mp_ncpus = 1;
 156                 CPU_SETOF(PCPU_GET(cpuid), &all_cpus);
 157                 return;
 158         }
 159
 160         cpu_mp_start();
 161         printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
 162             mp_ncpus);
 163         cpu_mp_announce();
 164 }
 165 SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL);
 166
 167 void
 168 forward_signal(struct thread *td)
 169 {
 170         int id;
 171
 172         /*
 173          * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on
 174          * this thread, so all we need to do is poke it if it is currently
 175          * executing so that it executes ast().
 176          */
 177         THREAD_LOCK_ASSERT(td, MA_OWNED);
 178         KASSERT(TD_IS_RUNNING(td),
 179             ("forward_signal: thread is not TDS_RUNNING"));
 180
 181         CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);
 182
 183         if (!smp_started || cold || panicstr)
 184                 return;
 185         if (!forward_signal_enabled)
 186                 return;
 187
 188         /* No need to IPI ourself. */
 189         if (td == curthread)
 190                 return;
 191
 192         id = td->td_oncpu;
 193         if (id == NOCPU)
 194                 return;
 195         ipi_cpu(id, IPI_AST);
 196 }
 197
/*
 * When called the executing CPU will send an IPI to the CPUs in 'map'
 * requesting that they halt execution.
 *
 * Usually (but not necessarily) called with 'other_cpus' as its arg.
 *
 *  - Signals all CPUs in map to stop.
 *  - Waits for each to stop, giving up after a bounded number of spins.
 *
 * 'type' selects the IPI to send: IPI_STOP, IPI_STOP_HARD or, on x86
 * only, IPI_SUSPEND.
 *
 * Returns:
 *   0: SMP has not been started, nothing was done
 *   1: the request was processed (also returned when the wait timed out)
 *
 */
#if defined(__amd64__) || defined(__i386__)
#define X86     1
#else
#define X86     0
#endif
static int
generic_stop_cpus(cpuset_t map, u_int type)
{
#ifdef KTR
        char cpusetbuf[CPUSETBUFSIZ];
#endif
        /* ID of the CPU currently running a stop request, or NOCPU. */
        static volatile u_int stopping_cpu = NOCPU;
        int i;
        volatile cpuset_t *cpus;

        KASSERT(
            type == IPI_STOP || type == IPI_STOP_HARD
#if X86
            || type == IPI_SUSPEND
#endif
            , ("%s: invalid stop type", __func__));

        if (!smp_started)
                return (0);

        CTR2(KTR_SMP, "stop_cpus(%s) with %u type",
            cpusetobj_strprint(cpusetbuf, &map), type);

#if X86
        /*
         * When suspending, ensure there are no IPIs in progress.
         * IPIs that have been issued, but not yet delivered (e.g.
         * not pending on a vCPU when running under virtualization)
         * will be lost, violating FreeBSD's assumption of reliable
         * IPI delivery.
         */
        if (type == IPI_SUSPEND)
                mtx_lock_spin(&smp_ipi_mtx);
#endif

#if X86
        /*
         * NOTE(review): when nmi_is_broadcast is set and nmi_kdb_lock is
         * non-zero, both the stopping_cpu handshake and the IPI are
         * skipped; presumably the other CPUs were already stopped by a
         * broadcast NMI -- confirm against the MD code.
         */
        if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
#endif
        /*
         * Become the stopping CPU unless this CPU already is: try to
         * swing stopping_cpu from NOCPU to our ID and spin while another
         * CPU owns it.
         */
        if (stopping_cpu != PCPU_GET(cpuid))
                while (atomic_cmpset_int(&stopping_cpu, NOCPU,
                    PCPU_GET(cpuid)) == 0)
                        while (stopping_cpu != NOCPU)
                                cpu_spinwait(); /* spin */

        /* send the stop IPI to all CPUs in map */
        ipi_selected(map, type);
#if X86
        }
#endif

        /* Pick the set in which the targets acknowledge this IPI type. */
#if X86
        if (type == IPI_SUSPEND)
                cpus = &suspended_cpus;
        else
#endif
                cpus = &stopped_cpus;

        /* Wait until every CPU in map is in that set, or give up. */
        i = 0;
        while (!CPU_SUBSET(cpus, &map)) {
                /* spin */
                cpu_spinwait();
                i++;
                if (i == 100000000) {
                        printf("timeout stopping cpus\n");
                        break;
                }
        }

#if X86
        if (type == IPI_SUSPEND)
                mtx_unlock_spin(&smp_ipi_mtx);
#endif

        /* Let the next CPU issue a stop request. */
        stopping_cpu = NOCPU;
        return (1);
}
 294
/*
 * Stop the CPUs in 'map' using IPI_STOP.  Returns whatever
 * generic_stop_cpus() returns: 0 if SMP is not started, 1 otherwise.
 */
int
stop_cpus(cpuset_t map)
{

        return (generic_stop_cpus(map, IPI_STOP));
}
 301
/*
 * Stop the CPUs in 'map' using IPI_STOP_HARD.  Returns whatever
 * generic_stop_cpus() returns: 0 if SMP is not started, 1 otherwise.
 */
int
stop_cpus_hard(cpuset_t map)
{

        return (generic_stop_cpus(map, IPI_STOP_HARD));
}
 308
#if X86
/*
 * x86 only: suspend the CPUs in 'map' using IPI_SUSPEND; completion is
 * tracked in suspended_cpus rather than stopped_cpus.  Returns whatever
 * generic_stop_cpus() returns: 0 if SMP is not started, 1 otherwise.
 */
int
suspend_cpus(cpuset_t map)
{

        return (generic_stop_cpus(map, IPI_SUSPEND));
}
#endif
 317
/*
 * Called by a CPU to restart stopped CPUs.
 *
 * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
 *
 *  - Signals all CPUs in map to restart.
 *  - Waits for each to restart.
 *
 * 'type' names the IPI that was used to stop the CPUs (IPI_STOP,
 * IPI_STOP_HARD or, on x86 only, IPI_SUSPEND); it selects the set that
 * is watched for the CPUs to leave.
 *
 * Returns:
 *   0: SMP has not been started, nothing was done
 *   1: ok
 */
static int
generic_restart_cpus(cpuset_t map, u_int type)
{
#ifdef KTR
        char cpusetbuf[CPUSETBUFSIZ];
#endif
        volatile cpuset_t *cpus;

        KASSERT(type == IPI_STOP || type == IPI_STOP_HARD
#if X86
            || type == IPI_SUSPEND
#endif
            , ("%s: invalid stop type", __func__));

        if (!smp_started)
                return (0);

        CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));

        /* Pick the set the targets clear themselves from on restart. */
#if X86
        if (type == IPI_SUSPEND)
                cpus = &suspended_cpus;
        else
#endif
                cpus = &stopped_cpus;

        /* signal other cpus to restart */
        CPU_COPY_STORE_REL(&map, &started_cpus);

#if X86
        /*
         * NOTE(review): the wait is skipped when nmi_is_broadcast is set
         * and nmi_kdb_lock is non-zero, mirroring generic_stop_cpus();
         * confirm the reason against the MD code.
         */
        if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
#endif
        /* wait for each to clear its bit */
        while (CPU_OVERLAP(cpus, &map))
                cpu_spinwait();
#if X86
        }
#endif

        return (1);
}
 372
/*
 * Restart CPUs in 'map' that were stopped with IPI_STOP.  Returns
 * whatever generic_restart_cpus() returns: 0 if SMP is not started,
 * 1 otherwise.
 */
int
restart_cpus(cpuset_t map)
{

        return (generic_restart_cpus(map, IPI_STOP));
}
 379
#if X86
/*
 * x86 only: resume CPUs in 'map' that were suspended with IPI_SUSPEND.
 * Returns whatever generic_restart_cpus() returns: 0 if SMP is not
 * started, 1 otherwise.
 */
int
resume_cpus(cpuset_t map)
{

        return (generic_restart_cpus(map, IPI_SUSPEND));
}
#endif
/* X86 is a file-local helper macro; do not leak it past this point. */
#undef X86
 389
 390 /*
 391  * All-CPU rendezvous.  CPUs are signalled, all execute the setup function
 392  * (if specified), rendezvous, execute the action function (if specified),
 393  * rendezvous again, execute the teardown function (if specified), and then
 394  * resume.
 395  *
 396  * Note that the supplied external functions _must_ be reentrant and aware
 397  * that they are running in parallel and in an unknown lock context.
 398  */
 399 void
 400 smp_rendezvous_action(void)
 401 {
 402         struct thread *td;
 403         void *local_func_arg;
 404         void (*local_setup_func)(void*);
 405         void (*local_action_func)(void*);
 406         void (*local_teardown_func)(void*);
 407 #ifdef INVARIANTS
 408         int owepreempt;
 409 #endif
 410
 411         /* Ensure we have up-to-date values. */
 412         atomic_add_acq_int(&smp_rv_waiters[0], 1);
 413         while (smp_rv_waiters[0] < smp_rv_ncpus)
 414                 cpu_spinwait();
 415
 416         /* Fetch rendezvous parameters after acquire barrier. */
 417         local_func_arg = smp_rv_func_arg;
 418         local_setup_func = smp_rv_setup_func;
 419         local_action_func = smp_rv_action_func;
 420         local_teardown_func = smp_rv_teardown_func;
 421
 422         /*
 423          * Use a nested critical section to prevent any preemptions
 424          * from occurring during a rendezvous action routine.
 425          * Specifically, if a rendezvous handler is invoked via an IPI
 426          * and the interrupted thread was in the critical_exit()
 427          * function after setting td_critnest to 0 but before
 428          * performing a deferred preemption, this routine can be
 429          * invoked with td_critnest set to 0 and td_owepreempt true.
 430          * In that case, a critical_exit() during the rendezvous
 431          * action would trigger a preemption which is not permitted in
 432          * a rendezvous action.  To fix this, wrap all of the
 433          * rendezvous action handlers in a critical section.  We
 434          * cannot use a regular critical section however as having
 435          * critical_exit() preempt from this routine would also be
 436          * problematic (the preemption must not occur before the IPI
 437          * has been acknowledged via an EOI).  Instead, we
 438          * intentionally ignore td_owepreempt when leaving the
 439          * critical section.  This should be harmless because we do
 440          * not permit rendezvous action routines to schedule threads,
 441          * and thus td_owepreempt should never transition from 0 to 1
 442          * during this routine.
 443          */
 444         td = curthread;
 445         td->td_critnest++;
 446 #ifdef INVARIANTS
 447         owepreempt = td->td_owepreempt;
 448 #endif
 449
 450         /*
 451          * If requested, run a setup function before the main action
 452          * function.  Ensure all CPUs have completed the setup
 453          * function before moving on to the action function.
 454          */
 455         if (local_setup_func != smp_no_rendezvous_barrier) {
 456                 if (smp_rv_setup_func != NULL)
 457                         smp_rv_setup_func(smp_rv_func_arg);
 458                 atomic_add_int(&smp_rv_waiters[1], 1);
 459                 while (smp_rv_waiters[1] < smp_rv_ncpus)
 460                         cpu_spinwait();
 461         }
 462
 463         if (local_action_func != NULL)
 464                 local_action_func(local_func_arg);
 465
 466         if (local_teardown_func != smp_no_rendezvous_barrier) {
 467                 /*
 468                  * Signal that the main action has been completed.  If a
 469                  * full exit rendezvous is requested, then all CPUs will
 470                  * wait here until all CPUs have finished the main action.
 471                  */
 472                 atomic_add_int(&smp_rv_waiters[2], 1);
 473                 while (smp_rv_waiters[2] < smp_rv_ncpus)
 474                         cpu_spinwait();
 475
 476                 if (local_teardown_func != NULL)
 477                         local_teardown_func(local_func_arg);
 478         }
 479
 480         /*
 481          * Signal that the rendezvous is fully completed by this CPU.
 482          * This means that no member of smp_rv_* pseudo-structure will be
 483          * accessed by this target CPU after this point; in particular,
 484          * memory pointed by smp_rv_func_arg.
 485          *
 486          * The release semantic ensures that all accesses performed by
 487          * the current CPU are visible when smp_rendezvous_cpus()
 488          * returns, by synchronizing with the
 489          * atomic_load_acq_int(&smp_rv_waiters[3]).
 490          */
 491         atomic_add_rel_int(&smp_rv_waiters[3], 1);
 492
 493         td->td_critnest--;
 494         KASSERT(owepreempt == td->td_owepreempt,
 495             ("rendezvous action changed td_owepreempt"));
 496 }
 497
/*
 * Run a rendezvous on the CPUs in 'map': each of them executes
 * setup_func, action_func and teardown_func (any of which may be NULL)
 * with 'arg', as described for smp_rendezvous_action().  The call
 * returns only after every participating CPU has completed the
 * rendezvous.  If SMP has not been started, the three functions are
 * simply run on the current CPU.
 *
 * Panics if no CPU of the system is in 'map'.
 */
void
smp_rendezvous_cpus(cpuset_t map,
        void (* setup_func)(void *),
        void (* action_func)(void *),
        void (* teardown_func)(void *),
        void *arg)
{
        int curcpumap, i, ncpus = 0;

        /* See the comments in the !SMP case. */
        if (!smp_started) {
                spinlock_enter();
                if (setup_func != NULL)
                        setup_func(arg);
                if (action_func != NULL)
                        action_func(arg);
                if (teardown_func != NULL)
                        teardown_func(arg);
                spinlock_exit();
                return;
        }

        /* Count the participants; every barrier waits for this many. */
        CPU_FOREACH(i) {
                if (CPU_ISSET(i, &map))
                        ncpus++;
        }
        if (ncpus == 0)
                panic("ncpus is 0 with non-zero map");

        /* Only one rendezvous may use the smp_rv_* globals at a time. */
        mtx_lock_spin(&smp_ipi_mtx);

        /* Pass rendezvous parameters via global variables. */
        smp_rv_ncpus = ncpus;
        smp_rv_setup_func = setup_func;
        smp_rv_action_func = action_func;
        smp_rv_teardown_func = teardown_func;
        smp_rv_func_arg = arg;
        smp_rv_waiters[1] = 0;
        smp_rv_waiters[2] = 0;
        smp_rv_waiters[3] = 0;
        /*
         * The release store pairs with the acquire add on
         * smp_rv_waiters[0] in smp_rendezvous_action().
         */
        atomic_store_rel_int(&smp_rv_waiters[0], 0);

        /*
         * Signal other processors, which will enter the IPI with
         * interrupts off.
         */
        curcpumap = CPU_ISSET(curcpu, &map);
        CPU_CLR(curcpu, &map);
        ipi_selected(map, IPI_RENDEZVOUS);

        /* Check if the current CPU is in the map */
        if (curcpumap != 0)
                smp_rendezvous_action();

        /*
         * Ensure that the master CPU waits for all the other
         * CPUs to finish the rendezvous, so that smp_rv_*
         * pseudo-structure and the arg are guaranteed to not
         * be in use.
         *
         * Load acquire synchronizes with the release add in
         * smp_rendezvous_action(), which ensures that our caller sees
         * all memory actions done by the called functions on other
         * CPUs.
         */
        while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
                cpu_spinwait();

        mtx_unlock_spin(&smp_ipi_mtx);
}
 568
/*
 * Rendezvous on every CPU in all_cpus; see smp_rendezvous_cpus() for
 * the meaning of the arguments.
 */
void
smp_rendezvous(void (* setup_func)(void *),
               void (* action_func)(void *),
               void (* teardown_func)(void *),
               void *arg)
{
        smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
}
 577
/*
 * Static pool of topology groups.  smp_topo_alloc() hands out slices of
 * it, and the fake topology builders below (smp_topo_none(),
 * smp_topo_1level(), smp_topo_2level()) index it directly, with
 * group[0] as the root.
 */
static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1];
 579
 580 struct cpu_group *
 581 smp_topo(void)
 582 {
 583         char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
 584         struct cpu_group *top;
 585
 586         /*
 587          * Check for a fake topology request for debugging purposes.
 588          */
 589         switch (smp_topology) {
 590         case 1:
 591                 /* Dual core with no sharing.  */
 592                 top = smp_topo_1level(CG_SHARE_NONE, 2, 0);
 593                 break;
 594         case 2:
 595                 /* No topology, all cpus are equal. */
 596                 top = smp_topo_none();
 597                 break;
 598         case 3:
 599                 /* Dual core with shared L2.  */
 600                 top = smp_topo_1level(CG_SHARE_L2, 2, 0);
 601                 break;
 602         case 4:
 603                 /* quad core, shared l3 among each package, private l2.  */
 604                 top = smp_topo_1level(CG_SHARE_L3, 4, 0);
 605                 break;
 606         case 5:
 607                 /* quad core,  2 dualcore parts on each package share l2.  */
 608                 top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0);
 609                 break;
 610         case 6:
 611                 /* Single-core 2xHTT */
 612                 top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
 613                 break;
 614         case 7:
 615                 /* quad core with a shared l3, 8 threads sharing L2.  */
 616                 top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
 617                     CG_FLAG_SMT);
 618                 break;
 619         default:
 620                 /* Default, ask the system what it wants. */
 621                 top = cpu_topo();
 622                 break;
 623         }
 624         /*
 625          * Verify the returned topology.
 626          */
 627         if (top->cg_count != mp_ncpus)
 628                 panic("Built bad topology at %p.  CPU count %d != %d",
 629                     top, top->cg_count, mp_ncpus);
 630         if (CPU_CMP(&top->cg_mask, &all_cpus))
 631                 panic("Built bad topology at %p.  CPU mask (%s) != (%s)",
 632                     top, cpusetobj_strprint(cpusetbuf, &top->cg_mask),
 633                     cpusetobj_strprint(cpusetbuf2, &all_cpus));
 634
 635         /*
 636          * Collapse nonsense levels that may be created out of convenience by
 637          * the MD layers.  They cause extra work in the search functions.
 638          */
 639         while (top->cg_children == 1) {
 640                 top = &top->cg_child[0];
 641                 top->cg_parent = NULL;
 642         }
 643         return (top);
 644 }
 645
 646 struct cpu_group *
 647 smp_topo_alloc(u_int count)
 648 {
 649         static u_int index;
 650         u_int curr;
 651
 652         curr = index;
 653         index += count;
 654         return (&group[curr]);
 655 }
 656
 657 struct cpu_group *
 658 smp_topo_none(void)
 659 {
 660         struct cpu_group *top;
 661
 662         top = &group[0];
 663         top->cg_parent = NULL;
 664         top->cg_child = NULL;
 665         top->cg_mask = all_cpus;
 666         top->cg_count = mp_ncpus;
 667         top->cg_children = 0;
 668         top->cg_level = CG_SHARE_NONE;
 669         top->cg_flags = 0;
 670
 671         return (top);
 672 }
 673
 674 static int
 675 smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share,
 676     int count, int flags, int start)
 677 {
 678         char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
 679         cpuset_t mask;
 680         int i;
 681
 682         CPU_ZERO(&mask);
 683         for (i = 0; i < count; i++, start++)
 684                 CPU_SET(start, &mask);
 685         child->cg_parent = parent;
 686         child->cg_child = NULL;
 687         child->cg_children = 0;
 688         child->cg_level = share;
 689         child->cg_count = count;
 690         child->cg_flags = flags;
 691         child->cg_mask = mask;
 692         parent->cg_children++;
 693         for (; parent != NULL; parent = parent->cg_parent) {
 694                 if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask))
 695                         panic("Duplicate children in %p.  mask (%s) child (%s)",
 696                             parent,
 697                             cpusetobj_strprint(cpusetbuf, &parent->cg_mask),
 698                             cpusetobj_strprint(cpusetbuf2, &child->cg_mask));
 699                 CPU_OR(&parent->cg_mask, &child->cg_mask);
 700                 parent->cg_count += child->cg_count;
 701         }
 702
 703         return (start);
 704 }
 705
 706 struct cpu_group *
 707 smp_topo_1level(int share, int count, int flags)
 708 {
 709         struct cpu_group *child;
 710         struct cpu_group *top;
 711         int packages;
 712         int cpu;
 713         int i;
 714
 715         cpu = 0;
 716         top = &group[0];
 717         packages = mp_ncpus / count;
 718         top->cg_child = child = &group[1];
 719         top->cg_level = CG_SHARE_NONE;
 720         for (i = 0; i < packages; i++, child++)
 721                 cpu = smp_topo_addleaf(top, child, share, count, flags, cpu);
 722         return (top);
 723 }
 724
 725 struct cpu_group *
 726 smp_topo_2level(int l2share, int l2count, int l1share, int l1count,
 727     int l1flags)
 728 {
 729         struct cpu_group *top;
 730         struct cpu_group *l1g;
 731         struct cpu_group *l2g;
 732         int cpu;
 733         int i;
 734         int j;
 735
 736         cpu = 0;
 737         top = &group[0];
 738         l2g = &group[1];
 739         top->cg_child = l2g;
 740         top->cg_level = CG_SHARE_NONE;
 741         top->cg_children = mp_ncpus / (l2count * l1count);
 742         l1g = l2g + top->cg_children;
 743         for (i = 0; i < top->cg_children; i++, l2g++) {
 744                 l2g->cg_parent = top;
 745                 l2g->cg_child = l1g;
 746                 l2g->cg_level = l2share;
 747                 for (j = 0; j < l2count; j++, l1g++)
 748                         cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count,
 749                             l1flags, cpu);
 750         }
 751         return (top);
 752 }
 753
 754
 755 struct cpu_group *
 756 smp_topo_find(struct cpu_group *top, int cpu)
 757 {
 758         struct cpu_group *cg;
 759         cpuset_t mask;
 760         int children;
 761         int i;
 762
 763         CPU_SETOF(cpu, &mask);
 764         cg = top;
 765         for (;;) {
 766                 if (!CPU_OVERLAP(&cg->cg_mask, &mask))
 767                         return (NULL);
 768                 if (cg->cg_children == 0)
 769                         return (cg);
 770                 children = cg->cg_children;
 771                 for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
 772                         if (CPU_OVERLAP(&cg->cg_mask, &mask))
 773                                 break;
 774         }
 775         return (NULL);
 776 }
 777 #else /* !SMP */
 778
 779 void
 780 smp_rendezvous_cpus(cpuset_t map,
 781         void (*setup_func)(void *),
 782         void (*action_func)(void *),
 783         void (*teardown_func)(void *),
 784         void *arg)
 785 {
 786         /*
 787          * In the !SMP case we just need to ensure the same initial conditions
 788          * as the SMP case.
 789          */
 790         spinlock_enter();
 791         if (setup_func != NULL)
 792                 setup_func(arg);
 793         if (action_func != NULL)
 794                 action_func(arg);
 795         if (teardown_func != NULL)
 796                 teardown_func(arg);
 797         spinlock_exit();
 798 }
 799
/*
 * Uniprocessor version: forwards to smp_rendezvous_cpus() with
 * all_cpus, which runs the three functions on the current CPU.
 */
void
smp_rendezvous(void (*setup_func)(void *),
               void (*action_func)(void *),
               void (*teardown_func)(void *),
               void *arg)
{

        smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func,
            arg);
}
 810
 811 /*
 812  * Provide dummy SMP support for UP kernels.  Modules that need to use SMP
 813  * APIs will still work using this dummy support.
 814  */
 815 static void
 816 mp_setvariables_for_up(void *dummy)
 817 {
 818         mp_ncpus = 1;
 819         mp_maxid = PCPU_GET(cpuid);
 820         CPU_SETOF(mp_maxid, &all_cpus);
 821         KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero"));
 822 }
 823 SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST,
 824     mp_setvariables_for_up, NULL);
 825 #endif /* SMP */
 826
/*
 * Marker function for the setup and teardown arguments of the
 * rendezvous functions.  smp_rendezvous_action() compares those
 * arguments against the address of this function and, on a match, skips
 * the corresponding barrier without calling it.  It is only ever called
 * on the paths that run the functions directly on the current CPU,
 * which in SMP kernels is taken only while SMP is not started; hence
 * the assertion.  The 'dummy' argument is unused.
 */
void
smp_no_rendezvous_barrier(void *dummy)
{
#ifdef SMP
        KASSERT((!smp_started),("smp_no_rendezvous called and smp is started"));
#endif
}
 834
 835 /*
 836  * Wait for specified idle threads to switch once.  This ensures that even
 837  * preempted threads have cycled through the switch function once,
 838  * exiting their codepaths.  This allows us to change global pointers
 839  * with no other synchronization.
 840  */
 841 int
 842 quiesce_cpus(cpuset_t map, const char *wmesg, int prio)
 843 {
 844         struct pcpu *pcpu;
 845         u_int gen[MAXCPU];
 846         int error;
 847         int cpu;
 848
 849         error = 0;
 850         for (cpu = 0; cpu <= mp_maxid; cpu++) {
 851                 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
 852                         continue;
 853                 pcpu = pcpu_find(cpu);
 854                 gen[cpu] = pcpu->pc_idlethread->td_generation;
 855         }
 856         for (cpu = 0; cpu <= mp_maxid; cpu++) {
 857                 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
 858                         continue;
 859                 pcpu = pcpu_find(cpu);
 860                 thread_lock(curthread);
 861                 sched_bind(curthread, cpu);
 862                 thread_unlock(curthread);
 863                 while (gen[cpu] == pcpu->pc_idlethread->td_generation) {
 864                         error = tsleep(quiesce_cpus, prio, wmesg, 1);
 865                         if (error != EWOULDBLOCK)
 866                                 goto out;
 867                         error = 0;
 868                 }
 869         }
 870 out:
 871         thread_lock(curthread);
 872         sched_unbind(curthread);
 873         thread_unlock(curthread);
 874
 875         return (error);
 876 }
 877
/*
 * Convenience wrapper: quiesce_cpus() applied to all_cpus.  Returns
 * the value quiesce_cpus() returns.
 */
int
quiesce_all_cpus(const char *wmesg, int prio)
{

        return quiesce_cpus(all_cpus, wmesg, prio);
}
 884
 885 /* Extra care is taken with this sysctl because the data type is volatile */
 886 static int
 887 sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS)
 888 {
 889         int error, active;
 890
 891         active = smp_started;
 892         error = SYSCTL_OUT(req, &active, sizeof(active));
 893         return (error);
 894 }
 895
 896
 897 #ifdef SMP
/*
 * Reset 'node' to an empty state: every field is zeroed and the list
 * of children is initialized as empty.
 */
void
topo_init_node(struct topo_node *node)
{

        bzero(node, sizeof(*node));
        TAILQ_INIT(&node->children);
}
 905
/*
 * Initialize 'root' as an empty node of type TOPO_TYPE_SYSTEM.
 */
void
topo_init_root(struct topo_node *root)
{

        topo_init_node(root);
        root->type = TOPO_TYPE_SYSTEM;
}
 913
 914 /*
 915  * Add a child node with the given ID under the given parent.
 916  * Do nothing if there is already a child with that ID.
 917  */
 918 struct topo_node *
 919 topo_add_node_by_hwid(struct topo_node *parent, int hwid,
 920     topo_node_type type, uintptr_t subtype)
 921 {
 922         struct topo_node *node;
 923
 924         TAILQ_FOREACH_REVERSE(node, &parent->children,
 925             topo_children, siblings) {
 926                 if (node->hwid == hwid
 927                     && node->type == type && node->subtype == subtype) {
 928                         return (node);
 929                 }
 930         }
 931
 932         node = malloc(sizeof(*node), M_TOPO, M_WAITOK);
 933         topo_init_node(node);
 934         node->parent = parent;
 935         node->hwid = hwid;
 936         node->type = type;
 937         node->subtype = subtype;
 938         TAILQ_INSERT_TAIL(&parent->children, node, siblings);
 939         parent->nchildren++;
 940
 941         return (node);
 942 }
 943
 944 /*
 945  * Find a child node with the given ID under the given parent.
 946  */
 947 struct topo_node *
 948 topo_find_node_by_hwid(struct topo_node *parent, int hwid,
 949     topo_node_type type, uintptr_t subtype)
 950 {
 951
 952         struct topo_node *node;
 953
 954         TAILQ_FOREACH(node, &parent->children, siblings) {
 955                 if (node->hwid == hwid
 956                     && node->type == type && node->subtype == subtype) {
 957                         return (node);
 958                 }
 959         }
 960
 961         return (NULL);
 962 }
 963
 964 /*
 965  * Given a node change the order of its parent's child nodes such
 966  * that the node becomes the firt child while preserving the cyclic
 967  * order of the children.  In other words, the given node is promoted
 968  * by rotation.
 969  */
 970 void
 971 topo_promote_child(struct topo_node *child)
 972 {
 973         struct topo_node *next;
 974         struct topo_node *node;
 975         struct topo_node *parent;
 976
 977         parent = child->parent;
 978         next = TAILQ_NEXT(child, siblings);
 979         TAILQ_REMOVE(&parent->children, child, siblings);
 980         TAILQ_INSERT_HEAD(&parent->children, child, siblings);
 981
 982         while (next != NULL) {
 983                 node = next;
 984                 next = TAILQ_NEXT(node, siblings);
 985                 TAILQ_REMOVE(&parent->children, node, siblings);
 986                 TAILQ_INSERT_AFTER(&parent->children, child, node, siblings);
 987                 child = node;
 988         }
 989 }
 990
 991 /*
 992  * Iterate to the next node in the depth-first search (traversal) of
 993  * the topology tree.
 994  */
 995 struct topo_node *
 996 topo_next_node(struct topo_node *top, struct topo_node *node)
 997 {
 998         struct topo_node *next;
 999
1000         if ((next = TAILQ_FIRST(&node->children)) != NULL)
1001                 return (next);
1002
1003         if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1004                 return (next);
1005
1006         while (node != top && (node = node->parent) != top)
1007                 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1008                         return (next);
1009
1010         return (NULL);
1011 }
1012
1013 /*
1014  * Iterate to the next node in the depth-first search of the topology tree,
1015  * but without descending below the current node.
1016  */
1017 struct topo_node *
1018 topo_next_nonchild_node(struct topo_node *top, struct topo_node *node)
1019 {
1020         struct topo_node *next;
1021
1022         if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1023                 return (next);
1024
1025         while (node != top && (node = node->parent) != top)
1026                 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1027                         return (next);
1028
1029         return (NULL);
1030 }
1031
1032 /*
1033  * Assign the given ID to the given topology node that represents a logical
1034  * processor.
1035  */
1036 void
1037 topo_set_pu_id(struct topo_node *node, cpuid_t id)
1038 {
1039
1040         KASSERT(node->type == TOPO_TYPE_PU,
1041             ("topo_set_pu_id: wrong node type: %u", node->type));
1042         KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0,
1043             ("topo_set_pu_id: cpuset already not empty"));
1044         node->id = id;
1045         CPU_SET(id, &node->cpuset);
1046         node->cpu_count = 1;
1047         node->subtype = 1;
1048
1049         while ((node = node->parent) != NULL) {
1050                 KASSERT(!CPU_ISSET(id, &node->cpuset),
1051                     ("logical ID %u is already set in node %p", id, node));
1052                 CPU_SET(id, &node->cpuset);
1053                 node->cpu_count++;
1054         }
1055 }
1056
1057 static struct topology_spec {
1058         topo_node_type  type;
1059         bool            match_subtype;
1060         uintptr_t       subtype;
1061 } topology_level_table[TOPO_LEVEL_COUNT] = {
1062         [TOPO_LEVEL_PKG] = { .type = TOPO_TYPE_PKG, },
1063         [TOPO_LEVEL_GROUP] = { .type = TOPO_TYPE_GROUP, },
1064         [TOPO_LEVEL_CACHEGROUP] = {
1065                 .type = TOPO_TYPE_CACHE,
1066                 .match_subtype = true,
1067                 .subtype = CG_SHARE_L3,
1068         },
1069         [TOPO_LEVEL_CORE] = { .type = TOPO_TYPE_CORE, },
1070         [TOPO_LEVEL_THREAD] = { .type = TOPO_TYPE_PU, },
1071 };
1072
1073 static bool
1074 topo_analyze_table(struct topo_node *root, int all, enum topo_level level,
1075     struct topo_analysis *results)
1076 {
1077         struct topology_spec *spec;
1078         struct topo_node *node;
1079         int count;
1080
1081         if (level >= TOPO_LEVEL_COUNT)
1082                 return (true);
1083
1084         spec = &topology_level_table[level];
1085         count = 0;
1086         node = topo_next_node(root, root);
1087
1088         while (node != NULL) {
1089                 if (node->type != spec->type ||
1090                     (spec->match_subtype && node->subtype != spec->subtype)) {
1091                         node = topo_next_node(root, node);
1092                         continue;
1093                 }
1094                 if (!all && CPU_EMPTY(&node->cpuset)) {
1095                         node = topo_next_nonchild_node(root, node);
1096                         continue;
1097                 }
1098
1099                 count++;
1100
1101                 if (!topo_analyze_table(node, all, level + 1, results))
1102                         return (false);
1103
1104                 node = topo_next_nonchild_node(root, node);
1105         }
1106
1107         /* No explicit subgroups is essentially one subgroup. */
1108         if (count == 0) {
1109                 count = 1;
1110
1111                 if (!topo_analyze_table(root, all, level + 1, results))
1112                         return (false);
1113         }
1114
1115         if (results->entities[level] == -1)
1116                 results->entities[level] = count;
1117         else if (results->entities[level] != count)
1118                 return (false);
1119
1120         return (true);
1121 }
1122
1123 /*
1124  * Check if the topology is uniform, that is, each package has the same number
1125  * of cores in it and each core has the same number of threads (logical
1126  * processors) in it.  If so, calculate the number of packages, the number of
1127  * groups per package, the number of cachegroups per group, and the number of
1128  * logical processors per cachegroup.  'all' parameter tells whether to include
1129  * administratively disabled logical processors into the analysis.
1130  */
1131 int
1132 topo_analyze(struct topo_node *topo_root, int all,
1133     struct topo_analysis *results)
1134 {
1135
1136         results->entities[TOPO_LEVEL_PKG] = -1;
1137         results->entities[TOPO_LEVEL_CORE] = -1;
1138         results->entities[TOPO_LEVEL_THREAD] = -1;
1139         results->entities[TOPO_LEVEL_GROUP] = -1;
1140         results->entities[TOPO_LEVEL_CACHEGROUP] = -1;
1141
1142         if (!topo_analyze_table(topo_root, all, TOPO_LEVEL_PKG, results))
1143                 return (0);
1144
1145         KASSERT(results->entities[TOPO_LEVEL_PKG] > 0,
1146                 ("bug in topology or analysis"));
1147
1148         return (1);
1149 }
1150
1151 #endif /* SMP */
1152