sys/kern/subr_smp.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2001, John Baldwin <jhb@FreeBSD.org>.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  */
  28
  29 /*
  30  * This module holds the global variables and machine independent functions
  31  * used for the kernel SMP support.
  32  */
  33
  34 #include <sys/cdefs.h>
  35 __FBSDID("$FreeBSD$");
  36
  37 #include <sys/param.h>
  38 #include <sys/systm.h>
  39 #include <sys/kernel.h>
  40 #include <sys/ktr.h>
  41 #include <sys/proc.h>
  42 #include <sys/bus.h>
  43 #include <sys/lock.h>
  44 #include <sys/malloc.h>
  45 #include <sys/mutex.h>
  46 #include <sys/pcpu.h>
  47 #include <sys/sched.h>
  48 #include <sys/smp.h>
  49 #include <sys/sysctl.h>
  50
  51 #include <machine/cpu.h>
  52 #include <machine/smp.h>
  53
  54 #include "opt_sched.h"
  55
  56 #ifdef SMP
  57 MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data");
  58
  59 volatile cpuset_t stopped_cpus;
  60 volatile cpuset_t started_cpus;
  61 volatile cpuset_t suspended_cpus;
  62 cpuset_t hlt_cpus_mask;
  63 cpuset_t logical_cpus_mask;
  64
  65 void (*cpustop_restartfunc)(void);
  66 #endif
  67
  68 static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS);
  69
  70 /* This is used in modules that need to work in both SMP and UP. */
  71 cpuset_t all_cpus;
  72
  73 int mp_ncpus;
  74 /* export this for libkvm consumers. */
  75 int mp_maxcpus = MAXCPU;
  76
  77 volatile int smp_started;
  78 u_int mp_maxid;
  79
  80 static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL,
  81     "Kernel SMP");
  82
  83 SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0,
  84     "Max CPU ID.");
  85
  86 SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus,
  87     0, "Max number of CPUs that the system was compiled for.");
  88
  89 SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD|CTLTYPE_INT|CTLFLAG_MPSAFE,
  90     NULL, 0, sysctl_kern_smp_active, "I",
  91     "Indicates system is running in SMP mode");
  92
  93 int smp_disabled = 0;   /* has smp been disabled? */
  94 SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
  95     &smp_disabled, 0, "SMP has been disabled from the loader");
  96
  97 int smp_cpus = 1;       /* how many cpu's running */
  98 SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0,
  99     "Number of CPUs online");
 100
 101 int smp_topology = 0;   /* Which topology we're using. */
 102 SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0,
 103     "Topology override setting; 0 is default provided by hardware.");
 104
 105 #ifdef SMP
 106 /* Enable forwarding of a signal to a process running on a different CPU */
 107 static int forward_signal_enabled = 1;
 108 SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
 109            &forward_signal_enabled, 0,
 110            "Forwarding of a signal to a process on a different CPU");
 111
 112 /* Variables needed for SMP rendezvous. */
 113 static volatile int smp_rv_ncpus;
 114 static void (*volatile smp_rv_setup_func)(void *arg);
 115 static void (*volatile smp_rv_action_func)(void *arg);
 116 static void (*volatile smp_rv_teardown_func)(void *arg);
 117 static void *volatile smp_rv_func_arg;
 118 static volatile int smp_rv_waiters[4];
 119
 120 /*
 121  * Shared mutex to restrict busywaits between smp_rendezvous() and
 122  * smp(_targeted)_tlb_shootdown().  A deadlock occurs if both of these
 123  * functions trigger at once and cause multiple CPUs to busywait with
 124  * interrupts disabled.
 125  */
 126 struct mtx smp_ipi_mtx;
 127
 128 /*
 129  * Let the MD SMP code initialize mp_maxid very early if it can.
 130  */
 131 static void
 132 mp_setmaxid(void *dummy)
 133 {
 134
 135         cpu_mp_setmaxid();
 136
 137         KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__));
 138         KASSERT(mp_ncpus > 1 || mp_maxid == 0,
 139             ("%s: one CPU but mp_maxid is not zero", __func__));
 140         KASSERT(mp_maxid >= mp_ncpus - 1,
 141             ("%s: counters out of sync: max %d, count %d", __func__,
 142                 mp_maxid, mp_ncpus));
 143 }
 144 SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL);
 145
 146 /*
 147  * Call the MD SMP initialization code.
 148  */
 149 static void
 150 mp_start(void *dummy)
 151 {
 152
 153         mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);
 154
 155         /* Probe for MP hardware. */
 156         if (smp_disabled != 0 || cpu_mp_probe() == 0) {
 157                 mp_ncpus = 1;
 158                 CPU_SETOF(PCPU_GET(cpuid), &all_cpus);
 159                 return;
 160         }
 161
 162         cpu_mp_start();
 163         printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
 164             mp_ncpus);
 165         cpu_mp_announce();
 166 }
 167 SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL);
 168
 169 void
 170 forward_signal(struct thread *td)
 171 {
 172         int id;
 173
 174         /*
 175          * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on
 176          * this thread, so all we need to do is poke it if it is currently
 177          * executing so that it executes ast().
 178          */
 179         THREAD_LOCK_ASSERT(td, MA_OWNED);
 180         KASSERT(TD_IS_RUNNING(td),
 181             ("forward_signal: thread is not TDS_RUNNING"));
 182
 183         CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);
 184
 185         if (!smp_started || cold || panicstr)
 186                 return;
 187         if (!forward_signal_enabled)
 188                 return;
 189
 190         /* No need to IPI ourself. */
 191         if (td == curthread)
 192                 return;
 193
 194         id = td->td_oncpu;
 195         if (id == NOCPU)
 196                 return;
 197         ipi_cpu(id, IPI_AST);
 198 }
 199
 200 /*
 201  * When called the executing CPU will send an IPI to all other CPUs
 202  *  requesting that they halt execution.
 203  *
 204  * Usually (but not necessarily) called with 'other_cpus' as its arg.
 205  *
 206  *  - Signals all CPUs in map to stop.
 207  *  - Waits for each to stop.
 208  *
 209  * Returns:
 210  *  -1: error
 211  *   0: NA
 212  *   1: ok
 213  *
 214  */
 215 #if defined(__amd64__) || defined(__i386__)
 216 #define X86     1
 217 #else
 218 #define X86     0
 219 #endif
 220 static int
 221 generic_stop_cpus(cpuset_t map, u_int type)
 222 {
 223 #ifdef KTR
 224         char cpusetbuf[CPUSETBUFSIZ];
 225 #endif
 226         static volatile u_int stopping_cpu = NOCPU;
 227         int i;
 228         volatile cpuset_t *cpus;
 229
 230         KASSERT(
 231             type == IPI_STOP || type == IPI_STOP_HARD
 232 #if X86
 233             || type == IPI_SUSPEND
 234 #endif
 235             , ("%s: invalid stop type", __func__));
 236
 237         if (!smp_started)
 238                 return (0);
 239
 240         CTR2(KTR_SMP, "stop_cpus(%s) with %u type",
 241             cpusetobj_strprint(cpusetbuf, &map), type);
 242
 243 #if X86
 244         /*
 245          * When suspending, ensure there are are no IPIs in progress.
 246          * IPIs that have been issued, but not yet delivered (e.g.
 247          * not pending on a vCPU when running under virtualization)
 248          * will be lost, violating FreeBSD's assumption of reliable
 249          * IPI delivery.
 250          */
 251         if (type == IPI_SUSPEND)
 252                 mtx_lock_spin(&smp_ipi_mtx);
 253 #endif
 254
 255 #if X86
 256         if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
 257 #endif
 258         if (stopping_cpu != PCPU_GET(cpuid))
 259                 while (atomic_cmpset_int(&stopping_cpu, NOCPU,
 260                     PCPU_GET(cpuid)) == 0)
 261                         while (stopping_cpu != NOCPU)
 262                                 cpu_spinwait(); /* spin */
 263
 264         /* send the stop IPI to all CPUs in map */
 265         ipi_selected(map, type);
 266 #if X86
 267         }
 268 #endif
 269
 270 #if X86
 271         if (type == IPI_SUSPEND)
 272                 cpus = &suspended_cpus;
 273         else
 274 #endif
 275                 cpus = &stopped_cpus;
 276
 277         i = 0;
 278         while (!CPU_SUBSET(cpus, &map)) {
 279                 /* spin */
 280                 cpu_spinwait();
 281                 i++;
 282                 if (i == 100000000) {
 283                         printf("timeout stopping cpus\n");
 284                         break;
 285                 }
 286         }
 287
 288 #if X86
 289         if (type == IPI_SUSPEND)
 290                 mtx_unlock_spin(&smp_ipi_mtx);
 291 #endif
 292
 293         stopping_cpu = NOCPU;
 294         return (1);
 295 }
 296
 297 int
 298 stop_cpus(cpuset_t map)
 299 {
 300
 301         return (generic_stop_cpus(map, IPI_STOP));
 302 }
 303
 304 int
 305 stop_cpus_hard(cpuset_t map)
 306 {
 307
 308         return (generic_stop_cpus(map, IPI_STOP_HARD));
 309 }
 310
 311 #if X86
 312 int
 313 suspend_cpus(cpuset_t map)
 314 {
 315
 316         return (generic_stop_cpus(map, IPI_SUSPEND));
 317 }
 318 #endif
 319
 320 /*
 321  * Called by a CPU to restart stopped CPUs.
 322  *
 323  * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
 324  *
 325  *  - Signals all CPUs in map to restart.
 326  *  - Waits for each to restart.
 327  *
 328  * Returns:
 329  *  -1: error
 330  *   0: NA
 331  *   1: ok
 332  */
 333 static int
 334 generic_restart_cpus(cpuset_t map, u_int type)
 335 {
 336 #ifdef KTR
 337         char cpusetbuf[CPUSETBUFSIZ];
 338 #endif
 339         volatile cpuset_t *cpus;
 340
 341         KASSERT(type == IPI_STOP || type == IPI_STOP_HARD
 342 #if X86
 343             || type == IPI_SUSPEND
 344 #endif
 345             , ("%s: invalid stop type", __func__));
 346
 347         if (!smp_started)
 348                 return (0);
 349
 350         CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));
 351
 352 #if X86
 353         if (type == IPI_SUSPEND)
 354                 cpus = &resuming_cpus;
 355         else
 356 #endif
 357                 cpus = &stopped_cpus;
 358
 359         /* signal other cpus to restart */
 360 #if X86
 361         if (type == IPI_SUSPEND)
 362                 CPU_COPY_STORE_REL(&map, &toresume_cpus);
 363         else
 364 #endif
 365                 CPU_COPY_STORE_REL(&map, &started_cpus);
 366
 367 #if X86
 368         if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
 369 #endif
 370         /* wait for each to clear its bit */
 371         while (CPU_OVERLAP(cpus, &map))
 372                 cpu_spinwait();
 373 #if X86
 374         }
 375 #endif
 376
 377         return (1);
 378 }
 379
 380 int
 381 restart_cpus(cpuset_t map)
 382 {
 383
 384         return (generic_restart_cpus(map, IPI_STOP));
 385 }
 386
 387 #if X86
 388 int
 389 resume_cpus(cpuset_t map)
 390 {
 391
 392         return (generic_restart_cpus(map, IPI_SUSPEND));
 393 }
 394 #endif
 395 #undef X86
 396
 397 /*
 398  * All-CPU rendezvous.  CPUs are signalled, all execute the setup function
 399  * (if specified), rendezvous, execute the action function (if specified),
 400  * rendezvous again, execute the teardown function (if specified), and then
 401  * resume.
 402  *
 403  * Note that the supplied external functions _must_ be reentrant and aware
 404  * that they are running in parallel and in an unknown lock context.
 405  */
 406 void
 407 smp_rendezvous_action(void)
 408 {
 409         struct thread *td;
 410         void *local_func_arg;
 411         void (*local_setup_func)(void*);
 412         void (*local_action_func)(void*);
 413         void (*local_teardown_func)(void*);
 414 #ifdef INVARIANTS
 415         int owepreempt;
 416 #endif
 417
 418         /* Ensure we have up-to-date values. */
 419         atomic_add_acq_int(&smp_rv_waiters[0], 1);
 420         while (smp_rv_waiters[0] < smp_rv_ncpus)
 421                 cpu_spinwait();
 422
 423         /* Fetch rendezvous parameters after acquire barrier. */
 424         local_func_arg = smp_rv_func_arg;
 425         local_setup_func = smp_rv_setup_func;
 426         local_action_func = smp_rv_action_func;
 427         local_teardown_func = smp_rv_teardown_func;
 428
 429         /*
 430          * Use a nested critical section to prevent any preemptions
 431          * from occurring during a rendezvous action routine.
 432          * Specifically, if a rendezvous handler is invoked via an IPI
 433          * and the interrupted thread was in the critical_exit()
 434          * function after setting td_critnest to 0 but before
 435          * performing a deferred preemption, this routine can be
 436          * invoked with td_critnest set to 0 and td_owepreempt true.
 437          * In that case, a critical_exit() during the rendezvous
 438          * action would trigger a preemption which is not permitted in
 439          * a rendezvous action.  To fix this, wrap all of the
 440          * rendezvous action handlers in a critical section.  We
 441          * cannot use a regular critical section however as having
 442          * critical_exit() preempt from this routine would also be
 443          * problematic (the preemption must not occur before the IPI
 444          * has been acknowledged via an EOI).  Instead, we
 445          * intentionally ignore td_owepreempt when leaving the
 446          * critical section.  This should be harmless because we do
 447          * not permit rendezvous action routines to schedule threads,
 448          * and thus td_owepreempt should never transition from 0 to 1
 449          * during this routine.
 450          */
 451         td = curthread;
 452         td->td_critnest++;
 453 #ifdef INVARIANTS
 454         owepreempt = td->td_owepreempt;
 455 #endif
 456
 457         /*
 458          * If requested, run a setup function before the main action
 459          * function.  Ensure all CPUs have completed the setup
 460          * function before moving on to the action function.
 461          */
 462         if (local_setup_func != smp_no_rendezvous_barrier) {
 463                 if (smp_rv_setup_func != NULL)
 464                         smp_rv_setup_func(smp_rv_func_arg);
 465                 atomic_add_int(&smp_rv_waiters[1], 1);
 466                 while (smp_rv_waiters[1] < smp_rv_ncpus)
 467                         cpu_spinwait();
 468         }
 469
 470         if (local_action_func != NULL)
 471                 local_action_func(local_func_arg);
 472
 473         if (local_teardown_func != smp_no_rendezvous_barrier) {
 474                 /*
 475                  * Signal that the main action has been completed.  If a
 476                  * full exit rendezvous is requested, then all CPUs will
 477                  * wait here until all CPUs have finished the main action.
 478                  */
 479                 atomic_add_int(&smp_rv_waiters[2], 1);
 480                 while (smp_rv_waiters[2] < smp_rv_ncpus)
 481                         cpu_spinwait();
 482
 483                 if (local_teardown_func != NULL)
 484                         local_teardown_func(local_func_arg);
 485         }
 486
 487         /*
 488          * Signal that the rendezvous is fully completed by this CPU.
 489          * This means that no member of smp_rv_* pseudo-structure will be
 490          * accessed by this target CPU after this point; in particular,
 491          * memory pointed by smp_rv_func_arg.
 492          *
 493          * The release semantic ensures that all accesses performed by
 494          * the current CPU are visible when smp_rendezvous_cpus()
 495          * returns, by synchronizing with the
 496          * atomic_load_acq_int(&smp_rv_waiters[3]).
 497          */
 498         atomic_add_rel_int(&smp_rv_waiters[3], 1);
 499
 500         td->td_critnest--;
 501         KASSERT(owepreempt == td->td_owepreempt,
 502             ("rendezvous action changed td_owepreempt"));
 503 }
 504
 505 void
 506 smp_rendezvous_cpus(cpuset_t map,
 507         void (* setup_func)(void *),
 508         void (* action_func)(void *),
 509         void (* teardown_func)(void *),
 510         void *arg)
 511 {
 512         int curcpumap, i, ncpus = 0;
 513
 514         /* Look comments in the !SMP case. */
 515         if (!smp_started) {
 516                 spinlock_enter();
 517                 if (setup_func != NULL)
 518                         setup_func(arg);
 519                 if (action_func != NULL)
 520                         action_func(arg);
 521                 if (teardown_func != NULL)
 522                         teardown_func(arg);
 523                 spinlock_exit();
 524                 return;
 525         }
 526
 527         CPU_FOREACH(i) {
 528                 if (CPU_ISSET(i, &map))
 529                         ncpus++;
 530         }
 531         if (ncpus == 0)
 532                 panic("ncpus is 0 with non-zero map");
 533
 534         mtx_lock_spin(&smp_ipi_mtx);
 535
 536         /* Pass rendezvous parameters via global variables. */
 537         smp_rv_ncpus = ncpus;
 538         smp_rv_setup_func = setup_func;
 539         smp_rv_action_func = action_func;
 540         smp_rv_teardown_func = teardown_func;
 541         smp_rv_func_arg = arg;
 542         smp_rv_waiters[1] = 0;
 543         smp_rv_waiters[2] = 0;
 544         smp_rv_waiters[3] = 0;
 545         atomic_store_rel_int(&smp_rv_waiters[0], 0);
 546
 547         /*
 548          * Signal other processors, which will enter the IPI with
 549          * interrupts off.
 550          */
 551         curcpumap = CPU_ISSET(curcpu, &map);
 552         CPU_CLR(curcpu, &map);
 553         ipi_selected(map, IPI_RENDEZVOUS);
 554
 555         /* Check if the current CPU is in the map */
 556         if (curcpumap != 0)
 557                 smp_rendezvous_action();
 558
 559         /*
 560          * Ensure that the master CPU waits for all the other
 561          * CPUs to finish the rendezvous, so that smp_rv_*
 562          * pseudo-structure and the arg are guaranteed to not
 563          * be in use.
 564          *
 565          * Load acquire synchronizes with the release add in
 566          * smp_rendezvous_action(), which ensures that our caller sees
 567          * all memory actions done by the called functions on other
 568          * CPUs.
 569          */
 570         while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
 571                 cpu_spinwait();
 572
 573         mtx_unlock_spin(&smp_ipi_mtx);
 574 }
 575
 576 void
 577 smp_rendezvous(void (* setup_func)(void *),
 578                void (* action_func)(void *),
 579                void (* teardown_func)(void *),
 580                void *arg)
 581 {
 582         smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
 583 }
 584
 585 static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1];
 586
 587 struct cpu_group *
 588 smp_topo(void)
 589 {
 590         char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
 591         struct cpu_group *top;
 592
 593         /*
 594          * Check for a fake topology request for debugging purposes.
 595          */
 596         switch (smp_topology) {
 597         case 1:
 598                 /* Dual core with no sharing.  */
 599                 top = smp_topo_1level(CG_SHARE_NONE, 2, 0);
 600                 break;
 601         case 2:
 602                 /* No topology, all cpus are equal. */
 603                 top = smp_topo_none();
 604                 break;
 605         case 3:
 606                 /* Dual core with shared L2.  */
 607                 top = smp_topo_1level(CG_SHARE_L2, 2, 0);
 608                 break;
 609         case 4:
 610                 /* quad core, shared l3 among each package, private l2.  */
 611                 top = smp_topo_1level(CG_SHARE_L3, 4, 0);
 612                 break;
 613         case 5:
 614                 /* quad core,  2 dualcore parts on each package share l2.  */
 615                 top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0);
 616                 break;
 617         case 6:
 618                 /* Single-core 2xHTT */
 619                 top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
 620                 break;
 621         case 7:
 622                 /* quad core with a shared l3, 8 threads sharing L2.  */
 623                 top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
 624                     CG_FLAG_SMT);
 625                 break;
 626         default:
 627                 /* Default, ask the system what it wants. */
 628                 top = cpu_topo();
 629                 break;
 630         }
 631         /*
 632          * Verify the returned topology.
 633          */
 634         if (top->cg_count != mp_ncpus)
 635                 panic("Built bad topology at %p.  CPU count %d != %d",
 636                     top, top->cg_count, mp_ncpus);
 637         if (CPU_CMP(&top->cg_mask, &all_cpus))
 638                 panic("Built bad topology at %p.  CPU mask (%s) != (%s)",
 639                     top, cpusetobj_strprint(cpusetbuf, &top->cg_mask),
 640                     cpusetobj_strprint(cpusetbuf2, &all_cpus));
 641
 642         /*
 643          * Collapse nonsense levels that may be created out of convenience by
 644          * the MD layers.  They cause extra work in the search functions.
 645          */
 646         while (top->cg_children == 1) {
 647                 top = &top->cg_child[0];
 648                 top->cg_parent = NULL;
 649         }
 650         return (top);
 651 }
 652
 653 struct cpu_group *
 654 smp_topo_alloc(u_int count)
 655 {
 656         static u_int index;
 657         u_int curr;
 658
 659         curr = index;
 660         index += count;
 661         return (&group[curr]);
 662 }
 663
 664 struct cpu_group *
 665 smp_topo_none(void)
 666 {
 667         struct cpu_group *top;
 668
 669         top = &group[0];
 670         top->cg_parent = NULL;
 671         top->cg_child = NULL;
 672         top->cg_mask = all_cpus;
 673         top->cg_count = mp_ncpus;
 674         top->cg_children = 0;
 675         top->cg_level = CG_SHARE_NONE;
 676         top->cg_flags = 0;
 677
 678         return (top);
 679 }
 680
 681 static int
 682 smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share,
 683     int count, int flags, int start)
 684 {
 685         char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
 686         cpuset_t mask;
 687         int i;
 688
 689         CPU_ZERO(&mask);
 690         for (i = 0; i < count; i++, start++)
 691                 CPU_SET(start, &mask);
 692         child->cg_parent = parent;
 693         child->cg_child = NULL;
 694         child->cg_children = 0;
 695         child->cg_level = share;
 696         child->cg_count = count;
 697         child->cg_flags = flags;
 698         child->cg_mask = mask;
 699         parent->cg_children++;
 700         for (; parent != NULL; parent = parent->cg_parent) {
 701                 if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask))
 702                         panic("Duplicate children in %p.  mask (%s) child (%s)",
 703                             parent,
 704                             cpusetobj_strprint(cpusetbuf, &parent->cg_mask),
 705                             cpusetobj_strprint(cpusetbuf2, &child->cg_mask));
 706                 CPU_OR(&parent->cg_mask, &child->cg_mask);
 707                 parent->cg_count += child->cg_count;
 708         }
 709
 710         return (start);
 711 }
 712
 713 struct cpu_group *
 714 smp_topo_1level(int share, int count, int flags)
 715 {
 716         struct cpu_group *child;
 717         struct cpu_group *top;
 718         int packages;
 719         int cpu;
 720         int i;
 721
 722         cpu = 0;
 723         top = &group[0];
 724         packages = mp_ncpus / count;
 725         top->cg_child = child = &group[1];
 726         top->cg_level = CG_SHARE_NONE;
 727         for (i = 0; i < packages; i++, child++)
 728                 cpu = smp_topo_addleaf(top, child, share, count, flags, cpu);
 729         return (top);
 730 }
 731
 732 struct cpu_group *
 733 smp_topo_2level(int l2share, int l2count, int l1share, int l1count,
 734     int l1flags)
 735 {
 736         struct cpu_group *top;
 737         struct cpu_group *l1g;
 738         struct cpu_group *l2g;
 739         int cpu;
 740         int i;
 741         int j;
 742
 743         cpu = 0;
 744         top = &group[0];
 745         l2g = &group[1];
 746         top->cg_child = l2g;
 747         top->cg_level = CG_SHARE_NONE;
 748         top->cg_children = mp_ncpus / (l2count * l1count);
 749         l1g = l2g + top->cg_children;
 750         for (i = 0; i < top->cg_children; i++, l2g++) {
 751                 l2g->cg_parent = top;
 752                 l2g->cg_child = l1g;
 753                 l2g->cg_level = l2share;
 754                 for (j = 0; j < l2count; j++, l1g++)
 755                         cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count,
 756                             l1flags, cpu);
 757         }
 758         return (top);
 759 }
 760
 761
 762 struct cpu_group *
 763 smp_topo_find(struct cpu_group *top, int cpu)
 764 {
 765         struct cpu_group *cg;
 766         cpuset_t mask;
 767         int children;
 768         int i;
 769
 770         CPU_SETOF(cpu, &mask);
 771         cg = top;
 772         for (;;) {
 773                 if (!CPU_OVERLAP(&cg->cg_mask, &mask))
 774                         return (NULL);
 775                 if (cg->cg_children == 0)
 776                         return (cg);
 777                 children = cg->cg_children;
 778                 for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
 779                         if (CPU_OVERLAP(&cg->cg_mask, &mask))
 780                                 break;
 781         }
 782         return (NULL);
 783 }
 784 #else /* !SMP */
 785
 786 void
 787 smp_rendezvous_cpus(cpuset_t map,
 788         void (*setup_func)(void *),
 789         void (*action_func)(void *),
 790         void (*teardown_func)(void *),
 791         void *arg)
 792 {
 793         /*
 794          * In the !SMP case we just need to ensure the same initial conditions
 795          * as the SMP case.
 796          */
 797         spinlock_enter();
 798         if (setup_func != NULL)
 799                 setup_func(arg);
 800         if (action_func != NULL)
 801                 action_func(arg);
 802         if (teardown_func != NULL)
 803                 teardown_func(arg);
 804         spinlock_exit();
 805 }
 806
 807 void
 808 smp_rendezvous(void (*setup_func)(void *),
 809                void (*action_func)(void *),
 810                void (*teardown_func)(void *),
 811                void *arg)
 812 {
 813
 814         smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func,
 815             arg);
 816 }
 817
 818 /*
 819  * Provide dummy SMP support for UP kernels.  Modules that need to use SMP
 820  * APIs will still work using this dummy support.
 821  */
 822 static void
 823 mp_setvariables_for_up(void *dummy)
 824 {
 825         mp_ncpus = 1;
 826         mp_maxid = PCPU_GET(cpuid);
 827         CPU_SETOF(mp_maxid, &all_cpus);
 828         KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero"));
 829 }
 830 SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST,
 831     mp_setvariables_for_up, NULL);
 832 #endif /* SMP */
 833
 834 void
 835 smp_no_rendezvous_barrier(void *dummy)
 836 {
 837 #ifdef SMP
 838         KASSERT((!smp_started),("smp_no_rendezvous called and smp is started"));
 839 #endif
 840 }
 841
 842 /*
 843  * Wait for specified idle threads to switch once.  This ensures that even
 844  * preempted threads have cycled through the switch function once,
 845  * exiting their codepaths.  This allows us to change global pointers
 846  * with no other synchronization.
 847  */
 848 int
 849 quiesce_cpus(cpuset_t map, const char *wmesg, int prio)
 850 {
 851         struct pcpu *pcpu;
 852         u_int gen[MAXCPU];
 853         int error;
 854         int cpu;
 855
 856         error = 0;
 857         for (cpu = 0; cpu <= mp_maxid; cpu++) {
 858                 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
 859                         continue;
 860                 pcpu = pcpu_find(cpu);
 861                 gen[cpu] = pcpu->pc_idlethread->td_generation;
 862         }
 863         for (cpu = 0; cpu <= mp_maxid; cpu++) {
 864                 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
 865                         continue;
 866                 pcpu = pcpu_find(cpu);
 867                 thread_lock(curthread);
 868                 sched_bind(curthread, cpu);
 869                 thread_unlock(curthread);
 870                 while (gen[cpu] == pcpu->pc_idlethread->td_generation) {
 871                         error = tsleep(quiesce_cpus, prio, wmesg, 1);
 872                         if (error != EWOULDBLOCK)
 873                                 goto out;
 874                         error = 0;
 875                 }
 876         }
 877 out:
 878         thread_lock(curthread);
 879         sched_unbind(curthread);
 880         thread_unlock(curthread);
 881
 882         return (error);
 883 }
 884
 885 int
 886 quiesce_all_cpus(const char *wmesg, int prio)
 887 {
 888
 889         return quiesce_cpus(all_cpus, wmesg, prio);
 890 }
 891
 892 /* Extra care is taken with this sysctl because the data type is volatile */
 893 static int
 894 sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS)
 895 {
 896         int error, active;
 897
 898         active = smp_started;
 899         error = SYSCTL_OUT(req, &active, sizeof(active));
 900         return (error);
 901 }
 902
 903
 904 #ifdef SMP
 905 void
 906 topo_init_node(struct topo_node *node)
 907 {
 908
 909         bzero(node, sizeof(*node));
 910         TAILQ_INIT(&node->children);
 911 }
 912
 913 void
 914 topo_init_root(struct topo_node *root)
 915 {
 916
 917         topo_init_node(root);
 918         root->type = TOPO_TYPE_SYSTEM;
 919 }
 920
 921 /*
 922  * Add a child node with the given ID under the given parent.
 923  * Do nothing if there is already a child with that ID.
 924  */
 925 struct topo_node *
 926 topo_add_node_by_hwid(struct topo_node *parent, int hwid,
 927     topo_node_type type, uintptr_t subtype)
 928 {
 929         struct topo_node *node;
 930
 931         TAILQ_FOREACH_REVERSE(node, &parent->children,
 932             topo_children, siblings) {
 933                 if (node->hwid == hwid
 934                     && node->type == type && node->subtype == subtype) {
 935                         return (node);
 936                 }
 937         }
 938
 939         node = malloc(sizeof(*node), M_TOPO, M_WAITOK);
 940         topo_init_node(node);
 941         node->parent = parent;
 942         node->hwid = hwid;
 943         node->type = type;
 944         node->subtype = subtype;
 945         TAILQ_INSERT_TAIL(&parent->children, node, siblings);
 946         parent->nchildren++;
 947
 948         return (node);
 949 }
 950
 951 /*
 952  * Find a child node with the given ID under the given parent.
 953  */
 954 struct topo_node *
 955 topo_find_node_by_hwid(struct topo_node *parent, int hwid,
 956     topo_node_type type, uintptr_t subtype)
 957 {
 958
 959         struct topo_node *node;
 960
 961         TAILQ_FOREACH(node, &parent->children, siblings) {
 962                 if (node->hwid == hwid
 963                     && node->type == type && node->subtype == subtype) {
 964                         return (node);
 965                 }
 966         }
 967
 968         return (NULL);
 969 }
 970
 971 /*
 972  * Given a node change the order of its parent's child nodes such
 973  * that the node becomes the firt child while preserving the cyclic
 974  * order of the children.  In other words, the given node is promoted
 975  * by rotation.
 976  */
 977 void
 978 topo_promote_child(struct topo_node *child)
 979 {
 980         struct topo_node *next;
 981         struct topo_node *node;
 982         struct topo_node *parent;
 983
 984         parent = child->parent;
 985         next = TAILQ_NEXT(child, siblings);
 986         TAILQ_REMOVE(&parent->children, child, siblings);
 987         TAILQ_INSERT_HEAD(&parent->children, child, siblings);
 988
 989         while (next != NULL) {
 990                 node = next;
 991                 next = TAILQ_NEXT(node, siblings);
 992                 TAILQ_REMOVE(&parent->children, node, siblings);
 993                 TAILQ_INSERT_AFTER(&parent->children, child, node, siblings);
 994                 child = node;
 995         }
 996 }
 997
 998 /*
 999  * Iterate to the next node in the depth-first search (traversal) of
1000  * the topology tree.
1001  */
1002 struct topo_node *
1003 topo_next_node(struct topo_node *top, struct topo_node *node)
1004 {
1005         struct topo_node *next;
1006
1007         if ((next = TAILQ_FIRST(&node->children)) != NULL)
1008                 return (next);
1009
1010         if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1011                 return (next);
1012
1013         while (node != top && (node = node->parent) != top)
1014                 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1015                         return (next);
1016
1017         return (NULL);
1018 }
1019
1020 /*
1021  * Iterate to the next node in the depth-first search of the topology tree,
1022  * but without descending below the current node.
1023  */
1024 struct topo_node *
1025 topo_next_nonchild_node(struct topo_node *top, struct topo_node *node)
1026 {
1027         struct topo_node *next;
1028
1029         if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1030                 return (next);
1031
1032         while (node != top && (node = node->parent) != top)
1033                 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1034                         return (next);
1035
1036         return (NULL);
1037 }
1038
1039 /*
1040  * Assign the given ID to the given topology node that represents a logical
1041  * processor.
1042  */
1043 void
1044 topo_set_pu_id(struct topo_node *node, cpuid_t id)
1045 {
1046
1047         KASSERT(node->type == TOPO_TYPE_PU,
1048             ("topo_set_pu_id: wrong node type: %u", node->type));
1049         KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0,
1050             ("topo_set_pu_id: cpuset already not empty"));
1051         node->id = id;
1052         CPU_SET(id, &node->cpuset);
1053         node->cpu_count = 1;
1054         node->subtype = 1;
1055
1056         while ((node = node->parent) != NULL) {
1057                 KASSERT(!CPU_ISSET(id, &node->cpuset),
1058                     ("logical ID %u is already set in node %p", id, node));
1059                 CPU_SET(id, &node->cpuset);
1060                 node->cpu_count++;
1061         }
1062 }
1063
1064 static struct topology_spec {
1065         topo_node_type  type;
1066         bool            match_subtype;
1067         uintptr_t       subtype;
1068 } topology_level_table[TOPO_LEVEL_COUNT] = {
1069         [TOPO_LEVEL_PKG] = { .type = TOPO_TYPE_PKG, },
1070         [TOPO_LEVEL_GROUP] = { .type = TOPO_TYPE_GROUP, },
1071         [TOPO_LEVEL_CACHEGROUP] = {
1072                 .type = TOPO_TYPE_CACHE,
1073                 .match_subtype = true,
1074                 .subtype = CG_SHARE_L3,
1075         },
1076         [TOPO_LEVEL_CORE] = { .type = TOPO_TYPE_CORE, },
1077         [TOPO_LEVEL_THREAD] = { .type = TOPO_TYPE_PU, },
1078 };
1079
1080 static bool
1081 topo_analyze_table(struct topo_node *root, int all, enum topo_level level,
1082     struct topo_analysis *results)
1083 {
1084         struct topology_spec *spec;
1085         struct topo_node *node;
1086         int count;
1087
1088         if (level >= TOPO_LEVEL_COUNT)
1089                 return (true);
1090
1091         spec = &topology_level_table[level];
1092         count = 0;
1093         node = topo_next_node(root, root);
1094
1095         while (node != NULL) {
1096                 if (node->type != spec->type ||
1097                     (spec->match_subtype && node->subtype != spec->subtype)) {
1098                         node = topo_next_node(root, node);
1099                         continue;
1100                 }
1101                 if (!all && CPU_EMPTY(&node->cpuset)) {
1102                         node = topo_next_nonchild_node(root, node);
1103                         continue;
1104                 }
1105
1106                 count++;
1107
1108                 if (!topo_analyze_table(node, all, level + 1, results))
1109                         return (false);
1110
1111                 node = topo_next_nonchild_node(root, node);
1112         }
1113
1114         /* No explicit subgroups is essentially one subgroup. */
1115         if (count == 0) {
1116                 count = 1;
1117
1118                 if (!topo_analyze_table(root, all, level + 1, results))
1119                         return (false);
1120         }
1121
1122         if (results->entities[level] == -1)
1123                 results->entities[level] = count;
1124         else if (results->entities[level] != count)
1125                 return (false);
1126
1127         return (true);
1128 }
1129
1130 /*
1131  * Check if the topology is uniform, that is, each package has the same number
1132  * of cores in it and each core has the same number of threads (logical
1133  * processors) in it.  If so, calculate the number of packages, the number of
1134  * groups per package, the number of cachegroups per group, and the number of
1135  * logical processors per cachegroup.  'all' parameter tells whether to include
1136  * administratively disabled logical processors into the analysis.
1137  */
1138 int
1139 topo_analyze(struct topo_node *topo_root, int all,
1140     struct topo_analysis *results)
1141 {
1142
1143         results->entities[TOPO_LEVEL_PKG] = -1;
1144         results->entities[TOPO_LEVEL_CORE] = -1;
1145         results->entities[TOPO_LEVEL_THREAD] = -1;
1146         results->entities[TOPO_LEVEL_GROUP] = -1;
1147         results->entities[TOPO_LEVEL_CACHEGROUP] = -1;
1148
1149         if (!topo_analyze_table(topo_root, all, TOPO_LEVEL_PKG, results))
1150                 return (0);
1151
1152         KASSERT(results->entities[TOPO_LEVEL_PKG] > 0,
1153                 ("bug in topology or analysis"));
1154
1155         return (1);
1156 }
1157
1158 #endif /* SMP */
1159