sys/kern/subr_smp.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2001, John Baldwin <jhb@FreeBSD.org>.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  */
  28
  29 /*
  30  * This module holds the global variables and machine independent functions
  31  * used for the kernel SMP support.
  32  */
  33
  34 #include <sys/cdefs.h>
  35 __FBSDID("$FreeBSD$");
  36
  37 #include <sys/param.h>
  38 #include <sys/systm.h>
  39 #include <sys/kernel.h>
  40 #include <sys/ktr.h>
  41 #include <sys/proc.h>
  42 #include <sys/bus.h>
  43 #include <sys/lock.h>
  44 #include <sys/malloc.h>
  45 #include <sys/mutex.h>
  46 #include <sys/pcpu.h>
  47 #include <sys/sched.h>
  48 #include <sys/smp.h>
  49 #include <sys/sysctl.h>
  50
  51 #include <machine/cpu.h>
  52 #include <machine/smp.h>
  53
  54 #include "opt_sched.h"
  55
#ifdef SMP
/* Malloc type used for topology nodes (see topo_add_node_by_hwid()). */
MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data");

/*
 * CPU sets used by the stop/restart handshake in generic_stop_cpus() and
 * generic_restart_cpus().
 */
/* Polled by generic_stop_cpus() for IPI_STOP/IPI_STOP_HARD requests. */
volatile cpuset_t stopped_cpus;
/* Written by generic_restart_cpus() with the set of CPUs to restart. */
volatile cpuset_t started_cpus;
/* Polled by generic_stop_cpus() for IPI_SUSPEND requests (x86 only). */
volatile cpuset_t suspended_cpus;
/* NOTE(review): not referenced in the visible part of this file. */
cpuset_t hlt_cpus_mask;
cpuset_t logical_cpus_mask;

/* NOTE(review): hook is only defined here; its users are outside this view. */
void (*cpustop_restartfunc)(void);
#endif
  67
/* Handler for the kern.smp.active sysctl, defined near the end of the file. */
static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS);

/* This is used in modules that need to work in both SMP and UP. */
cpuset_t all_cpus;

/* CPU count; forced to 1 when SMP is disabled, not detected, or compiled out. */
int mp_ncpus;
/* export this for libkvm consumers. */
int mp_maxcpus = MAXCPU;

/*
 * Non-zero once SMP is running.  While it is zero the stop/restart helpers
 * return 0 and smp_rendezvous_cpus() runs the callbacks on the local CPU.
 */
volatile int smp_started;
/* Highest CPU id; exported as kern.smp.maxid. */
u_int mp_maxid;
  79
/*
 * The kern.smp sysctl tree.  Every OID below is a child of this node and
 * exports one of the variables defined in this file.
 */
static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL,
    "Kernel SMP");

SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0,
    "Max CPU ID.");

SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus,
    0, "Max number of CPUs that the system was compiled for.");

/* Goes through a handler so the volatile smp_started is copied before export. */
SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD|CTLTYPE_INT|CTLFLAG_MPSAFE,
    NULL, 0, sysctl_kern_smp_active, "I",
    "Indicates system is running in SMP mode");

/* Checked by mp_start(): when non-zero the MP hardware probe is skipped. */
int smp_disabled = 0;   /* has smp been disabled? */
SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
    &smp_disabled, 0, "SMP has been disabled from the loader");

int smp_cpus = 1;       /* how many cpu's running */
SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0,
    "Number of CPUs online");

int smp_threads_per_core = 1;   /* how many SMT threads are running per core */
SYSCTL_INT(_kern_smp, OID_AUTO, threads_per_core, CTLFLAG_RD|CTLFLAG_CAPRD,
    &smp_threads_per_core, 0, "Number of SMT threads online per core");
 104
 105 int mp_ncores = -1;     /* how many physical cores running */
 106 SYSCTL_INT(_kern_smp, OID_AUTO, cores, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_ncores, 0,
 107     "Number of CPUs online");
 108
/*
 * Debugging override consumed by smp_topo(): values 1 through 7 select one
 * of the synthetic topologies built there, any other value (including the
 * default 0) uses the topology reported by cpu_topo().
 */
int smp_topology = 0;   /* Which topology we're using. */
SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0,
    "Topology override setting; 0 is default provided by hardware.");
 112
 113 #ifdef SMP
/* Enable forwarding of a signal to a process running on a different CPU */
static int forward_signal_enabled = 1;
SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
           &forward_signal_enabled, 0,
           "Forwarding of a signal to a process on a different CPU");

/*
 * Variables needed for SMP rendezvous.
 *
 * They are filled in by smp_rendezvous_cpus() while it holds smp_ipi_mtx
 * and read by every participant in smp_rendezvous_action().
 */
/* Number of CPUs taking part in the current rendezvous. */
static volatile int smp_rv_ncpus;
static void (*volatile smp_rv_setup_func)(void *arg);
static void (*volatile smp_rv_action_func)(void *arg);
static void (*volatile smp_rv_teardown_func)(void *arg);
/* Argument handed to each of the three callbacks. */
static void *volatile smp_rv_func_arg;
/*
 * Per-phase arrival counters, each incremented once per participant:
 * [0] entry, [1] setup finished, [2] action finished, [3] rendezvous
 * fully completed (the initiator spins on this one before returning).
 */
static volatile int smp_rv_waiters[4];

/*
 * Shared mutex to restrict busywaits between smp_rendezvous() and
 * smp(_targeted)_tlb_shootdown().  A deadlock occurs if both of these
 * functions trigger at once and cause multiple CPUs to busywait with
 * interrupts disabled.
 */
struct mtx smp_ipi_mtx;
 135
 136 /*
 137  * Let the MD SMP code initialize mp_maxid very early if it can.
 138  */
 139 static void
 140 mp_setmaxid(void *dummy)
 141 {
 142
 143         cpu_mp_setmaxid();
 144
 145         KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__));
 146         KASSERT(mp_ncpus > 1 || mp_maxid == 0,
 147             ("%s: one CPU but mp_maxid is not zero", __func__));
 148         KASSERT(mp_maxid >= mp_ncpus - 1,
 149             ("%s: counters out of sync: max %d, count %d", __func__,
 150                 mp_maxid, mp_ncpus));
 151 }
 152 SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL);
 153
 154 /*
 155  * Call the MD SMP initialization code.
 156  */
 157 static void
 158 mp_start(void *dummy)
 159 {
 160
 161         mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);
 162
 163         /* Probe for MP hardware. */
 164         if (smp_disabled != 0 || cpu_mp_probe() == 0) {
 165                 mp_ncores = 1;
 166                 mp_ncpus = 1;
 167                 CPU_SETOF(PCPU_GET(cpuid), &all_cpus);
 168                 return;
 169         }
 170
 171         cpu_mp_start();
 172         printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
 173             mp_ncpus);
 174
 175         /* Provide a default for most architectures that don't have SMT/HTT. */
 176         if (mp_ncores < 0)
 177                 mp_ncores = mp_ncpus;
 178
 179         cpu_mp_announce();
 180 }
 181 SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL);
 182
 183 void
 184 forward_signal(struct thread *td)
 185 {
 186         int id;
 187
 188         /*
 189          * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on
 190          * this thread, so all we need to do is poke it if it is currently
 191          * executing so that it executes ast().
 192          */
 193         THREAD_LOCK_ASSERT(td, MA_OWNED);
 194         KASSERT(TD_IS_RUNNING(td),
 195             ("forward_signal: thread is not TDS_RUNNING"));
 196
 197         CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);
 198
 199         if (!smp_started || cold || panicstr)
 200                 return;
 201         if (!forward_signal_enabled)
 202                 return;
 203
 204         /* No need to IPI ourself. */
 205         if (td == curthread)
 206                 return;
 207
 208         id = td->td_oncpu;
 209         if (id == NOCPU)
 210                 return;
 211         ipi_cpu(id, IPI_AST);
 212 }
 213
 214 /*
 215  * When called the executing CPU will send an IPI to all other CPUs
 216  *  requesting that they halt execution.
 217  *
 218  * Usually (but not necessarily) called with 'other_cpus' as its arg.
 219  *
 220  *  - Signals all CPUs in map to stop.
 221  *  - Waits for each to stop.
 222  *
 223  * Returns:
 224  *  -1: error
 225  *   0: NA
 226  *   1: ok
 227  *
 228  */
 229 #if defined(__amd64__) || defined(__i386__)
 230 #define X86     1
 231 #else
 232 #define X86     0
 233 #endif
 234 static int
 235 generic_stop_cpus(cpuset_t map, u_int type)
 236 {
 237 #ifdef KTR
 238         char cpusetbuf[CPUSETBUFSIZ];
 239 #endif
 240         static volatile u_int stopping_cpu = NOCPU;
 241         int i;
 242         volatile cpuset_t *cpus;
 243
 244         KASSERT(
 245             type == IPI_STOP || type == IPI_STOP_HARD
 246 #if X86
 247             || type == IPI_SUSPEND
 248 #endif
 249             , ("%s: invalid stop type", __func__));
 250
 251         if (!smp_started)
 252                 return (0);
 253
 254         CTR2(KTR_SMP, "stop_cpus(%s) with %u type",
 255             cpusetobj_strprint(cpusetbuf, &map), type);
 256
 257 #if X86
 258         /*
 259          * When suspending, ensure there are are no IPIs in progress.
 260          * IPIs that have been issued, but not yet delivered (e.g.
 261          * not pending on a vCPU when running under virtualization)
 262          * will be lost, violating FreeBSD's assumption of reliable
 263          * IPI delivery.
 264          */
 265         if (type == IPI_SUSPEND)
 266                 mtx_lock_spin(&smp_ipi_mtx);
 267 #endif
 268
 269 #if X86
 270         if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
 271 #endif
 272         if (stopping_cpu != PCPU_GET(cpuid))
 273                 while (atomic_cmpset_int(&stopping_cpu, NOCPU,
 274                     PCPU_GET(cpuid)) == 0)
 275                         while (stopping_cpu != NOCPU)
 276                                 cpu_spinwait(); /* spin */
 277
 278         /* send the stop IPI to all CPUs in map */
 279         ipi_selected(map, type);
 280 #if X86
 281         }
 282 #endif
 283
 284 #if X86
 285         if (type == IPI_SUSPEND)
 286                 cpus = &suspended_cpus;
 287         else
 288 #endif
 289                 cpus = &stopped_cpus;
 290
 291         i = 0;
 292         while (!CPU_SUBSET(cpus, &map)) {
 293                 /* spin */
 294                 cpu_spinwait();
 295                 i++;
 296                 if (i == 100000000) {
 297                         printf("timeout stopping cpus\n");
 298                         break;
 299                 }
 300         }
 301
 302 #if X86
 303         if (type == IPI_SUSPEND)
 304                 mtx_unlock_spin(&smp_ipi_mtx);
 305 #endif
 306
 307         stopping_cpu = NOCPU;
 308         return (1);
 309 }
 310
/*
 * Stop the CPUs in 'map' with an IPI_STOP request.  The return value is
 * passed through from generic_stop_cpus().
 */
int
stop_cpus(cpuset_t map)
{

        return (generic_stop_cpus(map, IPI_STOP));
}
 317
/*
 * Stop the CPUs in 'map' with an IPI_STOP_HARD request.  The return value
 * is passed through from generic_stop_cpus().
 */
int
stop_cpus_hard(cpuset_t map)
{

        return (generic_stop_cpus(map, IPI_STOP_HARD));
}
 324
#if X86
/*
 * Suspend the CPUs in 'map' with an IPI_SUSPEND request; only built on
 * amd64/i386.  The return value is passed through from generic_stop_cpus().
 */
int
suspend_cpus(cpuset_t map)
{

        return (generic_stop_cpus(map, IPI_SUSPEND));
}
#endif
 333
 334 /*
 335  * Called by a CPU to restart stopped CPUs.
 336  *
 337  * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
 338  *
 339  *  - Signals all CPUs in map to restart.
 340  *  - Waits for each to restart.
 341  *
 342  * Returns:
 343  *  -1: error
 344  *   0: NA
 345  *   1: ok
 346  */
 347 static int
 348 generic_restart_cpus(cpuset_t map, u_int type)
 349 {
 350 #ifdef KTR
 351         char cpusetbuf[CPUSETBUFSIZ];
 352 #endif
 353         volatile cpuset_t *cpus;
 354
 355         KASSERT(type == IPI_STOP || type == IPI_STOP_HARD
 356 #if X86
 357             || type == IPI_SUSPEND
 358 #endif
 359             , ("%s: invalid stop type", __func__));
 360
 361         if (!smp_started)
 362                 return (0);
 363
 364         CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));
 365
 366 #if X86
 367         if (type == IPI_SUSPEND)
 368                 cpus = &resuming_cpus;
 369         else
 370 #endif
 371                 cpus = &stopped_cpus;
 372
 373         /* signal other cpus to restart */
 374 #if X86
 375         if (type == IPI_SUSPEND)
 376                 CPU_COPY_STORE_REL(&map, &toresume_cpus);
 377         else
 378 #endif
 379                 CPU_COPY_STORE_REL(&map, &started_cpus);
 380
 381 #if X86
 382         if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
 383 #endif
 384         /* wait for each to clear its bit */
 385         while (CPU_OVERLAP(cpus, &map))
 386                 cpu_spinwait();
 387 #if X86
 388         }
 389 #endif
 390
 391         return (1);
 392 }
 393
/*
 * Restart the CPUs in 'map' that were halted by a stop request.  The return
 * value is passed through from generic_restart_cpus().
 */
int
restart_cpus(cpuset_t map)
{

        return (generic_restart_cpus(map, IPI_STOP));
}
 400
#if X86
/*
 * Resume the CPUs in 'map' after a suspend; only built on amd64/i386.  The
 * return value is passed through from generic_restart_cpus().
 */
int
resume_cpus(cpuset_t map)
{

        return (generic_restart_cpus(map, IPI_SUSPEND));
}
#endif
 409 #undef X86
 410
 411 /*
 412  * All-CPU rendezvous.  CPUs are signalled, all execute the setup function
 413  * (if specified), rendezvous, execute the action function (if specified),
 414  * rendezvous again, execute the teardown function (if specified), and then
 415  * resume.
 416  *
 417  * Note that the supplied external functions _must_ be reentrant and aware
 418  * that they are running in parallel and in an unknown lock context.
 419  */
 420 void
 421 smp_rendezvous_action(void)
 422 {
 423         struct thread *td;
 424         void *local_func_arg;
 425         void (*local_setup_func)(void*);
 426         void (*local_action_func)(void*);
 427         void (*local_teardown_func)(void*);
 428 #ifdef INVARIANTS
 429         int owepreempt;
 430 #endif
 431
 432         /* Ensure we have up-to-date values. */
 433         atomic_add_acq_int(&smp_rv_waiters[0], 1);
 434         while (smp_rv_waiters[0] < smp_rv_ncpus)
 435                 cpu_spinwait();
 436
 437         /* Fetch rendezvous parameters after acquire barrier. */
 438         local_func_arg = smp_rv_func_arg;
 439         local_setup_func = smp_rv_setup_func;
 440         local_action_func = smp_rv_action_func;
 441         local_teardown_func = smp_rv_teardown_func;
 442
 443         /*
 444          * Use a nested critical section to prevent any preemptions
 445          * from occurring during a rendezvous action routine.
 446          * Specifically, if a rendezvous handler is invoked via an IPI
 447          * and the interrupted thread was in the critical_exit()
 448          * function after setting td_critnest to 0 but before
 449          * performing a deferred preemption, this routine can be
 450          * invoked with td_critnest set to 0 and td_owepreempt true.
 451          * In that case, a critical_exit() during the rendezvous
 452          * action would trigger a preemption which is not permitted in
 453          * a rendezvous action.  To fix this, wrap all of the
 454          * rendezvous action handlers in a critical section.  We
 455          * cannot use a regular critical section however as having
 456          * critical_exit() preempt from this routine would also be
 457          * problematic (the preemption must not occur before the IPI
 458          * has been acknowledged via an EOI).  Instead, we
 459          * intentionally ignore td_owepreempt when leaving the
 460          * critical section.  This should be harmless because we do
 461          * not permit rendezvous action routines to schedule threads,
 462          * and thus td_owepreempt should never transition from 0 to 1
 463          * during this routine.
 464          */
 465         td = curthread;
 466         td->td_critnest++;
 467 #ifdef INVARIANTS
 468         owepreempt = td->td_owepreempt;
 469 #endif
 470
 471         /*
 472          * If requested, run a setup function before the main action
 473          * function.  Ensure all CPUs have completed the setup
 474          * function before moving on to the action function.
 475          */
 476         if (local_setup_func != smp_no_rendezvous_barrier) {
 477                 if (smp_rv_setup_func != NULL)
 478                         smp_rv_setup_func(smp_rv_func_arg);
 479                 atomic_add_int(&smp_rv_waiters[1], 1);
 480                 while (smp_rv_waiters[1] < smp_rv_ncpus)
 481                         cpu_spinwait();
 482         }
 483
 484         if (local_action_func != NULL)
 485                 local_action_func(local_func_arg);
 486
 487         if (local_teardown_func != smp_no_rendezvous_barrier) {
 488                 /*
 489                  * Signal that the main action has been completed.  If a
 490                  * full exit rendezvous is requested, then all CPUs will
 491                  * wait here until all CPUs have finished the main action.
 492                  */
 493                 atomic_add_int(&smp_rv_waiters[2], 1);
 494                 while (smp_rv_waiters[2] < smp_rv_ncpus)
 495                         cpu_spinwait();
 496
 497                 if (local_teardown_func != NULL)
 498                         local_teardown_func(local_func_arg);
 499         }
 500
 501         /*
 502          * Signal that the rendezvous is fully completed by this CPU.
 503          * This means that no member of smp_rv_* pseudo-structure will be
 504          * accessed by this target CPU after this point; in particular,
 505          * memory pointed by smp_rv_func_arg.
 506          *
 507          * The release semantic ensures that all accesses performed by
 508          * the current CPU are visible when smp_rendezvous_cpus()
 509          * returns, by synchronizing with the
 510          * atomic_load_acq_int(&smp_rv_waiters[3]).
 511          */
 512         atomic_add_rel_int(&smp_rv_waiters[3], 1);
 513
 514         td->td_critnest--;
 515         KASSERT(owepreempt == td->td_owepreempt,
 516             ("rendezvous action changed td_owepreempt"));
 517 }
 518
 519 void
 520 smp_rendezvous_cpus(cpuset_t map,
 521         void (* setup_func)(void *),
 522         void (* action_func)(void *),
 523         void (* teardown_func)(void *),
 524         void *arg)
 525 {
 526         int curcpumap, i, ncpus = 0;
 527
 528         /* Look comments in the !SMP case. */
 529         if (!smp_started) {
 530                 spinlock_enter();
 531                 if (setup_func != NULL)
 532                         setup_func(arg);
 533                 if (action_func != NULL)
 534                         action_func(arg);
 535                 if (teardown_func != NULL)
 536                         teardown_func(arg);
 537                 spinlock_exit();
 538                 return;
 539         }
 540
 541         CPU_FOREACH(i) {
 542                 if (CPU_ISSET(i, &map))
 543                         ncpus++;
 544         }
 545         if (ncpus == 0)
 546                 panic("ncpus is 0 with non-zero map");
 547
 548         mtx_lock_spin(&smp_ipi_mtx);
 549
 550         /* Pass rendezvous parameters via global variables. */
 551         smp_rv_ncpus = ncpus;
 552         smp_rv_setup_func = setup_func;
 553         smp_rv_action_func = action_func;
 554         smp_rv_teardown_func = teardown_func;
 555         smp_rv_func_arg = arg;
 556         smp_rv_waiters[1] = 0;
 557         smp_rv_waiters[2] = 0;
 558         smp_rv_waiters[3] = 0;
 559         atomic_store_rel_int(&smp_rv_waiters[0], 0);
 560
 561         /*
 562          * Signal other processors, which will enter the IPI with
 563          * interrupts off.
 564          */
 565         curcpumap = CPU_ISSET(curcpu, &map);
 566         CPU_CLR(curcpu, &map);
 567         ipi_selected(map, IPI_RENDEZVOUS);
 568
 569         /* Check if the current CPU is in the map */
 570         if (curcpumap != 0)
 571                 smp_rendezvous_action();
 572
 573         /*
 574          * Ensure that the master CPU waits for all the other
 575          * CPUs to finish the rendezvous, so that smp_rv_*
 576          * pseudo-structure and the arg are guaranteed to not
 577          * be in use.
 578          *
 579          * Load acquire synchronizes with the release add in
 580          * smp_rendezvous_action(), which ensures that our caller sees
 581          * all memory actions done by the called functions on other
 582          * CPUs.
 583          */
 584         while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
 585                 cpu_spinwait();
 586
 587         mtx_unlock_spin(&smp_ipi_mtx);
 588 }
 589
/*
 * Convenience wrapper: run a rendezvous on every CPU in all_cpus.  See
 * smp_rendezvous_cpus() for the meaning of the callbacks and 'arg'.
 */
void
smp_rendezvous(void (* setup_func)(void *),
               void (* action_func)(void *),
               void (* teardown_func)(void *),
               void *arg)
{
        smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
}
 598
/*
 * Static pool of topology groups.  group[0] is the root used by
 * smp_topo_none(), smp_topo_1level() and smp_topo_2level(); smp_topo_alloc()
 * hands out consecutive entries from the same array.
 */
static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1];
 600
 601 struct cpu_group *
 602 smp_topo(void)
 603 {
 604         char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
 605         struct cpu_group *top;
 606
 607         /*
 608          * Check for a fake topology request for debugging purposes.
 609          */
 610         switch (smp_topology) {
 611         case 1:
 612                 /* Dual core with no sharing.  */
 613                 top = smp_topo_1level(CG_SHARE_NONE, 2, 0);
 614                 break;
 615         case 2:
 616                 /* No topology, all cpus are equal. */
 617                 top = smp_topo_none();
 618                 break;
 619         case 3:
 620                 /* Dual core with shared L2.  */
 621                 top = smp_topo_1level(CG_SHARE_L2, 2, 0);
 622                 break;
 623         case 4:
 624                 /* quad core, shared l3 among each package, private l2.  */
 625                 top = smp_topo_1level(CG_SHARE_L3, 4, 0);
 626                 break;
 627         case 5:
 628                 /* quad core,  2 dualcore parts on each package share l2.  */
 629                 top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0);
 630                 break;
 631         case 6:
 632                 /* Single-core 2xHTT */
 633                 top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
 634                 break;
 635         case 7:
 636                 /* quad core with a shared l3, 8 threads sharing L2.  */
 637                 top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
 638                     CG_FLAG_SMT);
 639                 break;
 640         default:
 641                 /* Default, ask the system what it wants. */
 642                 top = cpu_topo();
 643                 break;
 644         }
 645         /*
 646          * Verify the returned topology.
 647          */
 648         if (top->cg_count != mp_ncpus)
 649                 panic("Built bad topology at %p.  CPU count %d != %d",
 650                     top, top->cg_count, mp_ncpus);
 651         if (CPU_CMP(&top->cg_mask, &all_cpus))
 652                 panic("Built bad topology at %p.  CPU mask (%s) != (%s)",
 653                     top, cpusetobj_strprint(cpusetbuf, &top->cg_mask),
 654                     cpusetobj_strprint(cpusetbuf2, &all_cpus));
 655
 656         /*
 657          * Collapse nonsense levels that may be created out of convenience by
 658          * the MD layers.  They cause extra work in the search functions.
 659          */
 660         while (top->cg_children == 1) {
 661                 top = &top->cg_child[0];
 662                 top->cg_parent = NULL;
 663         }
 664         return (top);
 665 }
 666
 667 struct cpu_group *
 668 smp_topo_alloc(u_int count)
 669 {
 670         static u_int index;
 671         u_int curr;
 672
 673         curr = index;
 674         index += count;
 675         return (&group[curr]);
 676 }
 677
 678 struct cpu_group *
 679 smp_topo_none(void)
 680 {
 681         struct cpu_group *top;
 682
 683         top = &group[0];
 684         top->cg_parent = NULL;
 685         top->cg_child = NULL;
 686         top->cg_mask = all_cpus;
 687         top->cg_count = mp_ncpus;
 688         top->cg_children = 0;
 689         top->cg_level = CG_SHARE_NONE;
 690         top->cg_flags = 0;
 691
 692         return (top);
 693 }
 694
 695 static int
 696 smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share,
 697     int count, int flags, int start)
 698 {
 699         char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
 700         cpuset_t mask;
 701         int i;
 702
 703         CPU_ZERO(&mask);
 704         for (i = 0; i < count; i++, start++)
 705                 CPU_SET(start, &mask);
 706         child->cg_parent = parent;
 707         child->cg_child = NULL;
 708         child->cg_children = 0;
 709         child->cg_level = share;
 710         child->cg_count = count;
 711         child->cg_flags = flags;
 712         child->cg_mask = mask;
 713         parent->cg_children++;
 714         for (; parent != NULL; parent = parent->cg_parent) {
 715                 if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask))
 716                         panic("Duplicate children in %p.  mask (%s) child (%s)",
 717                             parent,
 718                             cpusetobj_strprint(cpusetbuf, &parent->cg_mask),
 719                             cpusetobj_strprint(cpusetbuf2, &child->cg_mask));
 720                 CPU_OR(&parent->cg_mask, &child->cg_mask);
 721                 parent->cg_count += child->cg_count;
 722         }
 723
 724         return (start);
 725 }
 726
 727 struct cpu_group *
 728 smp_topo_1level(int share, int count, int flags)
 729 {
 730         struct cpu_group *child;
 731         struct cpu_group *top;
 732         int packages;
 733         int cpu;
 734         int i;
 735
 736         cpu = 0;
 737         top = &group[0];
 738         packages = mp_ncpus / count;
 739         top->cg_child = child = &group[1];
 740         top->cg_level = CG_SHARE_NONE;
 741         for (i = 0; i < packages; i++, child++)
 742                 cpu = smp_topo_addleaf(top, child, share, count, flags, cpu);
 743         return (top);
 744 }
 745
 746 struct cpu_group *
 747 smp_topo_2level(int l2share, int l2count, int l1share, int l1count,
 748     int l1flags)
 749 {
 750         struct cpu_group *top;
 751         struct cpu_group *l1g;
 752         struct cpu_group *l2g;
 753         int cpu;
 754         int i;
 755         int j;
 756
 757         cpu = 0;
 758         top = &group[0];
 759         l2g = &group[1];
 760         top->cg_child = l2g;
 761         top->cg_level = CG_SHARE_NONE;
 762         top->cg_children = mp_ncpus / (l2count * l1count);
 763         l1g = l2g + top->cg_children;
 764         for (i = 0; i < top->cg_children; i++, l2g++) {
 765                 l2g->cg_parent = top;
 766                 l2g->cg_child = l1g;
 767                 l2g->cg_level = l2share;
 768                 for (j = 0; j < l2count; j++, l1g++)
 769                         cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count,
 770                             l1flags, cpu);
 771         }
 772         return (top);
 773 }
 774
 775
 776 struct cpu_group *
 777 smp_topo_find(struct cpu_group *top, int cpu)
 778 {
 779         struct cpu_group *cg;
 780         cpuset_t mask;
 781         int children;
 782         int i;
 783
 784         CPU_SETOF(cpu, &mask);
 785         cg = top;
 786         for (;;) {
 787                 if (!CPU_OVERLAP(&cg->cg_mask, &mask))
 788                         return (NULL);
 789                 if (cg->cg_children == 0)
 790                         return (cg);
 791                 children = cg->cg_children;
 792                 for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
 793                         if (CPU_OVERLAP(&cg->cg_mask, &mask))
 794                                 break;
 795         }
 796         return (NULL);
 797 }
 798 #else /* !SMP */
 799
 800 void
 801 smp_rendezvous_cpus(cpuset_t map,
 802         void (*setup_func)(void *),
 803         void (*action_func)(void *),
 804         void (*teardown_func)(void *),
 805         void *arg)
 806 {
 807         /*
 808          * In the !SMP case we just need to ensure the same initial conditions
 809          * as the SMP case.
 810          */
 811         spinlock_enter();
 812         if (setup_func != NULL)
 813                 setup_func(arg);
 814         if (action_func != NULL)
 815                 action_func(arg);
 816         if (teardown_func != NULL)
 817                 teardown_func(arg);
 818         spinlock_exit();
 819 }
 820
/*
 * Uniprocessor counterpart of smp_rendezvous(): forwards to
 * smp_rendezvous_cpus() with all_cpus as the map.
 */
void
smp_rendezvous(void (*setup_func)(void *),
               void (*action_func)(void *),
               void (*teardown_func)(void *),
               void *arg)
{

        smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func,
            arg);
}
 831
 832 /*
 833  * Provide dummy SMP support for UP kernels.  Modules that need to use SMP
 834  * APIs will still work using this dummy support.
 835  */
 836 static void
 837 mp_setvariables_for_up(void *dummy)
 838 {
 839         mp_ncpus = 1;
 840         mp_ncores = 1;
 841         mp_maxid = PCPU_GET(cpuid);
 842         CPU_SETOF(mp_maxid, &all_cpus);
 843         KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero"));
 844 }
 845 SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST,
 846     mp_setvariables_for_up, NULL);
 847 #endif /* SMP */
 848
/*
 * Sentinel callback for the rendezvous API.  smp_rendezvous_action()
 * compares the setup and teardown pointers against this function and, on a
 * match, skips the corresponding barrier instead of calling it.  The only
 * path in this file that actually calls it is the !smp_started fallback in
 * smp_rendezvous_cpus(), which is what the assertion checks.
 */
void
smp_no_rendezvous_barrier(void *dummy)
{
#ifdef SMP
        KASSERT((!smp_started),("smp_no_rendezvous called and smp is started"));
#endif
}
 856
 857 /*
 858  * Wait for specified idle threads to switch once.  This ensures that even
 859  * preempted threads have cycled through the switch function once,
 860  * exiting their codepaths.  This allows us to change global pointers
 861  * with no other synchronization.
 862  */
 863 int
 864 quiesce_cpus(cpuset_t map, const char *wmesg, int prio)
 865 {
 866         struct pcpu *pcpu;
 867         u_int gen[MAXCPU];
 868         int error;
 869         int cpu;
 870
 871         error = 0;
 872         for (cpu = 0; cpu <= mp_maxid; cpu++) {
 873                 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
 874                         continue;
 875                 pcpu = pcpu_find(cpu);
 876                 gen[cpu] = pcpu->pc_idlethread->td_generation;
 877         }
 878         for (cpu = 0; cpu <= mp_maxid; cpu++) {
 879                 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
 880                         continue;
 881                 pcpu = pcpu_find(cpu);
 882                 thread_lock(curthread);
 883                 sched_bind(curthread, cpu);
 884                 thread_unlock(curthread);
 885                 while (gen[cpu] == pcpu->pc_idlethread->td_generation) {
 886                         error = tsleep(quiesce_cpus, prio, wmesg, 1);
 887                         if (error != EWOULDBLOCK)
 888                                 goto out;
 889                         error = 0;
 890                 }
 891         }
 892 out:
 893         thread_lock(curthread);
 894         sched_unbind(curthread);
 895         thread_unlock(curthread);
 896
 897         return (error);
 898 }
 899
 900 int
 901 quiesce_all_cpus(const char *wmesg, int prio)
 902 {
 903
 904         return quiesce_cpus(all_cpus, wmesg, prio);
 905 }
 906
 907 /* Extra care is taken with this sysctl because the data type is volatile */
 908 static int
 909 sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS)
 910 {
 911         int error, active;
 912
 913         active = smp_started;
 914         error = SYSCTL_OUT(req, &active, sizeof(active));
 915         return (error);
 916 }
 917
 918
 919 #ifdef SMP
 920 void
 921 topo_init_node(struct topo_node *node)
 922 {
 923
 924         bzero(node, sizeof(*node));
 925         TAILQ_INIT(&node->children);
 926 }
 927
 928 void
 929 topo_init_root(struct topo_node *root)
 930 {
 931
 932         topo_init_node(root);
 933         root->type = TOPO_TYPE_SYSTEM;
 934 }
 935
 936 /*
 937  * Add a child node with the given ID under the given parent.
 938  * Do nothing if there is already a child with that ID.
 939  */
 940 struct topo_node *
 941 topo_add_node_by_hwid(struct topo_node *parent, int hwid,
 942     topo_node_type type, uintptr_t subtype)
 943 {
 944         struct topo_node *node;
 945
 946         TAILQ_FOREACH_REVERSE(node, &parent->children,
 947             topo_children, siblings) {
 948                 if (node->hwid == hwid
 949                     && node->type == type && node->subtype == subtype) {
 950                         return (node);
 951                 }
 952         }
 953
 954         node = malloc(sizeof(*node), M_TOPO, M_WAITOK);
 955         topo_init_node(node);
 956         node->parent = parent;
 957         node->hwid = hwid;
 958         node->type = type;
 959         node->subtype = subtype;
 960         TAILQ_INSERT_TAIL(&parent->children, node, siblings);
 961         parent->nchildren++;
 962
 963         return (node);
 964 }
 965
 966 /*
 967  * Find a child node with the given ID under the given parent.
 968  */
 969 struct topo_node *
 970 topo_find_node_by_hwid(struct topo_node *parent, int hwid,
 971     topo_node_type type, uintptr_t subtype)
 972 {
 973
 974         struct topo_node *node;
 975
 976         TAILQ_FOREACH(node, &parent->children, siblings) {
 977                 if (node->hwid == hwid
 978                     && node->type == type && node->subtype == subtype) {
 979                         return (node);
 980                 }
 981         }
 982
 983         return (NULL);
 984 }
 985
 986 /*
 987  * Given a node change the order of its parent's child nodes such
 988  * that the node becomes the firt child while preserving the cyclic
 989  * order of the children.  In other words, the given node is promoted
 990  * by rotation.
 991  */
 992 void
 993 topo_promote_child(struct topo_node *child)
 994 {
 995         struct topo_node *next;
 996         struct topo_node *node;
 997         struct topo_node *parent;
 998
 999         parent = child->parent;
1000         next = TAILQ_NEXT(child, siblings);
1001         TAILQ_REMOVE(&parent->children, child, siblings);
1002         TAILQ_INSERT_HEAD(&parent->children, child, siblings);
1003
1004         while (next != NULL) {
1005                 node = next;
1006                 next = TAILQ_NEXT(node, siblings);
1007                 TAILQ_REMOVE(&parent->children, node, siblings);
1008                 TAILQ_INSERT_AFTER(&parent->children, child, node, siblings);
1009                 child = node;
1010         }
1011 }
1012
1013 /*
1014  * Iterate to the next node in the depth-first search (traversal) of
1015  * the topology tree.
1016  */
1017 struct topo_node *
1018 topo_next_node(struct topo_node *top, struct topo_node *node)
1019 {
1020         struct topo_node *next;
1021
1022         if ((next = TAILQ_FIRST(&node->children)) != NULL)
1023                 return (next);
1024
1025         if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1026                 return (next);
1027
1028         while (node != top && (node = node->parent) != top)
1029                 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1030                         return (next);
1031
1032         return (NULL);
1033 }
1034
1035 /*
1036  * Iterate to the next node in the depth-first search of the topology tree,
1037  * but without descending below the current node.
1038  */
1039 struct topo_node *
1040 topo_next_nonchild_node(struct topo_node *top, struct topo_node *node)
1041 {
1042         struct topo_node *next;
1043
1044         if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1045                 return (next);
1046
1047         while (node != top && (node = node->parent) != top)
1048                 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1049                         return (next);
1050
1051         return (NULL);
1052 }
1053
1054 /*
1055  * Assign the given ID to the given topology node that represents a logical
1056  * processor.
1057  */
1058 void
1059 topo_set_pu_id(struct topo_node *node, cpuid_t id)
1060 {
1061
1062         KASSERT(node->type == TOPO_TYPE_PU,
1063             ("topo_set_pu_id: wrong node type: %u", node->type));
1064         KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0,
1065             ("topo_set_pu_id: cpuset already not empty"));
1066         node->id = id;
1067         CPU_SET(id, &node->cpuset);
1068         node->cpu_count = 1;
1069         node->subtype = 1;
1070
1071         while ((node = node->parent) != NULL) {
1072                 KASSERT(!CPU_ISSET(id, &node->cpuset),
1073                     ("logical ID %u is already set in node %p", id, node));
1074                 CPU_SET(id, &node->cpuset);
1075                 node->cpu_count++;
1076         }
1077 }
1078
1079 static struct topology_spec {
1080         topo_node_type  type;
1081         bool            match_subtype;
1082         uintptr_t       subtype;
1083 } topology_level_table[TOPO_LEVEL_COUNT] = {
1084         [TOPO_LEVEL_PKG] = { .type = TOPO_TYPE_PKG, },
1085         [TOPO_LEVEL_GROUP] = { .type = TOPO_TYPE_GROUP, },
1086         [TOPO_LEVEL_CACHEGROUP] = {
1087                 .type = TOPO_TYPE_CACHE,
1088                 .match_subtype = true,
1089                 .subtype = CG_SHARE_L3,
1090         },
1091         [TOPO_LEVEL_CORE] = { .type = TOPO_TYPE_CORE, },
1092         [TOPO_LEVEL_THREAD] = { .type = TOPO_TYPE_PU, },
1093 };
1094
1095 static bool
1096 topo_analyze_table(struct topo_node *root, int all, enum topo_level level,
1097     struct topo_analysis *results)
1098 {
1099         struct topology_spec *spec;
1100         struct topo_node *node;
1101         int count;
1102
1103         if (level >= TOPO_LEVEL_COUNT)
1104                 return (true);
1105
1106         spec = &topology_level_table[level];
1107         count = 0;
1108         node = topo_next_node(root, root);
1109
1110         while (node != NULL) {
1111                 if (node->type != spec->type ||
1112                     (spec->match_subtype && node->subtype != spec->subtype)) {
1113                         node = topo_next_node(root, node);
1114                         continue;
1115                 }
1116                 if (!all && CPU_EMPTY(&node->cpuset)) {
1117                         node = topo_next_nonchild_node(root, node);
1118                         continue;
1119                 }
1120
1121                 count++;
1122
1123                 if (!topo_analyze_table(node, all, level + 1, results))
1124                         return (false);
1125
1126                 node = topo_next_nonchild_node(root, node);
1127         }
1128
1129         /* No explicit subgroups is essentially one subgroup. */
1130         if (count == 0) {
1131                 count = 1;
1132
1133                 if (!topo_analyze_table(root, all, level + 1, results))
1134                         return (false);
1135         }
1136
1137         if (results->entities[level] == -1)
1138                 results->entities[level] = count;
1139         else if (results->entities[level] != count)
1140                 return (false);
1141
1142         return (true);
1143 }
1144
1145 /*
1146  * Check if the topology is uniform, that is, each package has the same number
1147  * of cores in it and each core has the same number of threads (logical
1148  * processors) in it.  If so, calculate the number of packages, the number of
1149  * groups per package, the number of cachegroups per group, and the number of
1150  * logical processors per cachegroup.  'all' parameter tells whether to include
1151  * administratively disabled logical processors into the analysis.
1152  */
1153 int
1154 topo_analyze(struct topo_node *topo_root, int all,
1155     struct topo_analysis *results)
1156 {
1157
1158         results->entities[TOPO_LEVEL_PKG] = -1;
1159         results->entities[TOPO_LEVEL_CORE] = -1;
1160         results->entities[TOPO_LEVEL_THREAD] = -1;
1161         results->entities[TOPO_LEVEL_GROUP] = -1;
1162         results->entities[TOPO_LEVEL_CACHEGROUP] = -1;
1163
1164         if (!topo_analyze_table(topo_root, all, TOPO_LEVEL_PKG, results))
1165                 return (0);
1166
1167         KASSERT(results->entities[TOPO_LEVEL_PKG] > 0,
1168                 ("bug in topology or analysis"));
1169
1170         return (1);
1171 }
1172
1173 #endif /* SMP */
1174