sys/kern/subr_smp.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2001, John Baldwin <jhb@FreeBSD.org>.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  25  * SUCH DAMAGE.
  26  */
  27
  28 /*
  29  * This module holds the global variables and machine independent functions
  30  * used for the kernel SMP support.
  31  */
  32
  33 #include <sys/cdefs.h>
  34 __FBSDID("$FreeBSD$");
  35
  36 #include <sys/param.h>
  37 #include <sys/systm.h>
  38 #include <sys/kernel.h>
  39 #include <sys/ktr.h>
  40 #include <sys/proc.h>
  41 #include <sys/bus.h>
  42 #include <sys/lock.h>
  43 #include <sys/malloc.h>
  44 #include <sys/mutex.h>
  45 #include <sys/pcpu.h>
  46 #include <sys/sched.h>
  47 #include <sys/smp.h>
  48 #include <sys/sysctl.h>
  49
  50 #include <machine/cpu.h>
  51 #include <machine/smp.h>
  52
  53 #include "opt_sched.h"
  54
#ifdef SMP
/* Malloc type for SMP topology data. */
MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data");

/*
 * CPU sets used by the stop/restart handshake in this file:
 * generic_stop_cpus() spins until every requested CPU is set in
 * stopped_cpus (suspended_cpus for IPI_SUSPEND), and
 * generic_restart_cpus() publishes the CPUs to release in started_cpus and
 * spins until none of them is left in stopped_cpus.
 */
volatile cpuset_t stopped_cpus;
volatile cpuset_t started_cpus;
volatile cpuset_t suspended_cpus;
/*
 * NOTE(review): the two masks below are not referenced in the part of the
 * file reviewed here; presumably they are maintained by MD code -- confirm.
 */
cpuset_t hlt_cpus_mask;
cpuset_t logical_cpus_mask;

/*
 * NOTE(review): hook pointer that is neither set nor called in the part of
 * the file reviewed here; presumably run by CPUs when they are restarted --
 * confirm against the MD stop handlers.
 */
void (*cpustop_restartfunc)(void);
#endif
  66
/* Handler for the kern.smp.active sysctl registered below. */
static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS);

/* This is used in modules that need to work in both SMP and UP. */
cpuset_t all_cpus;

/*
 * CPU count.  Set to 1 by mp_start() when SMP is disabled or no MP
 * hardware is found, and by mp_setvariables_for_up() on UP kernels.
 */
int mp_ncpus;
/* export this for libkvm consumers. */
int mp_maxcpus = MAXCPU;

/* Tested throughout this file before any cross-CPU (IPI) work is attempted. */
volatile int smp_started;
/* Highest CPU ID; exported read-only as kern.smp.maxid. */
u_int mp_maxid;

/* Parent node for all of the kern.smp.* sysctls declared in this file. */
static SYSCTL_NODE(_kern, OID_AUTO, smp,
    CTLFLAG_RD | CTLFLAG_CAPRD | CTLFLAG_MPSAFE, NULL,
    "Kernel SMP");

SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0,
    "Max CPU ID.");

SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus,
    0, "Max number of CPUs that the system was compiled for.");

SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD|CTLTYPE_INT|CTLFLAG_MPSAFE,
    NULL, 0, sysctl_kern_smp_active, "I",
    "Indicates system is running in SMP mode");

int smp_disabled = 0;   /* has smp been disabled? */
SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
    &smp_disabled, 0, "SMP has been disabled from the loader");

int smp_cpus = 1;       /* how many cpu's running */
SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0,
    "Number of CPUs online");

int smp_threads_per_core = 1;   /* how many SMT threads are running per core */
SYSCTL_INT(_kern_smp, OID_AUTO, threads_per_core, CTLFLAG_RD|CTLFLAG_CAPRD,
    &smp_threads_per_core, 0, "Number of SMT threads online per core");

/*
 * Starts out as -1 ("not set"); mp_start() replaces a still-negative value
 * with mp_ncpus once the CPUs have been started.
 */
int mp_ncores = -1;     /* how many physical cores running */
SYSCTL_INT(_kern_smp, OID_AUTO, cores, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_ncores, 0,
    "Number of physical cores online");

/* Values 1-7 select one of the canned debugging layouts in smp_topo(). */
int smp_topology = 0;   /* Which topology we're using. */
SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0,
    "Topology override setting; 0 is default provided by hardware.");
 112
 113 #ifdef SMP
/* Enable forwarding of a signal to a process running on a different CPU */
static int forward_signal_enabled = 1;
SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
           &forward_signal_enabled, 0,
           "Forwarding of a signal to a process on a different CPU");

/*
 * Variables needed for SMP rendezvous.  smp_rendezvous_cpus() fills them in
 * while holding smp_ipi_mtx, and smp_rendezvous_action() reads them on every
 * participating CPU.
 */
static volatile int smp_rv_ncpus;       /* number of participating CPUs */
static void (*volatile smp_rv_setup_func)(void *arg);
static void (*volatile smp_rv_action_func)(void *arg);
static void (*volatile smp_rv_teardown_func)(void *arg);
static void *volatile smp_rv_func_arg;  /* argument handed to all three */
/*
 * Arrival counters, one per rendezvous stage: [0] entry, [1] setup done,
 * [2] action done, [3] this CPU has completely finished.
 */
static volatile int smp_rv_waiters[4];

/*
 * Shared mutex to restrict busywaits between smp_rendezvous() and
 * smp(_targeted)_tlb_shootdown().  A deadlock occurs if both of these
 * functions trigger at once and cause multiple CPUs to busywait with
 * interrupts disabled.
 */
struct mtx smp_ipi_mtx;
 135
 136 /*
 137  * Let the MD SMP code initialize mp_maxid very early if it can.
 138  */
 139 static void
 140 mp_setmaxid(void *dummy)
 141 {
 142
 143         cpu_mp_setmaxid();
 144
 145         KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__));
 146         KASSERT(mp_ncpus > 1 || mp_maxid == 0,
 147             ("%s: one CPU but mp_maxid is not zero", __func__));
 148         KASSERT(mp_maxid >= mp_ncpus - 1,
 149             ("%s: counters out of sync: max %d, count %d", __func__,
 150                 mp_maxid, mp_ncpus));
 151
 152         cpusetsizemin = howmany(mp_maxid + 1, NBBY);
 153 }
 154 SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL);
 155
 156 /*
 157  * Call the MD SMP initialization code.
 158  */
 159 static void
 160 mp_start(void *dummy)
 161 {
 162
 163         mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);
 164
 165         /* Probe for MP hardware. */
 166         if (smp_disabled != 0 || cpu_mp_probe() == 0) {
 167                 mp_ncores = 1;
 168                 mp_ncpus = 1;
 169                 CPU_SETOF(PCPU_GET(cpuid), &all_cpus);
 170                 return;
 171         }
 172
 173         cpu_mp_start();
 174         printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
 175             mp_ncpus);
 176
 177         /* Provide a default for most architectures that don't have SMT/HTT. */
 178         if (mp_ncores < 0)
 179                 mp_ncores = mp_ncpus;
 180
 181         cpu_mp_announce();
 182 }
 183 SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL);
 184
 185 void
 186 forward_signal(struct thread *td)
 187 {
 188         int id;
 189
 190         /*
 191          * signotify() has already set TDA_AST and TDA_SIG on td_ast for
 192          * this thread, so all we need to do is poke it if it is currently
 193          * executing so that it executes ast().
 194          */
 195         THREAD_LOCK_ASSERT(td, MA_OWNED);
 196         KASSERT(TD_IS_RUNNING(td),
 197             ("forward_signal: thread is not TDS_RUNNING"));
 198
 199         CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);
 200
 201         if (!smp_started || cold || KERNEL_PANICKED())
 202                 return;
 203         if (!forward_signal_enabled)
 204                 return;
 205
 206         /* No need to IPI ourself. */
 207         if (td == curthread)
 208                 return;
 209
 210         id = td->td_oncpu;
 211         if (id == NOCPU)
 212                 return;
 213         ipi_cpu(id, IPI_AST);
 214 }
 215
/*
 * When called the executing CPU will send an IPI to all other CPUs
 *  requesting that they halt execution.
 *
 * Usually (but not necessarily) called with 'other_cpus' as its arg.
 *
 *  - Signals all CPUs in map to stop.
 *  - Waits for each to stop.
 *
 * 'type' selects the IPI to send: IPI_STOP, IPI_STOP_HARD or, on x86 only,
 * IPI_SUSPEND.
 *
 * Returns:
 *  -1: error (documented, but never returned by this implementation)
 *   0: NA (SMP has not been started)
 *   1: ok (also returned when the wait below times out)
 *
 */
/* X86 is 1 on amd64/i386 and 0 elsewhere; it is #undef'd after resume_cpus(). */
#if defined(__amd64__) || defined(__i386__)
#define X86     1
#else
#define X86     0
#endif
static int
generic_stop_cpus(cpuset_t map, u_int type)
{
#ifdef KTR
        char cpusetbuf[CPUSETBUFSIZ];
#endif
        /* ID of the CPU currently issuing a stop request, NOCPU when idle. */
        static volatile u_int stopping_cpu = NOCPU;
        int i;
        volatile cpuset_t *cpus;

        KASSERT(
            type == IPI_STOP || type == IPI_STOP_HARD
#if X86
            || type == IPI_SUSPEND
#endif
            , ("%s: invalid stop type", __func__));

        if (!smp_started)
                return (0);

        CTR2(KTR_SMP, "stop_cpus(%s) with %u type",
            cpusetobj_strprint(cpusetbuf, &map), type);

#if X86
        /*
         * When suspending, ensure there are no IPIs in progress.
         * IPIs that have been issued, but not yet delivered (e.g.
         * not pending on a vCPU when running under virtualization)
         * will be lost, violating FreeBSD's assumption of reliable
         * IPI delivery.
         */
        if (type == IPI_SUSPEND)
                mtx_lock_spin(&smp_ipi_mtx);
#endif

        /*
         * On x86 the claim-and-send step below is skipped when
         * nmi_is_broadcast is set and nmi_kdb_lock is non-zero.
         * NOTE(review): presumably the CPUs have then already been stopped
         * by a broadcast NMI -- confirm against the MD code.
         */
#if X86
        if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
#endif
        /*
         * Claim stopping_cpu for this CPU unless it already holds it: retry
         * the compare-and-set, spinning while another CPU owns the slot.
         */
        if (stopping_cpu != PCPU_GET(cpuid))
                while (atomic_cmpset_int(&stopping_cpu, NOCPU,
                    PCPU_GET(cpuid)) == 0)
                        while (stopping_cpu != NOCPU)
                                cpu_spinwait(); /* spin */

        /* send the stop IPI to all CPUs in map */
        ipi_selected(map, type);
#if X86
        }
#endif

        /* Pick the set in which the targets are awaited. */
#if X86
        if (type == IPI_SUSPEND)
                cpus = &suspended_cpus;
        else
#endif
                cpus = &stopped_cpus;

        /*
         * Wait until every CPU in map is present in the chosen set, giving
         * up with a console message after 100000000 spin iterations.
         */
        i = 0;
        while (!CPU_SUBSET(cpus, &map)) {
                /* spin */
                cpu_spinwait();
                i++;
                if (i == 100000000) {
                        printf("timeout stopping cpus\n");
                        break;
                }
        }

#if X86
        if (type == IPI_SUSPEND)
                mtx_unlock_spin(&smp_ipi_mtx);
#endif

        /* Release the slot so that another CPU may issue a stop request. */
        stopping_cpu = NOCPU;
        return (1);
}
 312
 313 int
 314 stop_cpus(cpuset_t map)
 315 {
 316
 317         return (generic_stop_cpus(map, IPI_STOP));
 318 }
 319
 320 int
 321 stop_cpus_hard(cpuset_t map)
 322 {
 323
 324         return (generic_stop_cpus(map, IPI_STOP_HARD));
 325 }
 326
 327 #if X86
 328 int
 329 suspend_cpus(cpuset_t map)
 330 {
 331
 332         return (generic_stop_cpus(map, IPI_SUSPEND));
 333 }
 334 #endif
 335
/*
 * Called by a CPU to restart stopped CPUs.
 *
 * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
 *
 *  - Signals all CPUs in map to restart.
 *  - Waits for each to restart.
 *
 * 'type' names the request that stopped the CPUs: IPI_STOP, IPI_STOP_HARD
 * or, on x86 only, IPI_SUSPEND.
 *
 * Returns:
 *  -1: error (documented, but never returned by this implementation)
 *   0: NA (SMP has not been started)
 *   1: ok
 */
static int
generic_restart_cpus(cpuset_t map, u_int type)
{
#ifdef KTR
        char cpusetbuf[CPUSETBUFSIZ];
#endif
        volatile cpuset_t *cpus;

#if X86
        KASSERT(type == IPI_STOP || type == IPI_STOP_HARD
            || type == IPI_SUSPEND, ("%s: invalid stop type", __func__));

        if (!smp_started)
                return (0);

        CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));

        /* Set whose bits are awaited to clear at the end. */
        if (type == IPI_SUSPEND)
                cpus = &resuming_cpus;
        else
                cpus = &stopped_cpus;

        /* signal other cpus to restart */
        if (type == IPI_SUSPEND)
                CPU_COPY_STORE_REL(&map, &toresume_cpus);
        else
                CPU_COPY_STORE_REL(&map, &started_cpus);

        /*
         * Wake up any CPUs stopped with MWAIT.  From MI code we can't tell if
         * MONITOR/MWAIT is enabled, but the potentially redundant writes are
         * relatively inexpensive.
         */
        if (type == IPI_STOP) {
                struct monitorbuf *mb;
                u_int id;

                CPU_FOREACH(id) {
                        if (!CPU_ISSET(id, &map))
                                continue;

                        mb = &pcpu_find(id)->pc_monitorbuf;
                        atomic_store_int(&mb->stop_state,
                            MONITOR_STOPSTATE_RUNNING);
                }
        }

        /*
         * The wait is skipped when nmi_is_broadcast is set and nmi_kdb_lock
         * is non-zero.  NOTE(review): presumably the NMI path handles the
         * restart handshake itself in that case -- confirm.
         */
        if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
                /* wait for each to clear its bit */
                while (CPU_OVERLAP(cpus, &map))
                        cpu_spinwait();
        }
#else /* !X86 */
        KASSERT(type == IPI_STOP || type == IPI_STOP_HARD,
            ("%s: invalid stop type", __func__));

        if (!smp_started)
                return (0);

        CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));

        cpus = &stopped_cpus;

        /* signal other cpus to restart */
        CPU_COPY_STORE_REL(&map, &started_cpus);

        /* wait for each to clear its bit */
        while (CPU_OVERLAP(cpus, &map))
                cpu_spinwait();
#endif
        return (1);
}
 421
 422 int
 423 restart_cpus(cpuset_t map)
 424 {
 425
 426         return (generic_restart_cpus(map, IPI_STOP));
 427 }
 428
 429 #if X86
 430 int
 431 resume_cpus(cpuset_t map)
 432 {
 433
 434         return (generic_restart_cpus(map, IPI_SUSPEND));
 435 }
 436 #endif
 437 #undef X86
 438
/*
 * All-CPU rendezvous.  CPUs are signalled, all execute the setup function
 * (if specified), rendezvous, execute the action function (if specified),
 * rendezvous again, execute the teardown function (if specified), and then
 * resume.
 *
 * Note that the supplied external functions _must_ be reentrant and aware
 * that they are running in parallel and in an unknown lock context.
 *
 * This is the per-CPU half: it is run by every participating CPU (the
 * initiator calls it directly from smp_rendezvous_cpus()) and consumes the
 * parameters published in the smp_rv_* variables.  Passing
 * smp_no_rendezvous_barrier as the setup or teardown function skips the
 * corresponding barrier instead of calling anything.
 */
void
smp_rendezvous_action(void)
{
        struct thread *td;
        void *local_func_arg;
        void (*local_setup_func)(void*);
        void (*local_action_func)(void*);
        void (*local_teardown_func)(void*);
#ifdef INVARIANTS
        int owepreempt;
#endif

        /*
         * Ensure we have up-to-date values.  This is the entry barrier:
         * announce arrival, then wait for all participants.
         */
        atomic_add_acq_int(&smp_rv_waiters[0], 1);
        while (smp_rv_waiters[0] < smp_rv_ncpus)
                cpu_spinwait();

        /* Fetch rendezvous parameters after acquire barrier. */
        local_func_arg = smp_rv_func_arg;
        local_setup_func = smp_rv_setup_func;
        local_action_func = smp_rv_action_func;
        local_teardown_func = smp_rv_teardown_func;

        /*
         * Use a nested critical section to prevent any preemptions
         * from occurring during a rendezvous action routine.
         * Specifically, if a rendezvous handler is invoked via an IPI
         * and the interrupted thread was in the critical_exit()
         * function after setting td_critnest to 0 but before
         * performing a deferred preemption, this routine can be
         * invoked with td_critnest set to 0 and td_owepreempt true.
         * In that case, a critical_exit() during the rendezvous
         * action would trigger a preemption which is not permitted in
         * a rendezvous action.  To fix this, wrap all of the
         * rendezvous action handlers in a critical section.  We
         * cannot use a regular critical section however as having
         * critical_exit() preempt from this routine would also be
         * problematic (the preemption must not occur before the IPI
         * has been acknowledged via an EOI).  Instead, we
         * intentionally ignore td_owepreempt when leaving the
         * critical section.  This should be harmless because we do
         * not permit rendezvous action routines to schedule threads,
         * and thus td_owepreempt should never transition from 0 to 1
         * during this routine.
         */
        td = curthread;
        td->td_critnest++;
#ifdef INVARIANTS
        /* Remembered so the KASSERT at the end can detect a change. */
        owepreempt = td->td_owepreempt;
#endif

        /*
         * If requested, run a setup function before the main action
         * function.  Ensure all CPUs have completed the setup
         * function before moving on to the action function.
         */
        if (local_setup_func != smp_no_rendezvous_barrier) {
                if (local_setup_func != NULL)
                        local_setup_func(local_func_arg);
                atomic_add_int(&smp_rv_waiters[1], 1);
                while (smp_rv_waiters[1] < smp_rv_ncpus)
                        cpu_spinwait();
        }

        if (local_action_func != NULL)
                local_action_func(local_func_arg);

        if (local_teardown_func != smp_no_rendezvous_barrier) {
                /*
                 * Signal that the main action has been completed.  If a
                 * full exit rendezvous is requested, then all CPUs will
                 * wait here until all CPUs have finished the main action.
                 */
                atomic_add_int(&smp_rv_waiters[2], 1);
                while (smp_rv_waiters[2] < smp_rv_ncpus)
                        cpu_spinwait();

                if (local_teardown_func != NULL)
                        local_teardown_func(local_func_arg);
        }

        /*
         * Signal that the rendezvous is fully completed by this CPU.
         * This means that no member of smp_rv_* pseudo-structure will be
         * accessed by this target CPU after this point; in particular,
         * memory pointed by smp_rv_func_arg.
         *
         * The release semantic ensures that all accesses performed by
         * the current CPU are visible when smp_rendezvous_cpus()
         * returns, by synchronizing with the
         * atomic_load_acq_int(&smp_rv_waiters[3]).
         */
        atomic_add_rel_int(&smp_rv_waiters[3], 1);

        /* Leave the hand-rolled critical section without preempting. */
        td->td_critnest--;
        KASSERT(owepreempt == td->td_owepreempt,
            ("rendezvous action changed td_owepreempt"));
}
 546
/*
 * Run a rendezvous on the CPUs in 'map'.
 *
 * setup_func, action_func and teardown_func are each called with 'arg' on
 * every participating CPU; any of them may be NULL.  Before SMP is started
 * the three functions are simply run in order on the calling CPU inside a
 * spinlock section.  Otherwise the parameters are published through the
 * smp_rv_* globals under smp_ipi_mtx, the other CPUs in 'map' are sent
 * IPI_RENDEZVOUS, and the caller does not return until every participant
 * has reported completion.
 *
 * Panics if 'map' contains none of the CPUs visited by CPU_FOREACH().
 */
void
smp_rendezvous_cpus(cpuset_t map,
        void (* setup_func)(void *),
        void (* action_func)(void *),
        void (* teardown_func)(void *),
        void *arg)
{
        int curcpumap, i, ncpus = 0;

        /* See comments in the !SMP case. */
        if (!smp_started) {
                spinlock_enter();
                if (setup_func != NULL)
                        setup_func(arg);
                if (action_func != NULL)
                        action_func(arg);
                if (teardown_func != NULL)
                        teardown_func(arg);
                spinlock_exit();
                return;
        }

        /*
         * Make sure we come here with interrupts enabled.  Otherwise we
         * livelock if smp_ipi_mtx is owned by a thread which sent us an IPI.
         */
        MPASS(curthread->td_md.md_spinlock_count == 0);

        /* Count the participants; every barrier waits for this many CPUs. */
        CPU_FOREACH(i) {
                if (CPU_ISSET(i, &map))
                        ncpus++;
        }
        if (ncpus == 0)
                panic("ncpus is 0 with non-zero map");

        mtx_lock_spin(&smp_ipi_mtx);

        /*
         * Pass rendezvous parameters via global variables.  The release
         * store to smp_rv_waiters[0] comes last; it pairs with the acquire
         * add at the top of smp_rendezvous_action().
         */
        smp_rv_ncpus = ncpus;
        smp_rv_setup_func = setup_func;
        smp_rv_action_func = action_func;
        smp_rv_teardown_func = teardown_func;
        smp_rv_func_arg = arg;
        smp_rv_waiters[1] = 0;
        smp_rv_waiters[2] = 0;
        smp_rv_waiters[3] = 0;
        atomic_store_rel_int(&smp_rv_waiters[0], 0);

        /*
         * Signal other processors, which will enter the IPI with
         * interrupts off.  The current CPU is removed from the map first
         * so that it does not send an IPI to itself.
         */
        curcpumap = CPU_ISSET(curcpu, &map);
        CPU_CLR(curcpu, &map);
        ipi_selected(map, IPI_RENDEZVOUS);

        /* Check if the current CPU is in the map */
        if (curcpumap != 0)
                smp_rendezvous_action();

        /*
         * Ensure that the master CPU waits for all the other
         * CPUs to finish the rendezvous, so that smp_rv_*
         * pseudo-structure and the arg are guaranteed to not
         * be in use.
         *
         * Load acquire synchronizes with the release add in
         * smp_rendezvous_action(), which ensures that our caller sees
         * all memory actions done by the called functions on other
         * CPUs.
         */
        while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
                cpu_spinwait();

        mtx_unlock_spin(&smp_ipi_mtx);
}
 623
 624 void
 625 smp_rendezvous(void (* setup_func)(void *),
 626                void (* action_func)(void *),
 627                void (* teardown_func)(void *),
 628                void *arg)
 629 {
 630         smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
 631 }
 632
/*
 * Static backing store for the CPU topology tree.  smp_topo_alloc() hands
 * out slices of it, and the canned layouts (smp_topo_none(),
 * smp_topo_1level(), smp_topo_2level()) index it directly with group[0] as
 * the root.
 */
static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1];
 634
 635 static void
 636 smp_topo_fill(struct cpu_group *cg)
 637 {
 638         int c;
 639
 640         for (c = 0; c < cg->cg_children; c++)
 641                 smp_topo_fill(&cg->cg_child[c]);
 642         cg->cg_first = CPU_FFS(&cg->cg_mask) - 1;
 643         cg->cg_last = CPU_FLS(&cg->cg_mask) - 1;
 644 }
 645
 646 struct cpu_group *
 647 smp_topo(void)
 648 {
 649         char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
 650         struct cpu_group *top;
 651
 652         /*
 653          * Check for a fake topology request for debugging purposes.
 654          */
 655         switch (smp_topology) {
 656         case 1:
 657                 /* Dual core with no sharing.  */
 658                 top = smp_topo_1level(CG_SHARE_NONE, 2, 0);
 659                 break;
 660         case 2:
 661                 /* No topology, all cpus are equal. */
 662                 top = smp_topo_none();
 663                 break;
 664         case 3:
 665                 /* Dual core with shared L2.  */
 666                 top = smp_topo_1level(CG_SHARE_L2, 2, 0);
 667                 break;
 668         case 4:
 669                 /* quad core, shared l3 among each package, private l2.  */
 670                 top = smp_topo_1level(CG_SHARE_L3, 4, 0);
 671                 break;
 672         case 5:
 673                 /* quad core,  2 dualcore parts on each package share l2.  */
 674                 top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0);
 675                 break;
 676         case 6:
 677                 /* Single-core 2xHTT */
 678                 top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
 679                 break;
 680         case 7:
 681                 /* quad core with a shared l3, 8 threads sharing L2.  */
 682                 top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
 683                     CG_FLAG_SMT);
 684                 break;
 685         default:
 686                 /* Default, ask the system what it wants. */
 687                 top = cpu_topo();
 688                 break;
 689         }
 690         /*
 691          * Verify the returned topology.
 692          */
 693         if (top->cg_count != mp_ncpus)
 694                 panic("Built bad topology at %p.  CPU count %d != %d",
 695                     top, top->cg_count, mp_ncpus);
 696         if (CPU_CMP(&top->cg_mask, &all_cpus))
 697                 panic("Built bad topology at %p.  CPU mask (%s) != (%s)",
 698                     top, cpusetobj_strprint(cpusetbuf, &top->cg_mask),
 699                     cpusetobj_strprint(cpusetbuf2, &all_cpus));
 700
 701         /*
 702          * Collapse nonsense levels that may be created out of convenience by
 703          * the MD layers.  They cause extra work in the search functions.
 704          */
 705         while (top->cg_children == 1) {
 706                 top = &top->cg_child[0];
 707                 top->cg_parent = NULL;
 708         }
 709         smp_topo_fill(top);
 710         return (top);
 711 }
 712
 713 struct cpu_group *
 714 smp_topo_alloc(u_int count)
 715 {
 716         static u_int index;
 717         u_int curr;
 718
 719         curr = index;
 720         index += count;
 721         return (&group[curr]);
 722 }
 723
 724 struct cpu_group *
 725 smp_topo_none(void)
 726 {
 727         struct cpu_group *top;
 728
 729         top = &group[0];
 730         top->cg_parent = NULL;
 731         top->cg_child = NULL;
 732         top->cg_mask = all_cpus;
 733         top->cg_count = mp_ncpus;
 734         top->cg_children = 0;
 735         top->cg_level = CG_SHARE_NONE;
 736         top->cg_flags = 0;
 737
 738         return (top);
 739 }
 740
 741 static int
 742 smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share,
 743     int count, int flags, int start)
 744 {
 745         char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
 746         cpuset_t mask;
 747         int i;
 748
 749         CPU_ZERO(&mask);
 750         for (i = 0; i < count; i++, start++)
 751                 CPU_SET(start, &mask);
 752         child->cg_parent = parent;
 753         child->cg_child = NULL;
 754         child->cg_children = 0;
 755         child->cg_level = share;
 756         child->cg_count = count;
 757         child->cg_flags = flags;
 758         child->cg_mask = mask;
 759         parent->cg_children++;
 760         for (; parent != NULL; parent = parent->cg_parent) {
 761                 if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask))
 762                         panic("Duplicate children in %p.  mask (%s) child (%s)",
 763                             parent,
 764                             cpusetobj_strprint(cpusetbuf, &parent->cg_mask),
 765                             cpusetobj_strprint(cpusetbuf2, &child->cg_mask));
 766                 CPU_OR(&parent->cg_mask, &parent->cg_mask, &child->cg_mask);
 767                 parent->cg_count += child->cg_count;
 768         }
 769
 770         return (start);
 771 }
 772
 773 struct cpu_group *
 774 smp_topo_1level(int share, int count, int flags)
 775 {
 776         struct cpu_group *child;
 777         struct cpu_group *top;
 778         int packages;
 779         int cpu;
 780         int i;
 781
 782         cpu = 0;
 783         top = &group[0];
 784         packages = mp_ncpus / count;
 785         top->cg_child = child = &group[1];
 786         top->cg_level = CG_SHARE_NONE;
 787         for (i = 0; i < packages; i++, child++)
 788                 cpu = smp_topo_addleaf(top, child, share, count, flags, cpu);
 789         return (top);
 790 }
 791
 792 struct cpu_group *
 793 smp_topo_2level(int l2share, int l2count, int l1share, int l1count,
 794     int l1flags)
 795 {
 796         struct cpu_group *top;
 797         struct cpu_group *l1g;
 798         struct cpu_group *l2g;
 799         int cpu;
 800         int i;
 801         int j;
 802
 803         cpu = 0;
 804         top = &group[0];
 805         l2g = &group[1];
 806         top->cg_child = l2g;
 807         top->cg_level = CG_SHARE_NONE;
 808         top->cg_children = mp_ncpus / (l2count * l1count);
 809         l1g = l2g + top->cg_children;
 810         for (i = 0; i < top->cg_children; i++, l2g++) {
 811                 l2g->cg_parent = top;
 812                 l2g->cg_child = l1g;
 813                 l2g->cg_level = l2share;
 814                 for (j = 0; j < l2count; j++, l1g++)
 815                         cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count,
 816                             l1flags, cpu);
 817         }
 818         return (top);
 819 }
 820
 821 struct cpu_group *
 822 smp_topo_find(struct cpu_group *top, int cpu)
 823 {
 824         struct cpu_group *cg;
 825         cpuset_t mask;
 826         int children;
 827         int i;
 828
 829         CPU_SETOF(cpu, &mask);
 830         cg = top;
 831         for (;;) {
 832                 if (!CPU_OVERLAP(&cg->cg_mask, &mask))
 833                         return (NULL);
 834                 if (cg->cg_children == 0)
 835                         return (cg);
 836                 children = cg->cg_children;
 837                 for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
 838                         if (CPU_OVERLAP(&cg->cg_mask, &mask))
 839                                 break;
 840         }
 841         return (NULL);
 842 }
 843 #else /* !SMP */
 844
 845 void
 846 smp_rendezvous_cpus(cpuset_t map,
 847         void (*setup_func)(void *),
 848         void (*action_func)(void *),
 849         void (*teardown_func)(void *),
 850         void *arg)
 851 {
 852         /*
 853          * In the !SMP case we just need to ensure the same initial conditions
 854          * as the SMP case.
 855          */
 856         spinlock_enter();
 857         if (setup_func != NULL)
 858                 setup_func(arg);
 859         if (action_func != NULL)
 860                 action_func(arg);
 861         if (teardown_func != NULL)
 862                 teardown_func(arg);
 863         spinlock_exit();
 864 }
 865
 866 void
 867 smp_rendezvous(void (*setup_func)(void *),
 868                void (*action_func)(void *),
 869                void (*teardown_func)(void *),
 870                void *arg)
 871 {
 872
 873         smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func,
 874             arg);
 875 }
 876
 877 /*
 878  * Provide dummy SMP support for UP kernels.  Modules that need to use SMP
 879  * APIs will still work using this dummy support.
 880  */
 881 static void
 882 mp_setvariables_for_up(void *dummy)
 883 {
 884         mp_ncpus = 1;
 885         mp_ncores = 1;
 886         mp_maxid = PCPU_GET(cpuid);
 887         CPU_SETOF(mp_maxid, &all_cpus);
 888         KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero"));
 889 }
 890 SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST,
 891     mp_setvariables_for_up, NULL);
 892 #endif /* SMP */
 893
 894 void
 895 smp_no_rendezvous_barrier(void *dummy)
 896 {
 897 #ifdef SMP
 898         KASSERT((!smp_started),("smp_no_rendezvous called and smp is started"));
 899 #endif
 900 }
 901
 902 void
 903 smp_rendezvous_cpus_retry(cpuset_t map,
 904         void (* setup_func)(void *),
 905         void (* action_func)(void *),
 906         void (* teardown_func)(void *),
 907         void (* wait_func)(void *, int),
 908         struct smp_rendezvous_cpus_retry_arg *arg)
 909 {
 910         int cpu;
 911
 912         CPU_COPY(&map, &arg->cpus);
 913
 914         /*
 915          * Only one CPU to execute on.
 916          */
 917         if (!smp_started) {
 918                 spinlock_enter();
 919                 if (setup_func != NULL)
 920                         setup_func(arg);
 921                 if (action_func != NULL)
 922                         action_func(arg);
 923                 if (teardown_func != NULL)
 924                         teardown_func(arg);
 925                 spinlock_exit();
 926                 return;
 927         }
 928
 929         /*
 930          * Execute an action on all specified CPUs while retrying until they
 931          * all acknowledge completion.
 932          */
 933         for (;;) {
 934                 smp_rendezvous_cpus(
 935                     arg->cpus,
 936                     setup_func,
 937                     action_func,
 938                     teardown_func,
 939                     arg);
 940
 941                 if (CPU_EMPTY(&arg->cpus))
 942                         break;
 943
 944                 CPU_FOREACH(cpu) {
 945                         if (!CPU_ISSET(cpu, &arg->cpus))
 946                                 continue;
 947                         wait_func(arg, cpu);
 948                 }
 949         }
 950 }
 951
 952 void
 953 smp_rendezvous_cpus_done(struct smp_rendezvous_cpus_retry_arg *arg)
 954 {
 955
 956         CPU_CLR_ATOMIC(curcpu, &arg->cpus);
 957 }
 958
 959 /*
 960  * If (prio & PDROP) == 0:
 961  * Wait for specified idle threads to switch once.  This ensures that even
 962  * preempted threads have cycled through the switch function once,
 963  * exiting their codepaths.  This allows us to change global pointers
 964  * with no other synchronization.
 965  * If (prio & PDROP) != 0:
 966  * Force the specified CPUs to switch context at least once.
 967  */
 968 int
 969 quiesce_cpus(cpuset_t map, const char *wmesg, int prio)
 970 {
 971         struct pcpu *pcpu;
 972         u_int *gen;
 973         int error;
 974         int cpu;
 975
 976         error = 0;
 977         if ((prio & PDROP) == 0) {
 978                 gen = malloc(sizeof(u_int) * MAXCPU, M_TEMP, M_WAITOK);
 979                 for (cpu = 0; cpu <= mp_maxid; cpu++) {
 980                         if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
 981                                 continue;
 982                         pcpu = pcpu_find(cpu);
 983                         gen[cpu] = pcpu->pc_idlethread->td_generation;
 984                 }
 985         }
 986         for (cpu = 0; cpu <= mp_maxid; cpu++) {
 987                 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
 988                         continue;
 989                 pcpu = pcpu_find(cpu);
 990                 thread_lock(curthread);
 991                 sched_bind(curthread, cpu);
 992                 thread_unlock(curthread);
 993                 if ((prio & PDROP) != 0)
 994                         continue;
 995                 while (gen[cpu] == pcpu->pc_idlethread->td_generation) {
 996                         error = tsleep(quiesce_cpus, prio & ~PDROP, wmesg, 1);
 997                         if (error != EWOULDBLOCK)
 998                                 goto out;
 999                         error = 0;
1000                 }
1001         }
1002 out:
1003         thread_lock(curthread);
1004         sched_unbind(curthread);
1005         thread_unlock(curthread);
1006         if ((prio & PDROP) == 0)
1007                 free(gen, M_TEMP);
1008
1009         return (error);
1010 }
1011
1012 int
1013 quiesce_all_cpus(const char *wmesg, int prio)
1014 {
1015
1016         return quiesce_cpus(all_cpus, wmesg, prio);
1017 }
1018
1019 /*
1020  * Observe all CPUs not executing in critical section.
1021  * We are not in one so the check for us is safe. If the found
1022  * thread changes to something else we know the section was
1023  * exited as well.
1024  */
1025 void
1026 quiesce_all_critical(void)
1027 {
1028         struct thread *td, *newtd;
1029         struct pcpu *pcpu;
1030         int cpu;
1031
1032         MPASS(curthread->td_critnest == 0);
1033
1034         CPU_FOREACH(cpu) {
1035                 pcpu = cpuid_to_pcpu[cpu];
1036                 td = pcpu->pc_curthread;
1037                 for (;;) {
1038                         if (td->td_critnest == 0)
1039                                 break;
1040                         cpu_spinwait();
1041                         newtd = (struct thread *)
1042                             atomic_load_acq_ptr((void *)pcpu->pc_curthread);
1043                         if (td != newtd)
1044                                 break;
1045                 }
1046         }
1047 }
1048
1049 static void
1050 cpus_fence_seq_cst_issue(void *arg __unused)
1051 {
1052
1053         atomic_thread_fence_seq_cst();
1054 }
1055
1056 /*
1057  * Send an IPI forcing a sequentially consistent fence.
1058  *
1059  * Allows replacement of an explicitly fence with a compiler barrier.
1060  * Trades speed up during normal execution for a significant slowdown when
1061  * the barrier is needed.
1062  */
1063 void
1064 cpus_fence_seq_cst(void)
1065 {
1066
1067 #ifdef SMP
1068         smp_rendezvous(
1069             smp_no_rendezvous_barrier,
1070             cpus_fence_seq_cst_issue,
1071             smp_no_rendezvous_barrier,
1072             NULL
1073         );
1074 #else
1075         cpus_fence_seq_cst_issue(NULL);
1076 #endif
1077 }
1078
1079 /* Extra care is taken with this sysctl because the data type is volatile */
1080 static int
1081 sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS)
1082 {
1083         int error, active;
1084
1085         active = smp_started;
1086         error = SYSCTL_OUT(req, &active, sizeof(active));
1087         return (error);
1088 }
1089
1090 #ifdef SMP
1091 void
1092 topo_init_node(struct topo_node *node)
1093 {
1094
1095         bzero(node, sizeof(*node));
1096         TAILQ_INIT(&node->children);
1097 }
1098
1099 void
1100 topo_init_root(struct topo_node *root)
1101 {
1102
1103         topo_init_node(root);
1104         root->type = TOPO_TYPE_SYSTEM;
1105 }
1106
1107 /*
1108  * Add a child node with the given ID under the given parent.
1109  * Do nothing if there is already a child with that ID.
1110  */
1111 struct topo_node *
1112 topo_add_node_by_hwid(struct topo_node *parent, int hwid,
1113     topo_node_type type, uintptr_t subtype)
1114 {
1115         struct topo_node *node;
1116
1117         TAILQ_FOREACH_REVERSE(node, &parent->children,
1118             topo_children, siblings) {
1119                 if (node->hwid == hwid
1120                     && node->type == type && node->subtype == subtype) {
1121                         return (node);
1122                 }
1123         }
1124
1125         node = malloc(sizeof(*node), M_TOPO, M_WAITOK);
1126         topo_init_node(node);
1127         node->parent = parent;
1128         node->hwid = hwid;
1129         node->type = type;
1130         node->subtype = subtype;
1131         TAILQ_INSERT_TAIL(&parent->children, node, siblings);
1132         parent->nchildren++;
1133
1134         return (node);
1135 }
1136
1137 /*
1138  * Find a child node with the given ID under the given parent.
1139  */
1140 struct topo_node *
1141 topo_find_node_by_hwid(struct topo_node *parent, int hwid,
1142     topo_node_type type, uintptr_t subtype)
1143 {
1144
1145         struct topo_node *node;
1146
1147         TAILQ_FOREACH(node, &parent->children, siblings) {
1148                 if (node->hwid == hwid
1149                     && node->type == type && node->subtype == subtype) {
1150                         return (node);
1151                 }
1152         }
1153
1154         return (NULL);
1155 }
1156
1157 /*
1158  * Given a node change the order of its parent's child nodes such
1159  * that the node becomes the firt child while preserving the cyclic
1160  * order of the children.  In other words, the given node is promoted
1161  * by rotation.
1162  */
1163 void
1164 topo_promote_child(struct topo_node *child)
1165 {
1166         struct topo_node *next;
1167         struct topo_node *node;
1168         struct topo_node *parent;
1169
1170         parent = child->parent;
1171         next = TAILQ_NEXT(child, siblings);
1172         TAILQ_REMOVE(&parent->children, child, siblings);
1173         TAILQ_INSERT_HEAD(&parent->children, child, siblings);
1174
1175         while (next != NULL) {
1176                 node = next;
1177                 next = TAILQ_NEXT(node, siblings);
1178                 TAILQ_REMOVE(&parent->children, node, siblings);
1179                 TAILQ_INSERT_AFTER(&parent->children, child, node, siblings);
1180                 child = node;
1181         }
1182 }
1183
1184 /*
1185  * Iterate to the next node in the depth-first search (traversal) of
1186  * the topology tree.
1187  */
1188 struct topo_node *
1189 topo_next_node(struct topo_node *top, struct topo_node *node)
1190 {
1191         struct topo_node *next;
1192
1193         if ((next = TAILQ_FIRST(&node->children)) != NULL)
1194                 return (next);
1195
1196         if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1197                 return (next);
1198
1199         while (node != top && (node = node->parent) != top)
1200                 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1201                         return (next);
1202
1203         return (NULL);
1204 }
1205
1206 /*
1207  * Iterate to the next node in the depth-first search of the topology tree,
1208  * but without descending below the current node.
1209  */
1210 struct topo_node *
1211 topo_next_nonchild_node(struct topo_node *top, struct topo_node *node)
1212 {
1213         struct topo_node *next;
1214
1215         if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1216                 return (next);
1217
1218         while (node != top && (node = node->parent) != top)
1219                 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1220                         return (next);
1221
1222         return (NULL);
1223 }
1224
1225 /*
1226  * Assign the given ID to the given topology node that represents a logical
1227  * processor.
1228  */
1229 void
1230 topo_set_pu_id(struct topo_node *node, cpuid_t id)
1231 {
1232
1233         KASSERT(node->type == TOPO_TYPE_PU,
1234             ("topo_set_pu_id: wrong node type: %u", node->type));
1235         KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0,
1236             ("topo_set_pu_id: cpuset already not empty"));
1237         node->id = id;
1238         CPU_SET(id, &node->cpuset);
1239         node->cpu_count = 1;
1240         node->subtype = 1;
1241
1242         while ((node = node->parent) != NULL) {
1243                 KASSERT(!CPU_ISSET(id, &node->cpuset),
1244                     ("logical ID %u is already set in node %p", id, node));
1245                 CPU_SET(id, &node->cpuset);
1246                 node->cpu_count++;
1247         }
1248 }
1249
1250 static struct topology_spec {
1251         topo_node_type  type;
1252         bool            match_subtype;
1253         uintptr_t       subtype;
1254 } topology_level_table[TOPO_LEVEL_COUNT] = {
1255         [TOPO_LEVEL_PKG] = { .type = TOPO_TYPE_PKG, },
1256         [TOPO_LEVEL_GROUP] = { .type = TOPO_TYPE_GROUP, },
1257         [TOPO_LEVEL_CACHEGROUP] = {
1258                 .type = TOPO_TYPE_CACHE,
1259                 .match_subtype = true,
1260                 .subtype = CG_SHARE_L3,
1261         },
1262         [TOPO_LEVEL_CORE] = { .type = TOPO_TYPE_CORE, },
1263         [TOPO_LEVEL_THREAD] = { .type = TOPO_TYPE_PU, },
1264 };
1265
1266 static bool
1267 topo_analyze_table(struct topo_node *root, int all, enum topo_level level,
1268     struct topo_analysis *results)
1269 {
1270         struct topology_spec *spec;
1271         struct topo_node *node;
1272         int count;
1273
1274         if (level >= TOPO_LEVEL_COUNT)
1275                 return (true);
1276
1277         spec = &topology_level_table[level];
1278         count = 0;
1279         node = topo_next_node(root, root);
1280
1281         while (node != NULL) {
1282                 if (node->type != spec->type ||
1283                     (spec->match_subtype && node->subtype != spec->subtype)) {
1284                         node = topo_next_node(root, node);
1285                         continue;
1286                 }
1287                 if (!all && CPU_EMPTY(&node->cpuset)) {
1288                         node = topo_next_nonchild_node(root, node);
1289                         continue;
1290                 }
1291
1292                 count++;
1293
1294                 if (!topo_analyze_table(node, all, level + 1, results))
1295                         return (false);
1296
1297                 node = topo_next_nonchild_node(root, node);
1298         }
1299
1300         /* No explicit subgroups is essentially one subgroup. */
1301         if (count == 0) {
1302                 count = 1;
1303
1304                 if (!topo_analyze_table(root, all, level + 1, results))
1305                         return (false);
1306         }
1307
1308         if (results->entities[level] == -1)
1309                 results->entities[level] = count;
1310         else if (results->entities[level] != count)
1311                 return (false);
1312
1313         return (true);
1314 }
1315
1316 /*
1317  * Check if the topology is uniform, that is, each package has the same number
1318  * of cores in it and each core has the same number of threads (logical
1319  * processors) in it.  If so, calculate the number of packages, the number of
1320  * groups per package, the number of cachegroups per group, and the number of
1321  * logical processors per cachegroup.  'all' parameter tells whether to include
1322  * administratively disabled logical processors into the analysis.
1323  */
1324 int
1325 topo_analyze(struct topo_node *topo_root, int all,
1326     struct topo_analysis *results)
1327 {
1328
1329         results->entities[TOPO_LEVEL_PKG] = -1;
1330         results->entities[TOPO_LEVEL_CORE] = -1;
1331         results->entities[TOPO_LEVEL_THREAD] = -1;
1332         results->entities[TOPO_LEVEL_GROUP] = -1;
1333         results->entities[TOPO_LEVEL_CACHEGROUP] = -1;
1334
1335         if (!topo_analyze_table(topo_root, all, TOPO_LEVEL_PKG, results))
1336                 return (0);
1337
1338         KASSERT(results->entities[TOPO_LEVEL_PKG] > 0,
1339                 ("bug in topology or analysis"));
1340
1341         return (1);
1342 }
1343
1344 #endif /* SMP */