sys/kern/subr_smp.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2001, John Baldwin <jhb@FreeBSD.org>.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  */
  28
  29 /*
  30  * This module holds the global variables and machine independent functions
  31  * used for the kernel SMP support.
  32  */
  33
  34 #include <sys/cdefs.h>
  35 __FBSDID("$FreeBSD$");
  36
  37 #include <sys/param.h>
  38 #include <sys/systm.h>
  39 #include <sys/kernel.h>
  40 #include <sys/ktr.h>
  41 #include <sys/proc.h>
  42 #include <sys/bus.h>
  43 #include <sys/lock.h>
  44 #include <sys/malloc.h>
  45 #include <sys/mutex.h>
  46 #include <sys/pcpu.h>
  47 #include <sys/sched.h>
  48 #include <sys/smp.h>
  49 #include <sys/sysctl.h>
  50
  51 #include <machine/cpu.h>
  52 #include <machine/smp.h>
  53
  54 #include "opt_sched.h"
  55
  56 #ifdef SMP
  57 MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data");
  58
  59 volatile cpuset_t stopped_cpus;
  60 volatile cpuset_t started_cpus;
  61 volatile cpuset_t suspended_cpus;
  62 cpuset_t hlt_cpus_mask;
  63 cpuset_t logical_cpus_mask;
  64
  65 void (*cpustop_restartfunc)(void);
  66 #endif
  67
  68 static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS);
  69
  70 /* This is used in modules that need to work in both SMP and UP. */
  71 cpuset_t all_cpus;
  72
  73 int mp_ncpus;
  74 /* export this for libkvm consumers. */
  75 int mp_maxcpus = MAXCPU;
  76
  77 volatile int smp_started;
  78 u_int mp_maxid;
  79
  80 static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL,
  81     "Kernel SMP");
  82
  83 SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0,
  84     "Max CPU ID.");
  85
  86 SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus,
  87     0, "Max number of CPUs that the system was compiled for.");
  88
  89 SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD|CTLTYPE_INT|CTLFLAG_MPSAFE,
  90     NULL, 0, sysctl_kern_smp_active, "I",
  91     "Indicates system is running in SMP mode");
  92
  93 int smp_disabled = 0;   /* has smp been disabled? */
  94 SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
  95     &smp_disabled, 0, "SMP has been disabled from the loader");
  96
  97 int smp_cpus = 1;       /* how many cpu's running */
  98 SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0,
  99     "Number of CPUs online");
 100
 101 int smp_topology = 0;   /* Which topology we're using. */
 102 SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0,
 103     "Topology override setting; 0 is default provided by hardware.");
 104
 105 #ifdef SMP
 106 /* Enable forwarding of a signal to a process running on a different CPU */
 107 static int forward_signal_enabled = 1;
 108 SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
 109            &forward_signal_enabled, 0,
 110            "Forwarding of a signal to a process on a different CPU");
 111
 112 /* Variables needed for SMP rendezvous. */
 113 static volatile int smp_rv_ncpus;
 114 static void (*volatile smp_rv_setup_func)(void *arg);
 115 static void (*volatile smp_rv_action_func)(void *arg);
 116 static void (*volatile smp_rv_teardown_func)(void *arg);
 117 static void *volatile smp_rv_func_arg;
 118 static volatile int smp_rv_waiters[4];
 119
 120 /*
 121  * Shared mutex to restrict busywaits between smp_rendezvous() and
 122  * smp(_targeted)_tlb_shootdown().  A deadlock occurs if both of these
 123  * functions trigger at once and cause multiple CPUs to busywait with
 124  * interrupts disabled.
 125  */
 126 struct mtx smp_ipi_mtx;
 127
 128 /*
 129  * Let the MD SMP code initialize mp_maxid very early if it can.
 130  */
 131 static void
 132 mp_setmaxid(void *dummy)
 133 {
 134
 135         cpu_mp_setmaxid();
 136
 137         KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__));
 138         KASSERT(mp_ncpus > 1 || mp_maxid == 0,
 139             ("%s: one CPU but mp_maxid is not zero", __func__));
 140         KASSERT(mp_maxid >= mp_ncpus - 1,
 141             ("%s: counters out of sync: max %d, count %d", __func__,
 142                 mp_maxid, mp_ncpus));
 143 }
 144 SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL);
 145
 146 /*
 147  * Call the MD SMP initialization code.
 148  */
 149 static void
 150 mp_start(void *dummy)
 151 {
 152
 153         mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);
 154
 155         /* Probe for MP hardware. */
 156         if (smp_disabled != 0 || cpu_mp_probe() == 0) {
 157                 mp_ncpus = 1;
 158                 CPU_SETOF(PCPU_GET(cpuid), &all_cpus);
 159                 return;
 160         }
 161
 162         cpu_mp_start();
 163         printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
 164             mp_ncpus);
 165         cpu_mp_announce();
 166 }
 167 SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL);
 168
 169 void
 170 forward_signal(struct thread *td)
 171 {
 172         int id;
 173
 174         /*
 175          * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on
 176          * this thread, so all we need to do is poke it if it is currently
 177          * executing so that it executes ast().
 178          */
 179         THREAD_LOCK_ASSERT(td, MA_OWNED);
 180         KASSERT(TD_IS_RUNNING(td),
 181             ("forward_signal: thread is not TDS_RUNNING"));
 182
 183         CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);
 184
 185         if (!smp_started || cold || panicstr)
 186                 return;
 187         if (!forward_signal_enabled)
 188                 return;
 189
 190         /* No need to IPI ourself. */
 191         if (td == curthread)
 192                 return;
 193
 194         id = td->td_oncpu;
 195         if (id == NOCPU)
 196                 return;
 197         ipi_cpu(id, IPI_AST);
 198 }
 199
 200 /*
 201  * When called the executing CPU will send an IPI to all other CPUs
 202  *  requesting that they halt execution.
 203  *
 204  * Usually (but not necessarily) called with 'other_cpus' as its arg.
 205  *
 206  *  - Signals all CPUs in map to stop.
 207  *  - Waits for each to stop.
 208  *
 209  * Returns:
 210  *  -1: error
 211  *   0: NA
 212  *   1: ok
 213  *
 214  */
 215 #if defined(__amd64__) || defined(__i386__)
 216 #define X86     1
 217 #else
 218 #define X86     0
 219 #endif
 220 static int
 221 generic_stop_cpus(cpuset_t map, u_int type)
 222 {
 223 #ifdef KTR
 224         char cpusetbuf[CPUSETBUFSIZ];
 225 #endif
 226         static volatile u_int stopping_cpu = NOCPU;
 227         int i;
 228         volatile cpuset_t *cpus;
 229
 230         KASSERT(
 231             type == IPI_STOP || type == IPI_STOP_HARD
 232 #if X86
 233             || type == IPI_SUSPEND
 234 #endif
 235             , ("%s: invalid stop type", __func__));
 236
 237         if (!smp_started)
 238                 return (0);
 239
 240         CTR2(KTR_SMP, "stop_cpus(%s) with %u type",
 241             cpusetobj_strprint(cpusetbuf, &map), type);
 242
 243 #if X86
 244         /*
 245          * When suspending, ensure there are are no IPIs in progress.
 246          * IPIs that have been issued, but not yet delivered (e.g.
 247          * not pending on a vCPU when running under virtualization)
 248          * will be lost, violating FreeBSD's assumption of reliable
 249          * IPI delivery.
 250          */
 251         if (type == IPI_SUSPEND)
 252                 mtx_lock_spin(&smp_ipi_mtx);
 253 #endif
 254
 255 #if X86
 256         if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
 257 #endif
 258         if (stopping_cpu != PCPU_GET(cpuid))
 259                 while (atomic_cmpset_int(&stopping_cpu, NOCPU,
 260                     PCPU_GET(cpuid)) == 0)
 261                         while (stopping_cpu != NOCPU)
 262                                 cpu_spinwait(); /* spin */
 263
 264         /* send the stop IPI to all CPUs in map */
 265         ipi_selected(map, type);
 266 #if X86
 267         }
 268 #endif
 269
 270 #if X86
 271         if (type == IPI_SUSPEND)
 272                 cpus = &suspended_cpus;
 273         else
 274 #endif
 275                 cpus = &stopped_cpus;
 276
 277         i = 0;
 278         while (!CPU_SUBSET(cpus, &map)) {
 279                 /* spin */
 280                 cpu_spinwait();
 281                 i++;
 282                 if (i == 100000000) {
 283                         printf("timeout stopping cpus\n");
 284                         break;
 285                 }
 286         }
 287
 288 #if X86
 289         if (type == IPI_SUSPEND)
 290                 mtx_unlock_spin(&smp_ipi_mtx);
 291 #endif
 292
 293         stopping_cpu = NOCPU;
 294         return (1);
 295 }
 296
 297 int
 298 stop_cpus(cpuset_t map)
 299 {
 300
 301         return (generic_stop_cpus(map, IPI_STOP));
 302 }
 303
 304 int
 305 stop_cpus_hard(cpuset_t map)
 306 {
 307
 308         return (generic_stop_cpus(map, IPI_STOP_HARD));
 309 }
 310
 311 #if X86
 312 int
 313 suspend_cpus(cpuset_t map)
 314 {
 315
 316         return (generic_stop_cpus(map, IPI_SUSPEND));
 317 }
 318 #endif
 319
 320 /*
 321  * Called by a CPU to restart stopped CPUs.
 322  *
 323  * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
 324  *
 325  *  - Signals all CPUs in map to restart.
 326  *  - Waits for each to restart.
 327  *
 328  * Returns:
 329  *  -1: error
 330  *   0: NA
 331  *   1: ok
 332  */
 333 static int
 334 generic_restart_cpus(cpuset_t map, u_int type)
 335 {
 336 #ifdef KTR
 337         char cpusetbuf[CPUSETBUFSIZ];
 338 #endif
 339         volatile cpuset_t *cpus;
 340
 341         KASSERT(type == IPI_STOP || type == IPI_STOP_HARD
 342 #if X86
 343             || type == IPI_SUSPEND
 344 #endif
 345             , ("%s: invalid stop type", __func__));
 346
 347         if (!smp_started)
 348                 return (0);
 349
 350         CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));
 351
 352 #if X86
 353         if (type == IPI_SUSPEND)
 354                 cpus = &suspended_cpus;
 355         else
 356 #endif
 357                 cpus = &stopped_cpus;
 358
 359         /* signal other cpus to restart */
 360         CPU_COPY_STORE_REL(&map, &started_cpus);
 361
 362 #if X86
 363         if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
 364 #endif
 365         /* wait for each to clear its bit */
 366         while (CPU_OVERLAP(cpus, &map))
 367                 cpu_spinwait();
 368 #if X86
 369         }
 370 #endif
 371
 372         return (1);
 373 }
 374
 375 int
 376 restart_cpus(cpuset_t map)
 377 {
 378
 379         return (generic_restart_cpus(map, IPI_STOP));
 380 }
 381
 382 #if X86
 383 int
 384 resume_cpus(cpuset_t map)
 385 {
 386
 387         return (generic_restart_cpus(map, IPI_SUSPEND));
 388 }
 389 #endif
 390 #undef X86
 391
 392 /*
 393  * All-CPU rendezvous.  CPUs are signalled, all execute the setup function
 394  * (if specified), rendezvous, execute the action function (if specified),
 395  * rendezvous again, execute the teardown function (if specified), and then
 396  * resume.
 397  *
 398  * Note that the supplied external functions _must_ be reentrant and aware
 399  * that they are running in parallel and in an unknown lock context.
 400  */
 401 void
 402 smp_rendezvous_action(void)
 403 {
 404         struct thread *td;
 405         void *local_func_arg;
 406         void (*local_setup_func)(void*);
 407         void (*local_action_func)(void*);
 408         void (*local_teardown_func)(void*);
 409 #ifdef INVARIANTS
 410         int owepreempt;
 411 #endif
 412
 413         /* Ensure we have up-to-date values. */
 414         atomic_add_acq_int(&smp_rv_waiters[0], 1);
 415         while (smp_rv_waiters[0] < smp_rv_ncpus)
 416                 cpu_spinwait();
 417
 418         /* Fetch rendezvous parameters after acquire barrier. */
 419         local_func_arg = smp_rv_func_arg;
 420         local_setup_func = smp_rv_setup_func;
 421         local_action_func = smp_rv_action_func;
 422         local_teardown_func = smp_rv_teardown_func;
 423
 424         /*
 425          * Use a nested critical section to prevent any preemptions
 426          * from occurring during a rendezvous action routine.
 427          * Specifically, if a rendezvous handler is invoked via an IPI
 428          * and the interrupted thread was in the critical_exit()
 429          * function after setting td_critnest to 0 but before
 430          * performing a deferred preemption, this routine can be
 431          * invoked with td_critnest set to 0 and td_owepreempt true.
 432          * In that case, a critical_exit() during the rendezvous
 433          * action would trigger a preemption which is not permitted in
 434          * a rendezvous action.  To fix this, wrap all of the
 435          * rendezvous action handlers in a critical section.  We
 436          * cannot use a regular critical section however as having
 437          * critical_exit() preempt from this routine would also be
 438          * problematic (the preemption must not occur before the IPI
 439          * has been acknowledged via an EOI).  Instead, we
 440          * intentionally ignore td_owepreempt when leaving the
 441          * critical section.  This should be harmless because we do
 442          * not permit rendezvous action routines to schedule threads,
 443          * and thus td_owepreempt should never transition from 0 to 1
 444          * during this routine.
 445          */
 446         td = curthread;
 447         td->td_critnest++;
 448 #ifdef INVARIANTS
 449         owepreempt = td->td_owepreempt;
 450 #endif
 451
 452         /*
 453          * If requested, run a setup function before the main action
 454          * function.  Ensure all CPUs have completed the setup
 455          * function before moving on to the action function.
 456          */
 457         if (local_setup_func != smp_no_rendezvous_barrier) {
 458                 if (smp_rv_setup_func != NULL)
 459                         smp_rv_setup_func(smp_rv_func_arg);
 460                 atomic_add_int(&smp_rv_waiters[1], 1);
 461                 while (smp_rv_waiters[1] < smp_rv_ncpus)
 462                         cpu_spinwait();
 463         }
 464
 465         if (local_action_func != NULL)
 466                 local_action_func(local_func_arg);
 467
 468         if (local_teardown_func != smp_no_rendezvous_barrier) {
 469                 /*
 470                  * Signal that the main action has been completed.  If a
 471                  * full exit rendezvous is requested, then all CPUs will
 472                  * wait here until all CPUs have finished the main action.
 473                  */
 474                 atomic_add_int(&smp_rv_waiters[2], 1);
 475                 while (smp_rv_waiters[2] < smp_rv_ncpus)
 476                         cpu_spinwait();
 477
 478                 if (local_teardown_func != NULL)
 479                         local_teardown_func(local_func_arg);
 480         }
 481
 482         /*
 483          * Signal that the rendezvous is fully completed by this CPU.
 484          * This means that no member of smp_rv_* pseudo-structure will be
 485          * accessed by this target CPU after this point; in particular,
 486          * memory pointed by smp_rv_func_arg.
 487          *
 488          * The release semantic ensures that all accesses performed by
 489          * the current CPU are visible when smp_rendezvous_cpus()
 490          * returns, by synchronizing with the
 491          * atomic_load_acq_int(&smp_rv_waiters[3]).
 492          */
 493         atomic_add_rel_int(&smp_rv_waiters[3], 1);
 494
 495         td->td_critnest--;
 496         KASSERT(owepreempt == td->td_owepreempt,
 497             ("rendezvous action changed td_owepreempt"));
 498 }
 499
 500 void
 501 smp_rendezvous_cpus(cpuset_t map,
 502         void (* setup_func)(void *),
 503         void (* action_func)(void *),
 504         void (* teardown_func)(void *),
 505         void *arg)
 506 {
 507         int curcpumap, i, ncpus = 0;
 508
 509         /* Look comments in the !SMP case. */
 510         if (!smp_started) {
 511                 spinlock_enter();
 512                 if (setup_func != NULL)
 513                         setup_func(arg);
 514                 if (action_func != NULL)
 515                         action_func(arg);
 516                 if (teardown_func != NULL)
 517                         teardown_func(arg);
 518                 spinlock_exit();
 519                 return;
 520         }
 521
 522         CPU_FOREACH(i) {
 523                 if (CPU_ISSET(i, &map))
 524                         ncpus++;
 525         }
 526         if (ncpus == 0)
 527                 panic("ncpus is 0 with non-zero map");
 528
 529         mtx_lock_spin(&smp_ipi_mtx);
 530
 531         /* Pass rendezvous parameters via global variables. */
 532         smp_rv_ncpus = ncpus;
 533         smp_rv_setup_func = setup_func;
 534         smp_rv_action_func = action_func;
 535         smp_rv_teardown_func = teardown_func;
 536         smp_rv_func_arg = arg;
 537         smp_rv_waiters[1] = 0;
 538         smp_rv_waiters[2] = 0;
 539         smp_rv_waiters[3] = 0;
 540         atomic_store_rel_int(&smp_rv_waiters[0], 0);
 541
 542         /*
 543          * Signal other processors, which will enter the IPI with
 544          * interrupts off.
 545          */
 546         curcpumap = CPU_ISSET(curcpu, &map);
 547         CPU_CLR(curcpu, &map);
 548         ipi_selected(map, IPI_RENDEZVOUS);
 549
 550         /* Check if the current CPU is in the map */
 551         if (curcpumap != 0)
 552                 smp_rendezvous_action();
 553
 554         /*
 555          * Ensure that the master CPU waits for all the other
 556          * CPUs to finish the rendezvous, so that smp_rv_*
 557          * pseudo-structure and the arg are guaranteed to not
 558          * be in use.
 559          *
 560          * Load acquire synchronizes with the release add in
 561          * smp_rendezvous_action(), which ensures that our caller sees
 562          * all memory actions done by the called functions on other
 563          * CPUs.
 564          */
 565         while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
 566                 cpu_spinwait();
 567
 568         mtx_unlock_spin(&smp_ipi_mtx);
 569 }
 570
 571 void
 572 smp_rendezvous(void (* setup_func)(void *),
 573                void (* action_func)(void *),
 574                void (* teardown_func)(void *),
 575                void *arg)
 576 {
 577         smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
 578 }
 579
 580 static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1];
 581
 582 struct cpu_group *
 583 smp_topo(void)
 584 {
 585         char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
 586         struct cpu_group *top;
 587
 588         /*
 589          * Check for a fake topology request for debugging purposes.
 590          */
 591         switch (smp_topology) {
 592         case 1:
 593                 /* Dual core with no sharing.  */
 594                 top = smp_topo_1level(CG_SHARE_NONE, 2, 0);
 595                 break;
 596         case 2:
 597                 /* No topology, all cpus are equal. */
 598                 top = smp_topo_none();
 599                 break;
 600         case 3:
 601                 /* Dual core with shared L2.  */
 602                 top = smp_topo_1level(CG_SHARE_L2, 2, 0);
 603                 break;
 604         case 4:
 605                 /* quad core, shared l3 among each package, private l2.  */
 606                 top = smp_topo_1level(CG_SHARE_L3, 4, 0);
 607                 break;
 608         case 5:
 609                 /* quad core,  2 dualcore parts on each package share l2.  */
 610                 top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0);
 611                 break;
 612         case 6:
 613                 /* Single-core 2xHTT */
 614                 top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
 615                 break;
 616         case 7:
 617                 /* quad core with a shared l3, 8 threads sharing L2.  */
 618                 top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
 619                     CG_FLAG_SMT);
 620                 break;
 621         default:
 622                 /* Default, ask the system what it wants. */
 623                 top = cpu_topo();
 624                 break;
 625         }
 626         /*
 627          * Verify the returned topology.
 628          */
 629         if (top->cg_count != mp_ncpus)
 630                 panic("Built bad topology at %p.  CPU count %d != %d",
 631                     top, top->cg_count, mp_ncpus);
 632         if (CPU_CMP(&top->cg_mask, &all_cpus))
 633                 panic("Built bad topology at %p.  CPU mask (%s) != (%s)",
 634                     top, cpusetobj_strprint(cpusetbuf, &top->cg_mask),
 635                     cpusetobj_strprint(cpusetbuf2, &all_cpus));
 636
 637         /*
 638          * Collapse nonsense levels that may be created out of convenience by
 639          * the MD layers.  They cause extra work in the search functions.
 640          */
 641         while (top->cg_children == 1) {
 642                 top = &top->cg_child[0];
 643                 top->cg_parent = NULL;
 644         }
 645         return (top);
 646 }
 647
 648 struct cpu_group *
 649 smp_topo_alloc(u_int count)
 650 {
 651         static u_int index;
 652         u_int curr;
 653
 654         curr = index;
 655         index += count;
 656         return (&group[curr]);
 657 }
 658
 659 struct cpu_group *
 660 smp_topo_none(void)
 661 {
 662         struct cpu_group *top;
 663
 664         top = &group[0];
 665         top->cg_parent = NULL;
 666         top->cg_child = NULL;
 667         top->cg_mask = all_cpus;
 668         top->cg_count = mp_ncpus;
 669         top->cg_children = 0;
 670         top->cg_level = CG_SHARE_NONE;
 671         top->cg_flags = 0;
 672
 673         return (top);
 674 }
 675
 676 static int
 677 smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share,
 678     int count, int flags, int start)
 679 {
 680         char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
 681         cpuset_t mask;
 682         int i;
 683
 684         CPU_ZERO(&mask);
 685         for (i = 0; i < count; i++, start++)
 686                 CPU_SET(start, &mask);
 687         child->cg_parent = parent;
 688         child->cg_child = NULL;
 689         child->cg_children = 0;
 690         child->cg_level = share;
 691         child->cg_count = count;
 692         child->cg_flags = flags;
 693         child->cg_mask = mask;
 694         parent->cg_children++;
 695         for (; parent != NULL; parent = parent->cg_parent) {
 696                 if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask))
 697                         panic("Duplicate children in %p.  mask (%s) child (%s)",
 698                             parent,
 699                             cpusetobj_strprint(cpusetbuf, &parent->cg_mask),
 700                             cpusetobj_strprint(cpusetbuf2, &child->cg_mask));
 701                 CPU_OR(&parent->cg_mask, &child->cg_mask);
 702                 parent->cg_count += child->cg_count;
 703         }
 704
 705         return (start);
 706 }
 707
 708 struct cpu_group *
 709 smp_topo_1level(int share, int count, int flags)
 710 {
 711         struct cpu_group *child;
 712         struct cpu_group *top;
 713         int packages;
 714         int cpu;
 715         int i;
 716
 717         cpu = 0;
 718         top = &group[0];
 719         packages = mp_ncpus / count;
 720         top->cg_child = child = &group[1];
 721         top->cg_level = CG_SHARE_NONE;
 722         for (i = 0; i < packages; i++, child++)
 723                 cpu = smp_topo_addleaf(top, child, share, count, flags, cpu);
 724         return (top);
 725 }
 726
 727 struct cpu_group *
 728 smp_topo_2level(int l2share, int l2count, int l1share, int l1count,
 729     int l1flags)
 730 {
 731         struct cpu_group *top;
 732         struct cpu_group *l1g;
 733         struct cpu_group *l2g;
 734         int cpu;
 735         int i;
 736         int j;
 737
 738         cpu = 0;
 739         top = &group[0];
 740         l2g = &group[1];
 741         top->cg_child = l2g;
 742         top->cg_level = CG_SHARE_NONE;
 743         top->cg_children = mp_ncpus / (l2count * l1count);
 744         l1g = l2g + top->cg_children;
 745         for (i = 0; i < top->cg_children; i++, l2g++) {
 746                 l2g->cg_parent = top;
 747                 l2g->cg_child = l1g;
 748                 l2g->cg_level = l2share;
 749                 for (j = 0; j < l2count; j++, l1g++)
 750                         cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count,
 751                             l1flags, cpu);
 752         }
 753         return (top);
 754 }
 755
 756
 757 struct cpu_group *
 758 smp_topo_find(struct cpu_group *top, int cpu)
 759 {
 760         struct cpu_group *cg;
 761         cpuset_t mask;
 762         int children;
 763         int i;
 764
 765         CPU_SETOF(cpu, &mask);
 766         cg = top;
 767         for (;;) {
 768                 if (!CPU_OVERLAP(&cg->cg_mask, &mask))
 769                         return (NULL);
 770                 if (cg->cg_children == 0)
 771                         return (cg);
 772                 children = cg->cg_children;
 773                 for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
 774                         if (CPU_OVERLAP(&cg->cg_mask, &mask))
 775                                 break;
 776         }
 777         return (NULL);
 778 }
 779 #else /* !SMP */
 780
 781 void
 782 smp_rendezvous_cpus(cpuset_t map,
 783         void (*setup_func)(void *),
 784         void (*action_func)(void *),
 785         void (*teardown_func)(void *),
 786         void *arg)
 787 {
 788         /*
 789          * In the !SMP case we just need to ensure the same initial conditions
 790          * as the SMP case.
 791          */
 792         spinlock_enter();
 793         if (setup_func != NULL)
 794                 setup_func(arg);
 795         if (action_func != NULL)
 796                 action_func(arg);
 797         if (teardown_func != NULL)
 798                 teardown_func(arg);
 799         spinlock_exit();
 800 }
 801
 802 void
 803 smp_rendezvous(void (*setup_func)(void *),
 804                void (*action_func)(void *),
 805                void (*teardown_func)(void *),
 806                void *arg)
 807 {
 808
 809         smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func,
 810             arg);
 811 }
 812
 813 /*
 814  * Provide dummy SMP support for UP kernels.  Modules that need to use SMP
 815  * APIs will still work using this dummy support.
 816  */
 817 static void
 818 mp_setvariables_for_up(void *dummy)
 819 {
 820         mp_ncpus = 1;
 821         mp_maxid = PCPU_GET(cpuid);
 822         CPU_SETOF(mp_maxid, &all_cpus);
 823         KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero"));
 824 }
 825 SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST,
 826     mp_setvariables_for_up, NULL);
 827 #endif /* SMP */
 828
 829 void
 830 smp_no_rendezvous_barrier(void *dummy)
 831 {
 832 #ifdef SMP
 833         KASSERT((!smp_started),("smp_no_rendezvous called and smp is started"));
 834 #endif
 835 }
 836
 837 /*
 838  * Wait for specified idle threads to switch once.  This ensures that even
 839  * preempted threads have cycled through the switch function once,
 840  * exiting their codepaths.  This allows us to change global pointers
 841  * with no other synchronization.
 842  */
 843 int
 844 quiesce_cpus(cpuset_t map, const char *wmesg, int prio)
 845 {
 846         struct pcpu *pcpu;
 847         u_int gen[MAXCPU];
 848         int error;
 849         int cpu;
 850
 851         error = 0;
 852         for (cpu = 0; cpu <= mp_maxid; cpu++) {
 853                 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
 854                         continue;
 855                 pcpu = pcpu_find(cpu);
 856                 gen[cpu] = pcpu->pc_idlethread->td_generation;
 857         }
 858         for (cpu = 0; cpu <= mp_maxid; cpu++) {
 859                 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
 860                         continue;
 861                 pcpu = pcpu_find(cpu);
 862                 thread_lock(curthread);
 863                 sched_bind(curthread, cpu);
 864                 thread_unlock(curthread);
 865                 while (gen[cpu] == pcpu->pc_idlethread->td_generation) {
 866                         error = tsleep(quiesce_cpus, prio, wmesg, 1);
 867                         if (error != EWOULDBLOCK)
 868                                 goto out;
 869                         error = 0;
 870                 }
 871         }
 872 out:
 873         thread_lock(curthread);
 874         sched_unbind(curthread);
 875         thread_unlock(curthread);
 876
 877         return (error);
 878 }
 879
 880 int
 881 quiesce_all_cpus(const char *wmesg, int prio)
 882 {
 883
 884         return quiesce_cpus(all_cpus, wmesg, prio);
 885 }
 886
 887 /* Extra care is taken with this sysctl because the data type is volatile */
 888 static int
 889 sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS)
 890 {
 891         int error, active;
 892
 893         active = smp_started;
 894         error = SYSCTL_OUT(req, &active, sizeof(active));
 895         return (error);
 896 }
 897
 898
 899 #ifdef SMP
 900 void
 901 topo_init_node(struct topo_node *node)
 902 {
 903
 904         bzero(node, sizeof(*node));
 905         TAILQ_INIT(&node->children);
 906 }
 907
 908 void
 909 topo_init_root(struct topo_node *root)
 910 {
 911
 912         topo_init_node(root);
 913         root->type = TOPO_TYPE_SYSTEM;
 914 }
 915
 916 /*
 917  * Add a child node with the given ID under the given parent.
 918  * Do nothing if there is already a child with that ID.
 919  */
 920 struct topo_node *
 921 topo_add_node_by_hwid(struct topo_node *parent, int hwid,
 922     topo_node_type type, uintptr_t subtype)
 923 {
 924         struct topo_node *node;
 925
 926         TAILQ_FOREACH_REVERSE(node, &parent->children,
 927             topo_children, siblings) {
 928                 if (node->hwid == hwid
 929                     && node->type == type && node->subtype == subtype) {
 930                         return (node);
 931                 }
 932         }
 933
 934         node = malloc(sizeof(*node), M_TOPO, M_WAITOK);
 935         topo_init_node(node);
 936         node->parent = parent;
 937         node->hwid = hwid;
 938         node->type = type;
 939         node->subtype = subtype;
 940         TAILQ_INSERT_TAIL(&parent->children, node, siblings);
 941         parent->nchildren++;
 942
 943         return (node);
 944 }
 945
 946 /*
 947  * Find a child node with the given ID under the given parent.
 948  */
 949 struct topo_node *
 950 topo_find_node_by_hwid(struct topo_node *parent, int hwid,
 951     topo_node_type type, uintptr_t subtype)
 952 {
 953
 954         struct topo_node *node;
 955
 956         TAILQ_FOREACH(node, &parent->children, siblings) {
 957                 if (node->hwid == hwid
 958                     && node->type == type && node->subtype == subtype) {
 959                         return (node);
 960                 }
 961         }
 962
 963         return (NULL);
 964 }
 965
 966 /*
 967  * Given a node change the order of its parent's child nodes such
 968  * that the node becomes the firt child while preserving the cyclic
 969  * order of the children.  In other words, the given node is promoted
 970  * by rotation.
 971  */
 972 void
 973 topo_promote_child(struct topo_node *child)
 974 {
 975         struct topo_node *next;
 976         struct topo_node *node;
 977         struct topo_node *parent;
 978
 979         parent = child->parent;
 980         next = TAILQ_NEXT(child, siblings);
 981         TAILQ_REMOVE(&parent->children, child, siblings);
 982         TAILQ_INSERT_HEAD(&parent->children, child, siblings);
 983
 984         while (next != NULL) {
 985                 node = next;
 986                 next = TAILQ_NEXT(node, siblings);
 987                 TAILQ_REMOVE(&parent->children, node, siblings);
 988                 TAILQ_INSERT_AFTER(&parent->children, child, node, siblings);
 989                 child = node;
 990         }
 991 }
 992
 993 /*
 994  * Iterate to the next node in the depth-first search (traversal) of
 995  * the topology tree.
 996  */
 997 struct topo_node *
 998 topo_next_node(struct topo_node *top, struct topo_node *node)
 999 {
1000         struct topo_node *next;
1001
1002         if ((next = TAILQ_FIRST(&node->children)) != NULL)
1003                 return (next);
1004
1005         if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1006                 return (next);
1007
1008         while (node != top && (node = node->parent) != top)
1009                 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1010                         return (next);
1011
1012         return (NULL);
1013 }
1014
1015 /*
1016  * Iterate to the next node in the depth-first search of the topology tree,
1017  * but without descending below the current node.
1018  */
1019 struct topo_node *
1020 topo_next_nonchild_node(struct topo_node *top, struct topo_node *node)
1021 {
1022         struct topo_node *next;
1023
1024         if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1025                 return (next);
1026
1027         while (node != top && (node = node->parent) != top)
1028                 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1029                         return (next);
1030
1031         return (NULL);
1032 }
1033
1034 /*
1035  * Assign the given ID to the given topology node that represents a logical
1036  * processor.
1037  */
1038 void
1039 topo_set_pu_id(struct topo_node *node, cpuid_t id)
1040 {
1041
1042         KASSERT(node->type == TOPO_TYPE_PU,
1043             ("topo_set_pu_id: wrong node type: %u", node->type));
1044         KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0,
1045             ("topo_set_pu_id: cpuset already not empty"));
1046         node->id = id;
1047         CPU_SET(id, &node->cpuset);
1048         node->cpu_count = 1;
1049         node->subtype = 1;
1050
1051         while ((node = node->parent) != NULL) {
1052                 KASSERT(!CPU_ISSET(id, &node->cpuset),
1053                     ("logical ID %u is already set in node %p", id, node));
1054                 CPU_SET(id, &node->cpuset);
1055                 node->cpu_count++;
1056         }
1057 }
1058
1059 static struct topology_spec {
1060         topo_node_type  type;
1061         bool            match_subtype;
1062         uintptr_t       subtype;
1063 } topology_level_table[TOPO_LEVEL_COUNT] = {
1064         [TOPO_LEVEL_PKG] = { .type = TOPO_TYPE_PKG, },
1065         [TOPO_LEVEL_GROUP] = { .type = TOPO_TYPE_GROUP, },
1066         [TOPO_LEVEL_CACHEGROUP] = {
1067                 .type = TOPO_TYPE_CACHE,
1068                 .match_subtype = true,
1069                 .subtype = CG_SHARE_L3,
1070         },
1071         [TOPO_LEVEL_CORE] = { .type = TOPO_TYPE_CORE, },
1072         [TOPO_LEVEL_THREAD] = { .type = TOPO_TYPE_PU, },
1073 };
1074
1075 static bool
1076 topo_analyze_table(struct topo_node *root, int all, enum topo_level level,
1077     struct topo_analysis *results)
1078 {
1079         struct topology_spec *spec;
1080         struct topo_node *node;
1081         int count;
1082
1083         if (level >= TOPO_LEVEL_COUNT)
1084                 return (true);
1085
1086         spec = &topology_level_table[level];
1087         count = 0;
1088         node = topo_next_node(root, root);
1089
1090         while (node != NULL) {
1091                 if (node->type != spec->type ||
1092                     (spec->match_subtype && node->subtype != spec->subtype)) {
1093                         node = topo_next_node(root, node);
1094                         continue;
1095                 }
1096                 if (!all && CPU_EMPTY(&node->cpuset)) {
1097                         node = topo_next_nonchild_node(root, node);
1098                         continue;
1099                 }
1100
1101                 count++;
1102
1103                 if (!topo_analyze_table(node, all, level + 1, results))
1104                         return (false);
1105
1106                 node = topo_next_nonchild_node(root, node);
1107         }
1108
1109         /* No explicit subgroups is essentially one subgroup. */
1110         if (count == 0) {
1111                 count = 1;
1112
1113                 if (!topo_analyze_table(root, all, level + 1, results))
1114                         return (false);
1115         }
1116
1117         if (results->entities[level] == -1)
1118                 results->entities[level] = count;
1119         else if (results->entities[level] != count)
1120                 return (false);
1121
1122         return (true);
1123 }
1124
1125 /*
1126  * Check if the topology is uniform, that is, each package has the same number
1127  * of cores in it and each core has the same number of threads (logical
1128  * processors) in it.  If so, calculate the number of packages, the number of
1129  * groups per package, the number of cachegroups per group, and the number of
1130  * logical processors per cachegroup.  'all' parameter tells whether to include
1131  * administratively disabled logical processors into the analysis.
1132  */
1133 int
1134 topo_analyze(struct topo_node *topo_root, int all,
1135     struct topo_analysis *results)
1136 {
1137
1138         results->entities[TOPO_LEVEL_PKG] = -1;
1139         results->entities[TOPO_LEVEL_CORE] = -1;
1140         results->entities[TOPO_LEVEL_THREAD] = -1;
1141         results->entities[TOPO_LEVEL_GROUP] = -1;
1142         results->entities[TOPO_LEVEL_CACHEGROUP] = -1;
1143
1144         if (!topo_analyze_table(topo_root, all, TOPO_LEVEL_PKG, results))
1145                 return (0);
1146
1147         KASSERT(results->entities[TOPO_LEVEL_PKG] > 0,
1148                 ("bug in topology or analysis"));
1149
1150         return (1);
1151 }
1152
1153 #endif /* SMP */
1154