sys/kern/subr_smp.c

   1 /*-
   2  * Copyright (c) 2001, John Baldwin <jhb@FreeBSD.org>.
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26
  27 /*
  28  * This module holds the global variables and machine independent functions
  29  * used for the kernel SMP support.
  30  */
  31
  32 #include <sys/cdefs.h>
  33 __FBSDID("$FreeBSD$");
  34
  35 #include <sys/param.h>
  36 #include <sys/systm.h>
  37 #include <sys/kernel.h>
  38 #include <sys/ktr.h>
  39 #include <sys/proc.h>
  40 #include <sys/bus.h>
  41 #include <sys/lock.h>
  42 #include <sys/malloc.h>
  43 #include <sys/mutex.h>
  44 #include <sys/pcpu.h>
  45 #include <sys/sched.h>
  46 #include <sys/smp.h>
  47 #include <sys/sysctl.h>
  48
  49 #include <machine/cpu.h>
  50 #include <machine/smp.h>
  51
  52 #include "opt_sched.h"
  53
  54 #ifdef SMP
/* Malloc type used for the topo_node allocations made in this file. */
MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data");

/*
 * CPU sets used by the stop/restart handshake in this file:
 * generic_stop_cpus() spins until the requested CPUs appear in
 * stopped_cpus (or suspended_cpus for IPI_SUSPEND), and
 * generic_restart_cpus() publishes the CPUs to resume in started_cpus.
 */
volatile cpuset_t stopped_cpus;
volatile cpuset_t started_cpus;
volatile cpuset_t suspended_cpus;
/*
 * NOTE(review): the two masks below are not referenced in the visible
 * part of this file; presumably they are maintained by MD code.
 */
cpuset_t hlt_cpus_mask;
cpuset_t logical_cpus_mask;

/*
 * NOTE(review): hook not called in the visible part of this file;
 * presumably invoked by MD code when a stopped CPU is restarted.
 */
void (*cpustop_restartfunc)(void);
  64 #endif
  65
  66 static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS);
  67
  68 /* This is used in modules that need to work in both SMP and UP. */
  69 cpuset_t all_cpus;
  70
  71 int mp_ncpus;
  72 /* export this for libkvm consumers. */
  73 int mp_maxcpus = MAXCPU;
  74
  75 volatile int smp_started;
  76 u_int mp_maxid;
  77
  78 static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL,
  79     "Kernel SMP");
  80
  81 SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0,
  82     "Max CPU ID.");
  83
  84 SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus,
  85     0, "Max number of CPUs that the system was compiled for.");
  86
  87 SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD | CTLTYPE_INT, NULL, 0,
  88     sysctl_kern_smp_active, "I", "Indicates system is running in SMP mode");
  89
  90 int smp_disabled = 0;   /* has smp been disabled? */
  91 SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
  92     &smp_disabled, 0, "SMP has been disabled from the loader");
  93
  94 int smp_cpus = 1;       /* how many cpu's running */
  95 SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0,
  96     "Number of CPUs online");
  97
  98 int smp_topology = 0;   /* Which topology we're using. */
  99 SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0,
 100     "Topology override setting; 0 is default provided by hardware.");
 101
 102 #ifdef SMP
 103 /* Enable forwarding of a signal to a process running on a different CPU */
 104 static int forward_signal_enabled = 1;
 105 SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
 106            &forward_signal_enabled, 0,
 107            "Forwarding of a signal to a process on a different CPU");
 108
 109 /* Variables needed for SMP rendezvous. */
 110 static volatile int smp_rv_ncpus;
 111 static void (*volatile smp_rv_setup_func)(void *arg);
 112 static void (*volatile smp_rv_action_func)(void *arg);
 113 static void (*volatile smp_rv_teardown_func)(void *arg);
 114 static void *volatile smp_rv_func_arg;
 115 static volatile int smp_rv_waiters[4];
 116
 117 /*
 118  * Shared mutex to restrict busywaits between smp_rendezvous() and
 119  * smp(_targeted)_tlb_shootdown().  A deadlock occurs if both of these
 120  * functions trigger at once and cause multiple CPUs to busywait with
 121  * interrupts disabled.
 122  */
 123 struct mtx smp_ipi_mtx;
 124
 125 /*
 126  * Let the MD SMP code initialize mp_maxid very early if it can.
 127  */
 128 static void
 129 mp_setmaxid(void *dummy)
 130 {
 131
 132         cpu_mp_setmaxid();
 133
 134         KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__));
 135         KASSERT(mp_ncpus > 1 || mp_maxid == 0,
 136             ("%s: one CPU but mp_maxid is not zero", __func__));
 137         KASSERT(mp_maxid >= mp_ncpus - 1,
 138             ("%s: counters out of sync: max %d, count %d", __func__,
 139                 mp_maxid, mp_ncpus));
 140 }
 141 SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL);
 142
 143 /*
 144  * Call the MD SMP initialization code.
 145  */
 146 static void
 147 mp_start(void *dummy)
 148 {
 149
 150         mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);
 151
 152         /* Probe for MP hardware. */
 153         if (smp_disabled != 0 || cpu_mp_probe() == 0) {
 154                 mp_ncpus = 1;
 155                 CPU_SETOF(PCPU_GET(cpuid), &all_cpus);
 156                 return;
 157         }
 158
 159         cpu_mp_start();
 160         printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
 161             mp_ncpus);
 162         cpu_mp_announce();
 163 }
 164 SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL);
 165
 166 void
 167 forward_signal(struct thread *td)
 168 {
 169         int id;
 170
 171         /*
 172          * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on
 173          * this thread, so all we need to do is poke it if it is currently
 174          * executing so that it executes ast().
 175          */
 176         THREAD_LOCK_ASSERT(td, MA_OWNED);
 177         KASSERT(TD_IS_RUNNING(td),
 178             ("forward_signal: thread is not TDS_RUNNING"));
 179
 180         CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);
 181
 182         if (!smp_started || cold || panicstr)
 183                 return;
 184         if (!forward_signal_enabled)
 185                 return;
 186
 187         /* No need to IPI ourself. */
 188         if (td == curthread)
 189                 return;
 190
 191         id = td->td_oncpu;
 192         if (id == NOCPU)
 193                 return;
 194         ipi_cpu(id, IPI_AST);
 195 }
 196
 197 /*
 198  * When called the executing CPU will send an IPI to all other CPUs
 199  *  requesting that they halt execution.
 200  *
 201  * Usually (but not necessarily) called with 'other_cpus' as its arg.
 202  *
 203  *  - Signals all CPUs in map to stop.
 204  *  - Waits for each to stop.
 205  *
 206  * Returns:
 207  *  -1: error
 208  *   0: NA
 209  *   1: ok
 210  *
 211  */
 212 static int
 213 generic_stop_cpus(cpuset_t map, u_int type)
 214 {
 215 #ifdef KTR
 216         char cpusetbuf[CPUSETBUFSIZ];
 217 #endif
 218         static volatile u_int stopping_cpu = NOCPU;
 219         int i;
 220         volatile cpuset_t *cpus;
 221
 222         KASSERT(
 223 #if defined(__amd64__) || defined(__i386__)
 224             type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND,
 225 #else
 226             type == IPI_STOP || type == IPI_STOP_HARD,
 227 #endif
 228             ("%s: invalid stop type", __func__));
 229
 230         if (!smp_started)
 231                 return (0);
 232
 233         CTR2(KTR_SMP, "stop_cpus(%s) with %u type",
 234             cpusetobj_strprint(cpusetbuf, &map), type);
 235
 236 #if defined(__amd64__) || defined(__i386__)
 237         /*
 238          * When suspending, ensure there are are no IPIs in progress.
 239          * IPIs that have been issued, but not yet delivered (e.g.
 240          * not pending on a vCPU when running under virtualization)
 241          * will be lost, violating FreeBSD's assumption of reliable
 242          * IPI delivery.
 243          */
 244         if (type == IPI_SUSPEND)
 245                 mtx_lock_spin(&smp_ipi_mtx);
 246 #endif
 247
 248         if (stopping_cpu != PCPU_GET(cpuid))
 249                 while (atomic_cmpset_int(&stopping_cpu, NOCPU,
 250                     PCPU_GET(cpuid)) == 0)
 251                         while (stopping_cpu != NOCPU)
 252                                 cpu_spinwait(); /* spin */
 253
 254         /* send the stop IPI to all CPUs in map */
 255         ipi_selected(map, type);
 256
 257 #if defined(__amd64__) || defined(__i386__)
 258         if (type == IPI_SUSPEND)
 259                 cpus = &suspended_cpus;
 260         else
 261 #endif
 262                 cpus = &stopped_cpus;
 263
 264         i = 0;
 265         while (!CPU_SUBSET(cpus, &map)) {
 266                 /* spin */
 267                 cpu_spinwait();
 268                 i++;
 269                 if (i == 100000000) {
 270                         printf("timeout stopping cpus\n");
 271                         break;
 272                 }
 273         }
 274
 275 #if defined(__amd64__) || defined(__i386__)
 276         if (type == IPI_SUSPEND)
 277                 mtx_unlock_spin(&smp_ipi_mtx);
 278 #endif
 279
 280         stopping_cpu = NOCPU;
 281         return (1);
 282 }
 283
 284 int
 285 stop_cpus(cpuset_t map)
 286 {
 287
 288         return (generic_stop_cpus(map, IPI_STOP));
 289 }
 290
 291 int
 292 stop_cpus_hard(cpuset_t map)
 293 {
 294
 295         return (generic_stop_cpus(map, IPI_STOP_HARD));
 296 }
 297
 298 #if defined(__amd64__) || defined(__i386__)
 299 int
 300 suspend_cpus(cpuset_t map)
 301 {
 302
 303         return (generic_stop_cpus(map, IPI_SUSPEND));
 304 }
 305 #endif
 306
 307 /*
 308  * Called by a CPU to restart stopped CPUs.
 309  *
 310  * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
 311  *
 312  *  - Signals all CPUs in map to restart.
 313  *  - Waits for each to restart.
 314  *
 315  * Returns:
 316  *  -1: error
 317  *   0: NA
 318  *   1: ok
 319  */
 320 static int
 321 generic_restart_cpus(cpuset_t map, u_int type)
 322 {
 323 #ifdef KTR
 324         char cpusetbuf[CPUSETBUFSIZ];
 325 #endif
 326         volatile cpuset_t *cpus;
 327
 328         KASSERT(
 329 #if defined(__amd64__) || defined(__i386__)
 330             type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND,
 331 #else
 332             type == IPI_STOP || type == IPI_STOP_HARD,
 333 #endif
 334             ("%s: invalid stop type", __func__));
 335
 336         if (!smp_started)
 337                 return 0;
 338
 339         CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));
 340
 341 #if defined(__amd64__) || defined(__i386__)
 342         if (type == IPI_SUSPEND)
 343                 cpus = &suspended_cpus;
 344         else
 345 #endif
 346                 cpus = &stopped_cpus;
 347
 348         /* signal other cpus to restart */
 349         CPU_COPY_STORE_REL(&map, &started_cpus);
 350
 351         /* wait for each to clear its bit */
 352         while (CPU_OVERLAP(cpus, &map))
 353                 cpu_spinwait();
 354
 355         return 1;
 356 }
 357
 358 int
 359 restart_cpus(cpuset_t map)
 360 {
 361
 362         return (generic_restart_cpus(map, IPI_STOP));
 363 }
 364
 365 #if defined(__amd64__) || defined(__i386__)
 366 int
 367 resume_cpus(cpuset_t map)
 368 {
 369
 370         return (generic_restart_cpus(map, IPI_SUSPEND));
 371 }
 372 #endif
 373
 374 /*
 375  * All-CPU rendezvous.  CPUs are signalled, all execute the setup function
 376  * (if specified), rendezvous, execute the action function (if specified),
 377  * rendezvous again, execute the teardown function (if specified), and then
 378  * resume.
 379  *
 380  * Note that the supplied external functions _must_ be reentrant and aware
 381  * that they are running in parallel and in an unknown lock context.
 382  */
 383 void
 384 smp_rendezvous_action(void)
 385 {
 386         struct thread *td;
 387         void *local_func_arg;
 388         void (*local_setup_func)(void*);
 389         void (*local_action_func)(void*);
 390         void (*local_teardown_func)(void*);
 391 #ifdef INVARIANTS
 392         int owepreempt;
 393 #endif
 394
 395         /* Ensure we have up-to-date values. */
 396         atomic_add_acq_int(&smp_rv_waiters[0], 1);
 397         while (smp_rv_waiters[0] < smp_rv_ncpus)
 398                 cpu_spinwait();
 399
 400         /* Fetch rendezvous parameters after acquire barrier. */
 401         local_func_arg = smp_rv_func_arg;
 402         local_setup_func = smp_rv_setup_func;
 403         local_action_func = smp_rv_action_func;
 404         local_teardown_func = smp_rv_teardown_func;
 405
 406         /*
 407          * Use a nested critical section to prevent any preemptions
 408          * from occurring during a rendezvous action routine.
 409          * Specifically, if a rendezvous handler is invoked via an IPI
 410          * and the interrupted thread was in the critical_exit()
 411          * function after setting td_critnest to 0 but before
 412          * performing a deferred preemption, this routine can be
 413          * invoked with td_critnest set to 0 and td_owepreempt true.
 414          * In that case, a critical_exit() during the rendezvous
 415          * action would trigger a preemption which is not permitted in
 416          * a rendezvous action.  To fix this, wrap all of the
 417          * rendezvous action handlers in a critical section.  We
 418          * cannot use a regular critical section however as having
 419          * critical_exit() preempt from this routine would also be
 420          * problematic (the preemption must not occur before the IPI
 421          * has been acknowledged via an EOI).  Instead, we
 422          * intentionally ignore td_owepreempt when leaving the
 423          * critical section.  This should be harmless because we do
 424          * not permit rendezvous action routines to schedule threads,
 425          * and thus td_owepreempt should never transition from 0 to 1
 426          * during this routine.
 427          */
 428         td = curthread;
 429         td->td_critnest++;
 430 #ifdef INVARIANTS
 431         owepreempt = td->td_owepreempt;
 432 #endif
 433
 434         /*
 435          * If requested, run a setup function before the main action
 436          * function.  Ensure all CPUs have completed the setup
 437          * function before moving on to the action function.
 438          */
 439         if (local_setup_func != smp_no_rendevous_barrier) {
 440                 if (smp_rv_setup_func != NULL)
 441                         smp_rv_setup_func(smp_rv_func_arg);
 442                 atomic_add_int(&smp_rv_waiters[1], 1);
 443                 while (smp_rv_waiters[1] < smp_rv_ncpus)
 444                         cpu_spinwait();
 445         }
 446
 447         if (local_action_func != NULL)
 448                 local_action_func(local_func_arg);
 449
 450         if (local_teardown_func != smp_no_rendevous_barrier) {
 451                 /*
 452                  * Signal that the main action has been completed.  If a
 453                  * full exit rendezvous is requested, then all CPUs will
 454                  * wait here until all CPUs have finished the main action.
 455                  */
 456                 atomic_add_int(&smp_rv_waiters[2], 1);
 457                 while (smp_rv_waiters[2] < smp_rv_ncpus)
 458                         cpu_spinwait();
 459
 460                 if (local_teardown_func != NULL)
 461                         local_teardown_func(local_func_arg);
 462         }
 463
 464         /*
 465          * Signal that the rendezvous is fully completed by this CPU.
 466          * This means that no member of smp_rv_* pseudo-structure will be
 467          * accessed by this target CPU after this point; in particular,
 468          * memory pointed by smp_rv_func_arg.
 469          *
 470          * The release semantic ensures that all accesses performed by
 471          * the current CPU are visible when smp_rendezvous_cpus()
 472          * returns, by synchronizing with the
 473          * atomic_load_acq_int(&smp_rv_waiters[3]).
 474          */
 475         atomic_add_rel_int(&smp_rv_waiters[3], 1);
 476
 477         td->td_critnest--;
 478         KASSERT(owepreempt == td->td_owepreempt,
 479             ("rendezvous action changed td_owepreempt"));
 480 }
 481
 482 void
 483 smp_rendezvous_cpus(cpuset_t map,
 484         void (* setup_func)(void *),
 485         void (* action_func)(void *),
 486         void (* teardown_func)(void *),
 487         void *arg)
 488 {
 489         int curcpumap, i, ncpus = 0;
 490
 491         /* Look comments in the !SMP case. */
 492         if (!smp_started) {
 493                 spinlock_enter();
 494                 if (setup_func != NULL)
 495                         setup_func(arg);
 496                 if (action_func != NULL)
 497                         action_func(arg);
 498                 if (teardown_func != NULL)
 499                         teardown_func(arg);
 500                 spinlock_exit();
 501                 return;
 502         }
 503
 504         CPU_FOREACH(i) {
 505                 if (CPU_ISSET(i, &map))
 506                         ncpus++;
 507         }
 508         if (ncpus == 0)
 509                 panic("ncpus is 0 with non-zero map");
 510
 511         mtx_lock_spin(&smp_ipi_mtx);
 512
 513         /* Pass rendezvous parameters via global variables. */
 514         smp_rv_ncpus = ncpus;
 515         smp_rv_setup_func = setup_func;
 516         smp_rv_action_func = action_func;
 517         smp_rv_teardown_func = teardown_func;
 518         smp_rv_func_arg = arg;
 519         smp_rv_waiters[1] = 0;
 520         smp_rv_waiters[2] = 0;
 521         smp_rv_waiters[3] = 0;
 522         atomic_store_rel_int(&smp_rv_waiters[0], 0);
 523
 524         /*
 525          * Signal other processors, which will enter the IPI with
 526          * interrupts off.
 527          */
 528         curcpumap = CPU_ISSET(curcpu, &map);
 529         CPU_CLR(curcpu, &map);
 530         ipi_selected(map, IPI_RENDEZVOUS);
 531
 532         /* Check if the current CPU is in the map */
 533         if (curcpumap != 0)
 534                 smp_rendezvous_action();
 535
 536         /*
 537          * Ensure that the master CPU waits for all the other
 538          * CPUs to finish the rendezvous, so that smp_rv_*
 539          * pseudo-structure and the arg are guaranteed to not
 540          * be in use.
 541          *
 542          * Load acquire synchronizes with the release add in
 543          * smp_rendezvous_action(), which ensures that our caller sees
 544          * all memory actions done by the called functions on other
 545          * CPUs.
 546          */
 547         while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
 548                 cpu_spinwait();
 549
 550         mtx_unlock_spin(&smp_ipi_mtx);
 551 }
 552
 553 void
 554 smp_rendezvous(void (* setup_func)(void *),
 555                void (* action_func)(void *),
 556                void (* teardown_func)(void *),
 557                void *arg)
 558 {
 559         smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
 560 }
 561
 562 static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1];
 563
 564 struct cpu_group *
 565 smp_topo(void)
 566 {
 567         char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
 568         struct cpu_group *top;
 569
 570         /*
 571          * Check for a fake topology request for debugging purposes.
 572          */
 573         switch (smp_topology) {
 574         case 1:
 575                 /* Dual core with no sharing.  */
 576                 top = smp_topo_1level(CG_SHARE_NONE, 2, 0);
 577                 break;
 578         case 2:
 579                 /* No topology, all cpus are equal. */
 580                 top = smp_topo_none();
 581                 break;
 582         case 3:
 583                 /* Dual core with shared L2.  */
 584                 top = smp_topo_1level(CG_SHARE_L2, 2, 0);
 585                 break;
 586         case 4:
 587                 /* quad core, shared l3 among each package, private l2.  */
 588                 top = smp_topo_1level(CG_SHARE_L3, 4, 0);
 589                 break;
 590         case 5:
 591                 /* quad core,  2 dualcore parts on each package share l2.  */
 592                 top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0);
 593                 break;
 594         case 6:
 595                 /* Single-core 2xHTT */
 596                 top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
 597                 break;
 598         case 7:
 599                 /* quad core with a shared l3, 8 threads sharing L2.  */
 600                 top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
 601                     CG_FLAG_SMT);
 602                 break;
 603         default:
 604                 /* Default, ask the system what it wants. */
 605                 top = cpu_topo();
 606                 break;
 607         }
 608         /*
 609          * Verify the returned topology.
 610          */
 611         if (top->cg_count != mp_ncpus)
 612                 panic("Built bad topology at %p.  CPU count %d != %d",
 613                     top, top->cg_count, mp_ncpus);
 614         if (CPU_CMP(&top->cg_mask, &all_cpus))
 615                 panic("Built bad topology at %p.  CPU mask (%s) != (%s)",
 616                     top, cpusetobj_strprint(cpusetbuf, &top->cg_mask),
 617                     cpusetobj_strprint(cpusetbuf2, &all_cpus));
 618         return (top);
 619 }
 620
 621 struct cpu_group *
 622 smp_topo_alloc(u_int count)
 623 {
 624         static u_int index;
 625         u_int curr;
 626
 627         curr = index;
 628         index += count;
 629         return (&group[curr]);
 630 }
 631
 632 struct cpu_group *
 633 smp_topo_none(void)
 634 {
 635         struct cpu_group *top;
 636
 637         top = &group[0];
 638         top->cg_parent = NULL;
 639         top->cg_child = NULL;
 640         top->cg_mask = all_cpus;
 641         top->cg_count = mp_ncpus;
 642         top->cg_children = 0;
 643         top->cg_level = CG_SHARE_NONE;
 644         top->cg_flags = 0;
 645
 646         return (top);
 647 }
 648
 649 static int
 650 smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share,
 651     int count, int flags, int start)
 652 {
 653         char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
 654         cpuset_t mask;
 655         int i;
 656
 657         CPU_ZERO(&mask);
 658         for (i = 0; i < count; i++, start++)
 659                 CPU_SET(start, &mask);
 660         child->cg_parent = parent;
 661         child->cg_child = NULL;
 662         child->cg_children = 0;
 663         child->cg_level = share;
 664         child->cg_count = count;
 665         child->cg_flags = flags;
 666         child->cg_mask = mask;
 667         parent->cg_children++;
 668         for (; parent != NULL; parent = parent->cg_parent) {
 669                 if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask))
 670                         panic("Duplicate children in %p.  mask (%s) child (%s)",
 671                             parent,
 672                             cpusetobj_strprint(cpusetbuf, &parent->cg_mask),
 673                             cpusetobj_strprint(cpusetbuf2, &child->cg_mask));
 674                 CPU_OR(&parent->cg_mask, &child->cg_mask);
 675                 parent->cg_count += child->cg_count;
 676         }
 677
 678         return (start);
 679 }
 680
 681 struct cpu_group *
 682 smp_topo_1level(int share, int count, int flags)
 683 {
 684         struct cpu_group *child;
 685         struct cpu_group *top;
 686         int packages;
 687         int cpu;
 688         int i;
 689
 690         cpu = 0;
 691         top = &group[0];
 692         packages = mp_ncpus / count;
 693         top->cg_child = child = &group[1];
 694         top->cg_level = CG_SHARE_NONE;
 695         for (i = 0; i < packages; i++, child++)
 696                 cpu = smp_topo_addleaf(top, child, share, count, flags, cpu);
 697         return (top);
 698 }
 699
 700 struct cpu_group *
 701 smp_topo_2level(int l2share, int l2count, int l1share, int l1count,
 702     int l1flags)
 703 {
 704         struct cpu_group *top;
 705         struct cpu_group *l1g;
 706         struct cpu_group *l2g;
 707         int cpu;
 708         int i;
 709         int j;
 710
 711         cpu = 0;
 712         top = &group[0];
 713         l2g = &group[1];
 714         top->cg_child = l2g;
 715         top->cg_level = CG_SHARE_NONE;
 716         top->cg_children = mp_ncpus / (l2count * l1count);
 717         l1g = l2g + top->cg_children;
 718         for (i = 0; i < top->cg_children; i++, l2g++) {
 719                 l2g->cg_parent = top;
 720                 l2g->cg_child = l1g;
 721                 l2g->cg_level = l2share;
 722                 for (j = 0; j < l2count; j++, l1g++)
 723                         cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count,
 724                             l1flags, cpu);
 725         }
 726         return (top);
 727 }
 728
 729
 730 struct cpu_group *
 731 smp_topo_find(struct cpu_group *top, int cpu)
 732 {
 733         struct cpu_group *cg;
 734         cpuset_t mask;
 735         int children;
 736         int i;
 737
 738         CPU_SETOF(cpu, &mask);
 739         cg = top;
 740         for (;;) {
 741                 if (!CPU_OVERLAP(&cg->cg_mask, &mask))
 742                         return (NULL);
 743                 if (cg->cg_children == 0)
 744                         return (cg);
 745                 children = cg->cg_children;
 746                 for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
 747                         if (CPU_OVERLAP(&cg->cg_mask, &mask))
 748                                 break;
 749         }
 750         return (NULL);
 751 }
 752 #else /* !SMP */
 753
 754 void
 755 smp_rendezvous_cpus(cpuset_t map,
 756         void (*setup_func)(void *),
 757         void (*action_func)(void *),
 758         void (*teardown_func)(void *),
 759         void *arg)
 760 {
 761         /*
 762          * In the !SMP case we just need to ensure the same initial conditions
 763          * as the SMP case.
 764          */
 765         spinlock_enter();
 766         if (setup_func != NULL)
 767                 setup_func(arg);
 768         if (action_func != NULL)
 769                 action_func(arg);
 770         if (teardown_func != NULL)
 771                 teardown_func(arg);
 772         spinlock_exit();
 773 }
 774
 775 void
 776 smp_rendezvous(void (*setup_func)(void *),
 777                void (*action_func)(void *),
 778                void (*teardown_func)(void *),
 779                void *arg)
 780 {
 781
 782         /* Look comments in the smp_rendezvous_cpus() case. */
 783         spinlock_enter();
 784         if (setup_func != NULL)
 785                 setup_func(arg);
 786         if (action_func != NULL)
 787                 action_func(arg);
 788         if (teardown_func != NULL)
 789                 teardown_func(arg);
 790         spinlock_exit();
 791 }
 792
 793 /*
 794  * Provide dummy SMP support for UP kernels.  Modules that need to use SMP
 795  * APIs will still work using this dummy support.
 796  */
 797 static void
 798 mp_setvariables_for_up(void *dummy)
 799 {
 800         mp_ncpus = 1;
 801         mp_maxid = PCPU_GET(cpuid);
 802         CPU_SETOF(mp_maxid, &all_cpus);
 803         KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero"));
 804 }
 805 SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST,
 806     mp_setvariables_for_up, NULL);
 807 #endif /* SMP */
 808
 809 void
 810 smp_no_rendevous_barrier(void *dummy)
 811 {
 812 #ifdef SMP
 813         KASSERT((!smp_started),("smp_no_rendevous called and smp is started"));
 814 #endif
 815 }
 816
 817 /*
 818  * Wait specified idle threads to switch once.  This ensures that even
 819  * preempted threads have cycled through the switch function once,
 820  * exiting their codepaths.  This allows us to change global pointers
 821  * with no other synchronization.
 822  */
 823 int
 824 quiesce_cpus(cpuset_t map, const char *wmesg, int prio)
 825 {
 826         struct pcpu *pcpu;
 827         u_int gen[MAXCPU];
 828         int error;
 829         int cpu;
 830
 831         error = 0;
 832         for (cpu = 0; cpu <= mp_maxid; cpu++) {
 833                 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
 834                         continue;
 835                 pcpu = pcpu_find(cpu);
 836                 gen[cpu] = pcpu->pc_idlethread->td_generation;
 837         }
 838         for (cpu = 0; cpu <= mp_maxid; cpu++) {
 839                 if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
 840                         continue;
 841                 pcpu = pcpu_find(cpu);
 842                 thread_lock(curthread);
 843                 sched_bind(curthread, cpu);
 844                 thread_unlock(curthread);
 845                 while (gen[cpu] == pcpu->pc_idlethread->td_generation) {
 846                         error = tsleep(quiesce_cpus, prio, wmesg, 1);
 847                         if (error != EWOULDBLOCK)
 848                                 goto out;
 849                         error = 0;
 850                 }
 851         }
 852 out:
 853         thread_lock(curthread);
 854         sched_unbind(curthread);
 855         thread_unlock(curthread);
 856
 857         return (error);
 858 }
 859
 860 int
 861 quiesce_all_cpus(const char *wmesg, int prio)
 862 {
 863
 864         return quiesce_cpus(all_cpus, wmesg, prio);
 865 }
 866
 867 /* Extra care is taken with this sysctl because the data type is volatile */
 868 static int
 869 sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS)
 870 {
 871         int error, active;
 872
 873         active = smp_started;
 874         error = SYSCTL_OUT(req, &active, sizeof(active));
 875         return (error);
 876 }
 877
 878
 879 #ifdef SMP
 880 void
 881 topo_init_node(struct topo_node *node)
 882 {
 883
 884         bzero(node, sizeof(*node));
 885         TAILQ_INIT(&node->children);
 886 }
 887
 888 void
 889 topo_init_root(struct topo_node *root)
 890 {
 891
 892         topo_init_node(root);
 893         root->type = TOPO_TYPE_SYSTEM;
 894 }
 895
 896 /*
 897  * Add a child node with the given ID under the given parent.
 898  * Do nothing if there is already a child with that ID.
 899  */
 900 struct topo_node *
 901 topo_add_node_by_hwid(struct topo_node *parent, int hwid,
 902     topo_node_type type, uintptr_t subtype)
 903 {
 904         struct topo_node *node;
 905
 906         TAILQ_FOREACH_REVERSE(node, &parent->children,
 907             topo_children, siblings) {
 908                 if (node->hwid == hwid
 909                     && node->type == type && node->subtype == subtype) {
 910                         return (node);
 911                 }
 912         }
 913
 914         node = malloc(sizeof(*node), M_TOPO, M_WAITOK);
 915         topo_init_node(node);
 916         node->parent = parent;
 917         node->hwid = hwid;
 918         node->type = type;
 919         node->subtype = subtype;
 920         TAILQ_INSERT_TAIL(&parent->children, node, siblings);
 921         parent->nchildren++;
 922
 923         return (node);
 924 }
 925
 926 /*
 927  * Find a child node with the given ID under the given parent.
 928  */
 929 struct topo_node *
 930 topo_find_node_by_hwid(struct topo_node *parent, int hwid,
 931     topo_node_type type, uintptr_t subtype)
 932 {
 933
 934         struct topo_node *node;
 935
 936         TAILQ_FOREACH(node, &parent->children, siblings) {
 937                 if (node->hwid == hwid
 938                     && node->type == type && node->subtype == subtype) {
 939                         return (node);
 940                 }
 941         }
 942
 943         return (NULL);
 944 }
 945
 946 /*
 947  * Given a node change the order of its parent's child nodes such
 948  * that the node becomes the firt child while preserving the cyclic
 949  * order of the children.  In other words, the given node is promoted
 950  * by rotation.
 951  */
 952 void
 953 topo_promote_child(struct topo_node *child)
 954 {
 955         struct topo_node *next;
 956         struct topo_node *node;
 957         struct topo_node *parent;
 958
 959         parent = child->parent;
 960         next = TAILQ_NEXT(child, siblings);
 961         TAILQ_REMOVE(&parent->children, child, siblings);
 962         TAILQ_INSERT_HEAD(&parent->children, child, siblings);
 963
 964         while (next != NULL) {
 965                 node = next;
 966                 next = TAILQ_NEXT(node, siblings);
 967                 TAILQ_REMOVE(&parent->children, node, siblings);
 968                 TAILQ_INSERT_AFTER(&parent->children, child, node, siblings);
 969                 child = node;
 970         }
 971 }
 972
 973 /*
 974  * Iterate to the next node in the depth-first search (traversal) of
 975  * the topology tree.
 976  */
 977 struct topo_node *
 978 topo_next_node(struct topo_node *top, struct topo_node *node)
 979 {
 980         struct topo_node *next;
 981
 982         if ((next = TAILQ_FIRST(&node->children)) != NULL)
 983                 return (next);
 984
 985         if ((next = TAILQ_NEXT(node, siblings)) != NULL)
 986                 return (next);
 987
 988         while ((node = node->parent) != top)
 989                 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
 990                         return (next);
 991
 992         return (NULL);
 993 }
 994
 995 /*
 996  * Iterate to the next node in the depth-first search of the topology tree,
 997  * but without descending below the current node.
 998  */
 999 struct topo_node *
1000 topo_next_nonchild_node(struct topo_node *top, struct topo_node *node)
1001 {
1002         struct topo_node *next;
1003
1004         if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1005                 return (next);
1006
1007         while ((node = node->parent) != top)
1008                 if ((next = TAILQ_NEXT(node, siblings)) != NULL)
1009                         return (next);
1010
1011         return (NULL);
1012 }
1013
1014 /*
1015  * Assign the given ID to the given topology node that represents a logical
1016  * processor.
1017  */
1018 void
1019 topo_set_pu_id(struct topo_node *node, cpuid_t id)
1020 {
1021
1022         KASSERT(node->type == TOPO_TYPE_PU,
1023             ("topo_set_pu_id: wrong node type: %u", node->type));
1024         KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0,
1025             ("topo_set_pu_id: cpuset already not empty"));
1026         node->id = id;
1027         CPU_SET(id, &node->cpuset);
1028         node->cpu_count = 1;
1029         node->subtype = 1;
1030
1031         while ((node = node->parent) != NULL) {
1032                 KASSERT(!CPU_ISSET(id, &node->cpuset),
1033                     ("logical ID %u is already set in node %p", id, node));
1034                 CPU_SET(id, &node->cpuset);
1035                 node->cpu_count++;
1036         }
1037 }
1038
1039 /*
1040  * Check if the topology is uniform, that is, each package has the same number
1041  * of cores in it and each core has the same number of threads (logical
1042  * processors) in it.  If so, calculate the number of package, the number of
1043  * cores per package and the number of logical processors per core.
1044  * 'all' parameter tells whether to include administratively disabled logical
1045  * processors into the analysis.
1046  */
1047 int
1048 topo_analyze(struct topo_node *topo_root, int all,
1049     int *pkg_count, int *cores_per_pkg, int *thrs_per_core)
1050 {
1051         struct topo_node *pkg_node;
1052         struct topo_node *core_node;
1053         struct topo_node *pu_node;
1054         int thrs_per_pkg;
1055         int cpp_counter;
1056         int tpc_counter;
1057         int tpp_counter;
1058
1059         *pkg_count = 0;
1060         *cores_per_pkg = -1;
1061         *thrs_per_core = -1;
1062         thrs_per_pkg = -1;
1063         pkg_node = topo_root;
1064         while (pkg_node != NULL) {
1065                 if (pkg_node->type != TOPO_TYPE_PKG) {
1066                         pkg_node = topo_next_node(topo_root, pkg_node);
1067                         continue;
1068                 }
1069                 if (!all && CPU_EMPTY(&pkg_node->cpuset)) {
1070                         pkg_node = topo_next_nonchild_node(topo_root, pkg_node);
1071                         continue;
1072                 }
1073
1074                 (*pkg_count)++;
1075
1076                 cpp_counter = 0;
1077                 tpp_counter = 0;
1078                 core_node = pkg_node;
1079                 while (core_node != NULL) {
1080                         if (core_node->type == TOPO_TYPE_CORE) {
1081                                 if (!all && CPU_EMPTY(&core_node->cpuset)) {
1082                                         core_node =
1083                                             topo_next_nonchild_node(pkg_node,
1084                                                 core_node);
1085                                         continue;
1086                                 }
1087
1088                                 cpp_counter++;
1089
1090                                 tpc_counter = 0;
1091                                 pu_node = core_node;
1092                                 while (pu_node != NULL) {
1093                                         if (pu_node->type == TOPO_TYPE_PU &&
1094                                             (all || !CPU_EMPTY(&pu_node->cpuset)))
1095                                                 tpc_counter++;
1096                                         pu_node = topo_next_node(core_node,
1097                                             pu_node);
1098                                 }
1099
1100                                 if (*thrs_per_core == -1)
1101                                         *thrs_per_core = tpc_counter;
1102                                 else if (*thrs_per_core != tpc_counter)
1103                                         return (0);
1104
1105                                 core_node = topo_next_nonchild_node(pkg_node,
1106                                     core_node);
1107                         } else {
1108                                 /* PU node directly under PKG. */
1109                                 if (core_node->type == TOPO_TYPE_PU &&
1110                                    (all || !CPU_EMPTY(&core_node->cpuset)))
1111                                         tpp_counter++;
1112                                 core_node = topo_next_node(pkg_node,
1113                                     core_node);
1114                         }
1115                 }
1116
1117                 if (*cores_per_pkg == -1)
1118                         *cores_per_pkg = cpp_counter;
1119                 else if (*cores_per_pkg != cpp_counter)
1120                         return (0);
1121                 if (thrs_per_pkg == -1)
1122                         thrs_per_pkg = tpp_counter;
1123                 else if (thrs_per_pkg != tpp_counter)
1124                         return (0);
1125
1126                 pkg_node = topo_next_nonchild_node(topo_root, pkg_node);
1127         }
1128
1129         KASSERT(*pkg_count > 0,
1130                 ("bug in topology or analysis"));
1131         if (*cores_per_pkg == 0) {
1132                 KASSERT(*thrs_per_core == -1 && thrs_per_pkg > 0,
1133                         ("bug in topology or analysis"));
1134                 *thrs_per_core = thrs_per_pkg;
1135         }
1136
1137         return (1);
1138 }
1139 #endif /* SMP */
1140