sys/kern/subr_epoch.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   3  *
   4  * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  25  * SUCH DAMAGE.
  26  *
  27  */
  28
  29 #include <sys/cdefs.h>
  30 __FBSDID("$FreeBSD$");
  31
  32 #include <sys/param.h>
  33 #include <sys/types.h>
  34 #include <sys/systm.h>
  35 #include <sys/counter.h>
  36 #include <sys/epoch.h>
  37 #include <sys/gtaskqueue.h>
  38 #include <sys/kernel.h>
  39 #include <sys/limits.h>
  40 #include <sys/lock.h>
  41 #include <sys/malloc.h>
  42 #include <sys/mutex.h>
  43 #include <sys/pcpu.h>
  44 #include <sys/proc.h>
  45 #include <sys/sched.h>
  46 #include <sys/sx.h>
  47 #include <sys/smp.h>
  48 #include <sys/sysctl.h>
  49 #include <sys/turnstile.h>
  50 #include <vm/vm.h>
  51 #include <vm/vm_extern.h>
  52 #include <vm/vm_kern.h>
  53 #include <vm/uma.h>
  54
  55 #include <ck_epoch.h>
  56
  57 #ifdef __amd64__
  58 #define EPOCH_ALIGN CACHE_LINE_SIZE*2
  59 #else
  60 #define EPOCH_ALIGN CACHE_LINE_SIZE
  61 #endif
  62
TAILQ_HEAD (epoch_tdlist, epoch_tracker);
/*
 * Per-CPU state of one epoch.  epoch_ctor() creates one record per CPU
 * for every epoch and registers er_record with that epoch's CK state.
 */
typedef struct epoch_record {
        /* CK record handed to ck_epoch_begin()/ck_epoch_end()/ck_epoch_call(). */
        ck_epoch_record_t er_record;
        /*
         * Trackers queued by epoch_enter_preempt() and removed by
         * epoch_exit_preempt().
         */
        volatile struct epoch_tdlist er_tdlist;
        /*
         * Incremented by every epoch_exit_preempt(); polled by
         * epoch_block_handler_preempt() to notice that a section ended.
         */
        volatile uint32_t er_gen;
        /* CPU this record belongs to; set once in epoch_ctor(). */
        uint32_t er_cpuid;
        /* fields above are part of KBI and cannot be modified */
        /* Callback context queued on this record by epoch_drain_callbacks(). */
        struct epoch_context er_drain_ctx;
        /* Owning epoch; epoch_drain_cb() uses it to find the drain counter. */
        struct epoch *er_parent;
#ifdef INVARIANTS
        /* Used to verify record ownership for non-preemptible epochs. */
        struct thread *er_td;
#endif
} __aligned(EPOCH_ALIGN)     *epoch_record_t;
  77
/*
 * One epoch.  Instances live in epoch_array[] and are handed out by
 * epoch_alloc() and returned by epoch_free().
 */
struct epoch {
        /* CK epoch state shared by all of this epoch's per-CPU records. */
        struct ck_epoch e_epoch __aligned(EPOCH_ALIGN);
        /* Per-CPU records allocated from pcpu_zone_record in epoch_ctor(). */
        epoch_record_t e_pcpu_record;
        /*
         * Non-zero while the slot is allocated; epoch_call_task() skips
         * slots where this is zero.
         */
        int     e_in_use;
        /* Flags given to epoch_alloc(), e.g. EPOCH_PREEMPT. */
        int     e_flags;
        /* fields above are part of KBI and cannot be modified */
        /* Held exclusively for the whole of epoch_drain_callbacks(). */
        struct sx e_drain_sx;
        /* Interlock for the msleep()/wakeup() pair used while draining. */
        struct mtx e_drain_mtx;
        /* Drain callbacks queued by epoch_drain_callbacks() and not yet run. */
        volatile int e_drain_count;
};
  88
/* arbitrary --- needs benchmarking */
/* Bound on the polling loop over a remote record in epoch_block_handler_preempt(). */
#define MAX_ADAPTIVE_SPIN 100
/* Number of slots in epoch_array[], i.e. the most epochs allocated at once. */
#define MAX_EPOCHS 64

/* epoch_call() passes an epoch_context to CK as a ck_epoch_entry_t. */
CTASSERT(sizeof(ck_epoch_entry_t) == sizeof(struct epoch_context));
/* Sysctl tree: kern.epoch and kern.epoch.stats. */
SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW, 0, "epoch information");
SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW, 0, "epoch stats");
  96
/*
 * Stats.
 *
 * Each counter is allocated in epoch_init() and exported read/write under
 * the kern.epoch.stats sysctl node.
 */
/* Bumped on every entry to epoch_block_handler_preempt(). */
static counter_u64_t block_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, nblocked, CTLFLAG_RW,
    &block_count, "# of times a thread was in an epoch when epoch_wait was called");
/* Bumped before epoch_block_handler_preempt() binds to the record's CPU. */
static counter_u64_t migrate_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, migrations, CTLFLAG_RW,
    &migrate_count, "# of times thread was migrated to another CPU in epoch_wait");
/* Bumped after epoch_block_handler_preempt() returns from turnstile_wait(). */
static counter_u64_t turnstile_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, ncontended, CTLFLAG_RW,
    &turnstile_count, "# of times a thread was blocked on a lock in an epoch during an epoch_wait");
/* Bumped before epoch_block_handler_preempt() calls mi_switch(). */
static counter_u64_t switch_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, switches, CTLFLAG_RW,
    &switch_count, "# of times a thread voluntarily context switched in epoch_wait");
/*
 * NOTE(review): this is advanced in epoch_call_task() by the number of
 * callbacks it collected, not in epoch_call() when a callback is queued.
 */
static counter_u64_t epoch_call_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_calls, CTLFLAG_RW,
    &epoch_call_count, "# of times a callback was deferred");
/* Bumped once per run of epoch_call_task(). */
static counter_u64_t epoch_call_task_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_call_tasks, CTLFLAG_RW,
    &epoch_call_task_count, "# of times a callback task was run");
 122
/* Tail-queue head type for struct thread; epoch_ctor() casts to it. */
TAILQ_HEAD (threadlist, thread);

/* Generates ck_epoch_entry_container(), used by epoch_call_task(). */
CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
    ck_epoch_entry_container)

/* Storage for every epoch; a slot is free while its e_in_use is zero. */
static struct epoch epoch_array[MAX_EPOCHS];

/* Per-CPU grouptask running epoch_call_task(); set up in epoch_init(). */
DPCPU_DEFINE(struct grouptask, epoch_cb_task);
/*
 * Per-CPU counter: incremented by epoch_call(), decreased by
 * epoch_call_task() by the number of callbacks it collected.
 */
DPCPU_DEFINE(int, epoch_cb_count);

/*
 * Initialization stage: 0 until epoch_init() has run, 1 afterwards, and 2
 * once epoch_init_smp() has run (only built without EARLY_AP_STARTUP).
 */
static __read_mostly int inited;
/* Default epochs created by epoch_init(): flags 0 and EPOCH_PREEMPT. */
__read_mostly epoch_t global_epoch;
__read_mostly epoch_t global_epoch_preempt;

static void epoch_call_task(void *context __unused);
/* UMA per-CPU zone that backs every epoch's e_pcpu_record. */
static  uma_zone_t pcpu_zone_record;

/* Taken exclusively by epoch_alloc() and epoch_free(). */
static struct sx epoch_sx;

#define EPOCH_LOCK() sx_xlock(&epoch_sx)
#define EPOCH_UNLOCK() sx_xunlock(&epoch_sx)
 144
/*
 * Boot-time setup of the epoch subsystem, run from the SYSINIT below.
 * Allocates the statistics counters and the per-CPU record zone, attaches
 * one callback grouptask per CPU to qgroup_softirq, initializes epoch_sx
 * and creates the two global epochs.
 */
static void
epoch_init(void *arg __unused)
{
        int cpu;

        block_count = counter_u64_alloc(M_WAITOK);
        migrate_count = counter_u64_alloc(M_WAITOK);
        turnstile_count = counter_u64_alloc(M_WAITOK);
        switch_count = counter_u64_alloc(M_WAITOK);
        epoch_call_count = counter_u64_alloc(M_WAITOK);
        epoch_call_task_count = counter_u64_alloc(M_WAITOK);

        pcpu_zone_record = uma_zcreate("epoch_record pcpu",
            sizeof(struct epoch_record), NULL, NULL, NULL, NULL,
            UMA_ALIGN_PTR, UMA_ZONE_PCPU);
        /* One epoch_call_task() grouptask per CPU, attached to that CPU. */
        CPU_FOREACH(cpu) {
                GROUPTASK_INIT(DPCPU_ID_PTR(cpu, epoch_cb_task), 0,
                    epoch_call_task, NULL);
                taskqgroup_attach_cpu(qgroup_softirq,
                    DPCPU_ID_PTR(cpu, epoch_cb_task), NULL, cpu, -1,
                    "epoch call task");
        }
        sx_init(&epoch_sx, "epoch-sx");
        /* epoch_alloc() panics while inited is zero, so set it first. */
        inited = 1;
        global_epoch = epoch_alloc(0);
        global_epoch_preempt = epoch_alloc(EPOCH_PREEMPT);
}
SYSINIT(epoch, SI_SUB_TASKQ + 1, SI_ORDER_FIRST, epoch_init, NULL);
 173
#if !defined(EARLY_AP_STARTUP)
/*
 * Second initialization stage, run from the SYSINIT below at
 * SI_SUB_SMP + 1.  Until inited reaches 2, epoch_call() runs callbacks
 * immediately and epoch_drain_callbacks() returns without doing anything.
 */
static void
epoch_init_smp(void *dummy __unused)
{
        inited = 2;
}
SYSINIT(epoch_smp, SI_SUB_SMP + 1, SI_ORDER_FIRST, epoch_init_smp, NULL);
#endif
 182
 183 static void
 184 epoch_ctor(epoch_t epoch)
 185 {
 186         epoch_record_t er;
 187         int cpu;
 188
 189         epoch->e_pcpu_record = uma_zalloc_pcpu(pcpu_zone_record, M_WAITOK);
 190         CPU_FOREACH(cpu) {
 191                 er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
 192                 bzero(er, sizeof(*er));
 193                 ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL);
 194                 TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist);
 195                 er->er_cpuid = cpu;
 196                 er->er_parent = epoch;
 197         }
 198 }
 199
/*
 * Set the scheduling priority of 'td' to 'prio' with the thread lock held
 * around the sched_prio() call.
 */
static void
epoch_adjust_prio(struct thread *td, u_char prio)
{

        thread_lock(td);
        sched_prio(td, prio);
        thread_unlock(td);
}
 208
 209 epoch_t
 210 epoch_alloc(int flags)
 211 {
 212         epoch_t epoch;
 213         int i;
 214
 215         if (__predict_false(!inited))
 216                 panic("%s called too early in boot", __func__);
 217
 218         EPOCH_LOCK();
 219
 220         /*
 221          * Find a free index in the epoch array. If no free index is
 222          * found, try to use the index after the last one.
 223          */
 224         for (i = 0;; i++) {
 225                 /*
 226                  * If too many epochs are currently allocated,
 227                  * return NULL.
 228                  */
 229                 if (i == MAX_EPOCHS) {
 230                         epoch = NULL;
 231                         goto done;
 232                 }
 233                 if (epoch_array[i].e_in_use == 0)
 234                         break;
 235         }
 236
 237         epoch = epoch_array + i;
 238         ck_epoch_init(&epoch->e_epoch);
 239         epoch_ctor(epoch);
 240         epoch->e_flags = flags;
 241         sx_init(&epoch->e_drain_sx, "epoch-drain-sx");
 242         mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF);
 243
 244         /*
 245          * Set e_in_use last, because when this field is set the
 246          * epoch_call_task() function will start scanning this epoch
 247          * structure.
 248          */
 249         atomic_store_rel_int(&epoch->e_in_use, 1);
 250 done:
 251         EPOCH_UNLOCK();
 252         return (epoch);
 253 }
 254
/*
 * Release an epoch obtained from epoch_alloc().
 *
 * Drains the epoch's pending callbacks, marks the slot unused, waits on
 * global_epoch so that epoch_call_task() observes the change, then frees
 * the per-CPU records, destroys the drain locks and zeroes the slot.
 * Runs entirely under EPOCH_LOCK().  'epoch' must be a live epoch; it is
 * dereferenced without a NULL check.
 */
void
epoch_free(epoch_t epoch)
{
#ifdef INVARIANTS
        int cpu;
#endif

        EPOCH_LOCK();

        MPASS(epoch->e_in_use != 0);

        /* Let every callback already queued on this epoch run first. */
        epoch_drain_callbacks(epoch);

        atomic_store_rel_int(&epoch->e_in_use, 0);
        /*
         * Make sure the epoch_call_task() function sees e_in_use equal
         * to zero, by calling epoch_wait() on the global_epoch:
         */
        epoch_wait(global_epoch);
#ifdef INVARIANTS
        CPU_FOREACH(cpu) {
                epoch_record_t er;

                er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);

                /*
                 * Sanity check: none of the records should be in use anymore.
                 * We drained callbacks above and freeing the pcpu records is
                 * imminent.
                 */
                MPASS(er->er_td == NULL);
                MPASS(TAILQ_EMPTY(&er->er_tdlist));
        }
#endif
        uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record);
        mtx_destroy(&epoch->e_drain_mtx);
        sx_destroy(&epoch->e_drain_sx);
        /* Zeroing the slot also leaves e_in_use at 0 for the next epoch_alloc(). */
        memset(epoch, 0, sizeof(*epoch));

        EPOCH_UNLOCK();
}
 296
 297 static epoch_record_t
 298 epoch_currecord(epoch_t epoch)
 299 {
 300
 301         return (zpcpu_get_cpu(epoch->e_pcpu_record, curcpu));
 302 }
 303
/*
 * Return from the enclosing void function when 'epoch' is NULL.  The
 * callers in this file pair it with MPASS(cold || epoch != NULL).
 */
#define INIT_CHECK(epoch)                                       \
        do {                                                    \
                if (__predict_false((epoch) == NULL))           \
                        return;                                 \
        } while (0)
 309
/*
 * Enter a read section of a preemptible epoch (one allocated with
 * EPOCH_PREEMPT).
 *
 * 'et' is the caller-supplied tracker for this section; it is queued on
 * the current CPU's record until epoch_exit_preempt() removes it.  The
 * thread is pinned with sched_pin(), and its current priority is saved in
 * td_pre_epoch_prio so that epoch_exit_preempt() can restore it.  The
 * critical section taken here is dropped again before returning.
 * Returns without doing anything when 'epoch' is NULL.
 */
void
epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et)
{
        struct epoch_record *er;
        struct thread *td;

        MPASS(cold || epoch != NULL);
        INIT_CHECK(epoch);
        MPASS(epoch->e_flags & EPOCH_PREEMPT);
#ifdef EPOCH_TRACKER_DEBUG
        et->et_magic_pre = EPOCH_MAGIC0;
        et->et_magic_post = EPOCH_MAGIC1;
#endif
        td = curthread;
        et->et_td = td;
        td->td_epochnest++;
        /* The record lookup and list insertion must not be preempted. */
        critical_enter();
        sched_pin();

        td->td_pre_epoch_prio = td->td_priority;
        er = epoch_currecord(epoch);
        /* Record-level tracking is reserved for non-preemptible epochs. */
        MPASS(er->er_td == NULL);
        TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link);
        ck_epoch_begin(&er->er_record, &et->et_section);
        critical_exit();
}
 337
/*
 * Enter a read section of a non-preemptible epoch.
 *
 * Enters a critical section that is left only by the matching
 * epoch_exit().  Returns without doing anything when 'epoch' is NULL.
 * With INVARIANTS the record remembers the owning thread so that
 * recursion and ownership can be checked.
 */
void
epoch_enter(epoch_t epoch)
{
        struct thread *td;
        epoch_record_t er;

        MPASS(cold || epoch != NULL);
        INIT_CHECK(epoch);
        td = curthread;

        td->td_epochnest++;
        critical_enter();
        er = epoch_currecord(epoch);
#ifdef INVARIANTS
        if (er->er_record.active == 0) {
                /* Outermost entry on this CPU: claim the record. */
                MPASS(er->er_td == NULL);
                er->er_td = curthread;
        } else {
                /* We've recursed, just make sure our accounting isn't wrong. */
                MPASS(er->er_td == curthread);
        }
#endif
        ck_epoch_begin(&er->er_record, NULL);
}
 362
/*
 * Leave a read section entered with epoch_enter_preempt().
 *
 * 'et' must be the tracker passed to the matching enter call.  Unpins
 * the thread, ends the CK section, removes the tracker from the current
 * CPU's record and advances the record's generation counter.  If the
 * thread's priority differs from the one saved on entry, the saved
 * priority is restored.  Returns without doing anything when 'epoch' is
 * NULL.
 */
void
epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et)
{
        struct epoch_record *er;
        struct thread *td;

        INIT_CHECK(epoch);
        td = curthread;
        critical_enter();
        sched_unpin();
        MPASS(td->td_epochnest);
        td->td_epochnest--;
        er = epoch_currecord(epoch);
        MPASS(epoch->e_flags & EPOCH_PREEMPT);
        MPASS(et != NULL);
        MPASS(et->et_td == td);
#ifdef EPOCH_TRACKER_DEBUG
        MPASS(et->et_magic_pre == EPOCH_MAGIC0);
        MPASS(et->et_magic_post == EPOCH_MAGIC1);
        et->et_magic_pre = 0;
        et->et_magic_post = 0;
#endif
#ifdef INVARIANTS
        /* Poison the owner so that reuse of a stale tracker is caught. */
        et->et_td = (void*)0xDEADBEEF;
        /* Record-level tracking is reserved for non-preemptible epochs. */
        MPASS(er->er_td == NULL);
#endif
        ck_epoch_end(&er->er_record, &et->et_section);
        TAILQ_REMOVE(&er->er_tdlist, et, et_link);
        /* Lets a waiter polling er_gen see that a section has ended. */
        er->er_gen++;
        if (__predict_false(td->td_pre_epoch_prio != td->td_priority))
                epoch_adjust_prio(td, td->td_pre_epoch_prio);
        critical_exit();
}
 397
/*
 * Leave a read section entered with epoch_enter().
 *
 * Ends the CK section on the current CPU's record and leaves the critical
 * section that epoch_enter() took.  Returns without doing anything when
 * 'epoch' is NULL.
 */
void
epoch_exit(epoch_t epoch)
{
        struct thread *td;
        epoch_record_t er;

        INIT_CHECK(epoch);
        td = curthread;
        MPASS(td->td_epochnest);
        td->td_epochnest--;
        er = epoch_currecord(epoch);
        ck_epoch_end(&er->er_record, NULL);
#ifdef INVARIANTS
        MPASS(er->er_td == curthread);
        /* Outermost exit: the record no longer has an owner. */
        if (er->er_record.active == 0)
                er->er_td = NULL;
#endif
        critical_exit();
}
 417
/*
 * epoch_block_handler_preempt() is a callback from the CK code when another
 * thread is currently in an epoch section.
 *
 * 'cr' is the CK record that is still active; its enclosing epoch_record
 * is recovered with __containerof().  epoch_wait_preempt() passes this
 * function to ck_epoch_synchronize_wait() while holding the current
 * thread's lock, so the thread lock is held on entry and on every return.
 *
 * The handler takes one of three routes:
 *  - the record belongs to another CPU: poll briefly, then bind to that
 *    CPU and return so CK rescans;
 *  - a tracked thread is blocked on a turnstile: wait on that turnstile;
 *  - otherwise: voluntarily switch away.
 */
static void
epoch_block_handler_preempt(struct ck_epoch *global __unused,
    ck_epoch_record_t *cr, void *arg __unused)
{
        epoch_record_t record;
        struct thread *td, *owner, *curwaittd;
        struct epoch_tracker *tdwait;
        struct turnstile *ts;
        struct lock_object *lock;
        int spincount, gen;
        int locksheld __unused;

        record = __containerof(cr, struct epoch_record, er_record);
        td = curthread;
        locksheld = td->td_locks;
        spincount = 0;
        counter_u64_add(block_count, 1);
        /*
         * We lost a race and there's no longer any threads
         * on the CPU in an epoch section.
         */
        if (TAILQ_EMPTY(&record->er_tdlist))
                return;

        if (record->er_cpuid != curcpu) {
                /*
                 * If the head of the list is running, we can wait for it
                 * to remove itself from the list and thus save us the
                 * overhead of a migration
                 */
                gen = record->er_gen;
                thread_unlock(td);
                /*
                 * We can't actually check if the waiting thread is running
                 * so we simply poll for it to exit before giving up and
                 * migrating.
                 */
                do {
                        cpu_spinwait();
                } while (!TAILQ_EMPTY(&record->er_tdlist) &&
                                 gen == record->er_gen &&
                                 spincount++ < MAX_ADAPTIVE_SPIN);
                thread_lock(td);
                /*
                 * If the generation has changed we can poll again
                 * otherwise we need to migrate.
                 */
                if (gen != record->er_gen)
                        return;
                /*
                 * Being on the same CPU as that of the record on which
                 * we need to wait allows us access to the thread
                 * list associated with that CPU. We can then examine the
                 * oldest thread in the queue and wait on its turnstile
                 * until it resumes and so on until a grace period
                 * elapses.
                 *
                 */
                counter_u64_add(migrate_count, 1);
                sched_bind(td, record->er_cpuid);
                /*
                 * At this point we need to return to the ck code
                 * to scan to see if a grace period has elapsed.
                 * We can't move on to check the thread list, because
                 * in the meantime new threads may have arrived that
                 * in fact belong to a different epoch.
                 */
                return;
        }
        /*
         * Try to find a thread in an epoch section on this CPU
         * waiting on a turnstile. Otherwise find the lowest
         * priority thread (highest prio value) and drop our priority
         * to match to allow it to run.
         */
        TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) {
                /*
                 * Propagate our priority to any other waiters to prevent us
                 * from starving them. They will have their original priority
                 * restored on exit from epoch_wait().
                 */
                curwaittd = tdwait->et_td;
                if (!TD_IS_INHIBITED(curwaittd) && curwaittd->td_priority > td->td_priority) {
                        /*
                         * Swap our thread lock for curwaittd's while its
                         * priority is changed, staying in a critical
                         * section for the duration.
                         */
                        critical_enter();
                        thread_unlock(td);
                        thread_lock(curwaittd);
                        sched_prio(curwaittd, td->td_priority);
                        thread_unlock(curwaittd);
                        thread_lock(td);
                        critical_exit();
                }
                if (TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd) &&
                    ((ts = curwaittd->td_blocked) != NULL)) {
                        /*
                         * We unlock td to allow turnstile_wait to reacquire
                         * the thread lock. Before unlocking it we enter a
                         * critical section to prevent preemption after we
                         * reenable interrupts by dropping the thread lock in
                         * order to prevent curwaittd from getting to run.
                         */
                        critical_enter();
                        thread_unlock(td);

                        if (turnstile_lock(ts, &lock, &owner)) {
                                /* Recheck: curwaittd may have moved on meanwhile. */
                                if (ts == curwaittd->td_blocked) {
                                        MPASS(TD_IS_INHIBITED(curwaittd) &&
                                            TD_ON_LOCK(curwaittd));
                                        critical_exit();
                                        turnstile_wait(ts, owner,
                                            curwaittd->td_tsqueue);
                                        counter_u64_add(turnstile_count, 1);
                                        thread_lock(td);
                                        return;
                                }
                                turnstile_unlock(ts, lock);
                        }
                        thread_lock(td);
                        critical_exit();
                        KASSERT(td->td_locks == locksheld,
                            ("%d extra locks held", td->td_locks - locksheld));
                }
        }
        /*
         * We didn't find any threads actually blocked on a lock
         * so we have nothing to do except context switch away.
         */
        counter_u64_add(switch_count, 1);
        mi_switch(SW_VOL | SWT_RELINQUISH, NULL);

        /*
         * Release the thread lock while yielding to
         * allow other threads to acquire the lock
         * pointed to by TDQ_LOCKPTR(td). Else a
         * deadlock like situation might happen. (HPS)
         */
        thread_unlock(td);
        thread_lock(td);
}
 560
/*
 * Wait for a grace period of a preemptible epoch.
 *
 * Saves the caller's CPU, pin count, priority and binding state, clears
 * the pin count and binds the thread to its current CPU, then calls
 * ck_epoch_synchronize_wait() with epoch_block_handler_preempt() as the
 * blocking callback, with the thread lock held.  Afterwards the saved
 * binding, pin count and priority are restored.  Giant is dropped for the
 * duration.  Returns without doing anything when 'epoch' is NULL.
 * With INVARIANTS, calling this from inside a section of the same epoch
 * trips an assertion.
 */
void
epoch_wait_preempt(epoch_t epoch)
{
        struct thread *td;
        int was_bound;
        int old_cpu;
        int old_pinned;
        u_char old_prio;
        int locks __unused;

        MPASS(cold || epoch != NULL);
        INIT_CHECK(epoch);
        td = curthread;
#ifdef INVARIANTS
        locks = curthread->td_locks;
        MPASS(epoch->e_flags & EPOCH_PREEMPT);
        if ((epoch->e_flags & EPOCH_LOCKED) == 0)
                WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
                    "epoch_wait() can be long running");
        KASSERT(!in_epoch(epoch), ("epoch_wait_preempt() called in the middle "
            "of an epoch section of the same epoch"));
#endif
        DROP_GIANT();
        thread_lock(td);

        /* Remember the scheduling state that the block handler may change. */
        old_cpu = PCPU_GET(cpuid);
        old_pinned = td->td_pinned;
        old_prio = td->td_priority;
        was_bound = sched_is_bound(td);
        sched_unbind(td);
        td->td_pinned = 0;
        sched_bind(td, old_cpu);

        ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt,
            NULL);

        /* restore CPU binding, if any */
        if (was_bound != 0) {
                sched_bind(td, old_cpu);
        } else {
                /* get thread back to initial CPU, if any */
                if (old_pinned != 0)
                        sched_bind(td, old_cpu);
                sched_unbind(td);
        }
        /* restore pinned after bind */
        td->td_pinned = old_pinned;

        /* restore thread priority */
        sched_prio(td, old_prio);
        thread_unlock(td);
        PICKUP_GIANT();
        KASSERT(td->td_locks == locks,
            ("%d residual locks held", td->td_locks - locks));
}
 616
/*
 * Blocking callback that epoch_wait() hands to
 * ck_epoch_synchronize_wait(): issue a spin-wait hint and return.  All
 * three arguments are unused.
 */
static void
epoch_block_handler(struct ck_epoch *g __unused, ck_epoch_record_t *c __unused,
    void *arg __unused)
{
        cpu_spinwait();
}
 623
/*
 * Wait for a grace period of a non-preemptible epoch (one whose e_flags
 * is 0, as asserted below).  The wait runs inside a critical section with
 * epoch_block_handler() as the blocking callback.  Returns without doing
 * anything when 'epoch' is NULL.
 */
void
epoch_wait(epoch_t epoch)
{

        MPASS(cold || epoch != NULL);
        INIT_CHECK(epoch);
        MPASS(epoch->e_flags == 0);
        critical_enter();
        ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL);
        critical_exit();
}
 635
 636 void
 637 epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t))
 638 {
 639         epoch_record_t er;
 640         ck_epoch_entry_t *cb;
 641
 642         cb = (void *)ctx;
 643
 644         MPASS(callback);
 645         /* too early in boot to have epoch set up */
 646         if (__predict_false(epoch == NULL))
 647                 goto boottime;
 648 #if !defined(EARLY_AP_STARTUP)
 649         if (__predict_false(inited < 2))
 650                 goto boottime;
 651 #endif
 652
 653         critical_enter();
 654         *DPCPU_PTR(epoch_cb_count) += 1;
 655         er = epoch_currecord(epoch);
 656         ck_epoch_call(&er->er_record, cb, (ck_epoch_cb_t *)callback);
 657         critical_exit();
 658         return;
 659 boottime:
 660         callback(ctx);
 661 }
 662
 663 static void
 664 epoch_call_task(void *arg __unused)
 665 {
 666         ck_stack_entry_t *cursor, *head, *next;
 667         ck_epoch_record_t *record;
 668         epoch_record_t er;
 669         epoch_t epoch;
 670         ck_stack_t cb_stack;
 671         int i, npending, total;
 672
 673         ck_stack_init(&cb_stack);
 674         critical_enter();
 675         epoch_enter(global_epoch);
 676         for (total = i = 0; i != MAX_EPOCHS; i++) {
 677                 epoch = epoch_array + i;
 678                 if (__predict_false(
 679                     atomic_load_acq_int(&epoch->e_in_use) == 0))
 680                         continue;
 681                 er = epoch_currecord(epoch);
 682                 record = &er->er_record;
 683                 if ((npending = record->n_pending) == 0)
 684                         continue;
 685                 ck_epoch_poll_deferred(record, &cb_stack);
 686                 total += npending - record->n_pending;
 687         }
 688         epoch_exit(global_epoch);
 689         *DPCPU_PTR(epoch_cb_count) -= total;
 690         critical_exit();
 691
 692         counter_u64_add(epoch_call_count, total);
 693         counter_u64_add(epoch_call_task_count, 1);
 694
 695         head = ck_stack_batch_pop_npsc(&cb_stack);
 696         for (cursor = head; cursor != NULL; cursor = next) {
 697                 struct ck_epoch_entry *entry =
 698                     ck_epoch_entry_container(cursor);
 699
 700                 next = CK_STACK_NEXT(cursor);
 701                 entry->function(entry);
 702         }
 703 }
 704
 705 static int
 706 in_epoch_verbose_preempt(epoch_t epoch, int dump_onfail)
 707 {
 708         epoch_record_t er;
 709         struct epoch_tracker *tdwait;
 710         struct thread *td;
 711
 712         MPASS(epoch != NULL);
 713         MPASS((epoch->e_flags & EPOCH_PREEMPT) != 0);
 714         td = curthread;
 715         if (td->td_epochnest == 0)
 716                 return (0);
 717         critical_enter();
 718         er = epoch_currecord(epoch);
 719         TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
 720                 if (tdwait->et_td == td) {
 721                         critical_exit();
 722                         return (1);
 723                 }
 724 #ifdef INVARIANTS
 725         if (dump_onfail) {
 726                 MPASS(td->td_pinned);
 727                 printf("cpu: %d id: %d\n", curcpu, td->td_tid);
 728                 TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
 729                         printf("td_tid: %d ", tdwait->et_td->td_tid);
 730                 printf("\n");
 731         }
 732 #endif
 733         critical_exit();
 734         return (0);
 735 }
 736
 737 #ifdef INVARIANTS
 738 static void
 739 epoch_assert_nocpu(epoch_t epoch, struct thread *td)
 740 {
 741         epoch_record_t er;
 742         int cpu;
 743         bool crit;
 744
 745         crit = td->td_critnest > 0;
 746
 747         /* Check for a critical section mishap. */
 748         CPU_FOREACH(cpu) {
 749                 er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
 750                 KASSERT(er->er_td != td,
 751                     ("%s critical section in epoch from cpu %d",
 752                     (crit ? "exited" : "re-entered"), cpu));
 753         }
 754 }
 755 #else
 756 #define epoch_assert_nocpu(e, td)
 757 #endif
 758
 759 int
 760 in_epoch_verbose(epoch_t epoch, int dump_onfail)
 761 {
 762         epoch_record_t er;
 763         struct thread *td;
 764
 765         if (__predict_false((epoch) == NULL))
 766                 return (0);
 767         if ((epoch->e_flags & EPOCH_PREEMPT) != 0)
 768                 return (in_epoch_verbose_preempt(epoch, dump_onfail));
 769
 770         /*
 771          * The thread being in a critical section is a necessary
 772          * condition to be correctly inside a non-preemptible epoch,
 773          * so it's definitely not in this epoch.
 774          */
 775         td = curthread;
 776         if (td->td_critnest == 0) {
 777                 epoch_assert_nocpu(epoch, td);
 778                 return (0);
 779         }
 780
 781         /*
 782          * The current cpu is in a critical section, so the epoch record will be
 783          * stable for the rest of this function.  Knowing that the record is not
 784          * active is sufficient for knowing whether we're in this epoch or not,
 785          * since it's a pcpu record.
 786          */
 787         er = epoch_currecord(epoch);
 788         if (er->er_record.active == 0) {
 789                 epoch_assert_nocpu(epoch, td);
 790                 return (0);
 791         }
 792
 793         MPASS(er->er_td == td);
 794         return (1);
 795 }
 796
/*
 * Return non-zero when the current thread is inside a section of 'epoch';
 * same as in_epoch_verbose() with dumping disabled.
 */
int
in_epoch(epoch_t epoch)
{
        return (in_epoch_verbose(epoch, 0));
}
 802
 803 static void
 804 epoch_drain_cb(struct epoch_context *ctx)
 805 {
 806         struct epoch *epoch =
 807             __containerof(ctx, struct epoch_record, er_drain_ctx)->er_parent;
 808
 809         if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) {
 810                 mtx_lock(&epoch->e_drain_mtx);
 811                 wakeup(epoch);
 812                 mtx_unlock(&epoch->e_drain_mtx);
 813         }
 814 }
 815
/*
 * Block until one drain callback per CPU, queued on 'epoch' by this
 * call, has run.
 *
 * The thread binds itself to each CPU in turn and queues that CPU's
 * er_drain_ctx with epoch_call(), then restores its binding and pin count
 * and sleeps on the epoch until epoch_drain_cb() has brought
 * e_drain_count back to zero.  Concurrent drains of one epoch are
 * serialized by e_drain_sx.  Giant is dropped for the duration.
 * Returns immediately when 'epoch' is NULL or (without EARLY_AP_STARTUP)
 * initialization has not reached stage 2.  May sleep.
 */
void
epoch_drain_callbacks(epoch_t epoch)
{
        epoch_record_t er;
        struct thread *td;
        int was_bound;
        int old_pinned;
        int old_cpu;
        int cpu;

        WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
            "epoch_drain_callbacks() may sleep!");

        /* too early in boot to have epoch set up */
        if (__predict_false(epoch == NULL))
                return;
#if !defined(EARLY_AP_STARTUP)
        if (__predict_false(inited < 2))
                return;
#endif
        DROP_GIANT();

        sx_xlock(&epoch->e_drain_sx);
        mtx_lock(&epoch->e_drain_mtx);

        td = curthread;
        thread_lock(td);
        /* Remember the scheduling state so it can be restored below. */
        old_cpu = PCPU_GET(cpuid);
        old_pinned = td->td_pinned;
        was_bound = sched_is_bound(td);
        sched_unbind(td);
        td->td_pinned = 0;

        /* Account for every callback before the first one is queued. */
        CPU_FOREACH(cpu)
                epoch->e_drain_count++;
        CPU_FOREACH(cpu) {
                er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
                /* epoch_call() queues on the current CPU's record. */
                sched_bind(td, cpu);
                epoch_call(epoch, &er->er_drain_ctx, &epoch_drain_cb);
        }

        /* restore CPU binding, if any */
        if (was_bound != 0) {
                sched_bind(td, old_cpu);
        } else {
                /* get thread back to initial CPU, if any */
                if (old_pinned != 0)
                        sched_bind(td, old_cpu);
                sched_unbind(td);
        }
        /* restore pinned after bind */
        td->td_pinned = old_pinned;

        thread_unlock(td);

        /* epoch_drain_cb() calls wakeup() on the epoch at the final decrement. */
        while (epoch->e_drain_count != 0)
                msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0);

        mtx_unlock(&epoch->e_drain_mtx);
        sx_xunlock(&epoch->e_drain_sx);

        PICKUP_GIANT();
}
 879
 880 /* for binary compatibility */
 881
/*
 * Size placeholder for struct epoch_tracker: three pointers plus one int
 * (five with EPOCH_TRACKER_DEBUG), pointer-aligned.  The CTASSERT below
 * breaks the build if struct epoch_tracker grows beyond it.
 * NOTE(review): presumably this is the tracker size promised to modules
 * built against the KBI -- confirm against sys/epoch.h.
 */
struct epoch_tracker_KBI {
        void *datap[3];
#ifdef EPOCH_TRACKER_DEBUG
        int datai[5];
#else
        int datai[1];
#endif
} __aligned(sizeof(void *));

CTASSERT(sizeof(struct epoch_tracker_KBI) >= sizeof(struct epoch_tracker));
 892
/* Binary-compatibility entry point; forwards to epoch_enter_preempt(). */
void
epoch_enter_preempt_KBI(epoch_t epoch, epoch_tracker_t et)
{
        epoch_enter_preempt(epoch, et);
}
 898
/* Binary-compatibility entry point; forwards to epoch_exit_preempt(). */
void
epoch_exit_preempt_KBI(epoch_t epoch, epoch_tracker_t et)
{
        epoch_exit_preempt(epoch, et);
}
 904
/* Binary-compatibility entry point; forwards to epoch_enter(). */
void
epoch_enter_KBI(epoch_t epoch)
{
        epoch_enter(epoch);
}
 910
/* Binary-compatibility entry point; forwards to epoch_exit(). */
void
epoch_exit_KBI(epoch_t epoch)
{
        epoch_exit(epoch);
}