sys/kern/subr_epoch.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-2-Clause
   3  *
   4  * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  25  * SUCH DAMAGE.
  26  *
  27  */
  28
  29 #include <sys/cdefs.h>
  30 #include <sys/param.h>
  31 #include <sys/systm.h>
  32 #include <sys/counter.h>
  33 #include <sys/epoch.h>
  34 #include <sys/gtaskqueue.h>
  35 #include <sys/kernel.h>
  36 #include <sys/limits.h>
  37 #include <sys/lock.h>
  38 #include <sys/malloc.h>
  39 #include <sys/mutex.h>
  40 #include <sys/pcpu.h>
  41 #include <sys/proc.h>
  42 #include <sys/sched.h>
  43 #include <sys/sx.h>
  44 #include <sys/smp.h>
  45 #include <sys/sysctl.h>
  46 #include <sys/turnstile.h>
  47 #ifdef EPOCH_TRACE
  48 #include <machine/stdarg.h>
  49 #include <sys/stack.h>
  50 #include <sys/tree.h>
  51 #endif
  52 #include <vm/vm.h>
  53 #include <vm/vm_extern.h>
  54 #include <vm/vm_kern.h>
  55 #include <vm/uma.h>
  56
  57 #include <ck_epoch.h>
  58
  59 #ifdef __amd64__
  60 #define EPOCH_ALIGN CACHE_LINE_SIZE*2
  61 #else
  62 #define EPOCH_ALIGN CACHE_LINE_SIZE
  63 #endif
  64
/* List head type for the epoch trackers queued on one per-CPU record. */
TAILQ_HEAD (epoch_tdlist, epoch_tracker);
/*
 * Per-CPU state of one epoch.  epoch_ctor() allocates one record per CPU
 * for every epoch and initializes the fields below.
 */
typedef struct epoch_record {
        /* CK record, registered against the parent's e_epoch. */
        ck_epoch_record_t er_record;
        /*
         * NOTE(review): not referenced in the visible part of this file;
         * presumably used by the callback drain code -- confirm.
         */
        struct epoch_context er_drain_ctx;
        /* Epoch this record belongs to. */
        struct epoch *er_parent;
        /*
         * Trackers of the threads that entered a preemptible section on
         * this CPU and have not left it yet.
         */
        volatile struct epoch_tdlist er_tdlist;
        /*
         * Incremented on every _epoch_exit_preempt(); polled by
         * epoch_block_handler_preempt() to detect progress.
         */
        volatile uint32_t er_gen;
        /* CPU that owns this record. */
        uint32_t er_cpuid;
#ifdef INVARIANTS
        /* Used to verify record ownership for non-preemptible epochs. */
        struct thread *er_td;
#endif
} __aligned(EPOCH_ALIGN)     *epoch_record_t;
  78
/*
 * One epoch domain.  Instances live in the static epoch_array[] and are
 * handed out by epoch_alloc() / returned by epoch_free().
 */
struct epoch {
        /* CK global epoch state shared by all per-CPU records. */
        struct ck_epoch e_epoch __aligned(EPOCH_ALIGN);
        /* Per-CPU records, allocated from pcpu_zone_record. */
        epoch_record_t e_pcpu_record;
        /*
         * Non-zero while the slot is allocated.  Set last in epoch_alloc()
         * and cleared in epoch_free(); epoch_call_task() skips slots
         * where it is zero.
         */
        int     e_in_use;
        /* Flags passed to epoch_alloc(), e.g. EPOCH_PREEMPT. */
        int     e_flags;
        /*
         * Drain synchronization.  NOTE(review): only initialized and
         * destroyed in the visible part of this file; the users are
         * presumably in the drain code -- confirm.
         */
        struct sx e_drain_sx;
        struct mtx e_drain_mtx;
        volatile int e_drain_count;
        /* Name passed to epoch_alloc(); the pointer is stored, not copied. */
        const char *e_name;
};
  89
/*
 * Upper bound on the number of polling iterations that
 * epoch_block_handler_preempt() spends waiting for a remote record to make
 * progress before it migrates to that CPU.
 * arbitrary --- needs benchmarking
 */
#define MAX_ADAPTIVE_SPIN 100
/* Number of slots in epoch_array[], i.e. the maximum number of epochs. */
#define MAX_EPOCHS 64

/*
 * epoch_call() hands a struct epoch_context to CK as a ck_epoch_entry_t,
 * so the two types must have the same size.
 */
CTASSERT(sizeof(ck_epoch_entry_t) == sizeof(struct epoch_context));
/* Sysctl tree: kern.epoch and kern.epoch.stats. */
SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "epoch information");
SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "epoch stats");
  99
/*
 * Stats.  All counters are allocated in epoch_init() and exported under
 * kern.epoch.stats.
 */

/* Incremented on every invocation of epoch_block_handler_preempt(). */
static counter_u64_t block_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, nblocked, CTLFLAG_RW,
    &block_count, "# of times a thread was in an epoch when epoch_wait was called");
/* Incremented when the block handler binds the waiter to another CPU. */
static counter_u64_t migrate_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, migrations, CTLFLAG_RW,
    &migrate_count, "# of times thread was migrated to another CPU in epoch_wait");
/* Incremented when the block handler waits on a turnstile. */
static counter_u64_t turnstile_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, ncontended, CTLFLAG_RW,
    &turnstile_count, "# of times a thread was blocked on a lock in an epoch during an epoch_wait");
/* Incremented when the block handler yields through mi_switch(). */
static counter_u64_t switch_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, switches, CTLFLAG_RW,
    &switch_count, "# of times a thread voluntarily context switched in epoch_wait");
/*
 * Advanced in epoch_call_task() by the number of callbacks that left the
 * pending lists during that run.
 */
static counter_u64_t epoch_call_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_calls, CTLFLAG_RW,
    &epoch_call_count, "# of times a callback was deferred");
/* Incremented once per epoch_call_task() run. */
static counter_u64_t epoch_call_task_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_call_tasks, CTLFLAG_RW,
    &epoch_call_task_count, "# of times a callback task was run");
 125
/* List head type for a queue of threads. */
TAILQ_HEAD (threadlist, thread);

/*
 * Generates ck_epoch_entry_container(), which maps a stack_entry link back
 * to its enclosing struct ck_epoch_entry; used by epoch_call_task().
 */
CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
    ck_epoch_entry_container)

/* Backing storage for every epoch; slots are claimed via e_in_use. */
static struct epoch epoch_array[MAX_EPOCHS];

/* Per-CPU task that runs epoch_call_task(); attached in epoch_init(). */
DPCPU_DEFINE(struct grouptask, epoch_cb_task);
/*
 * Per-CPU callback count: incremented by epoch_call(), decremented by
 * epoch_call_task() for the callbacks it collected.
 */
DPCPU_DEFINE(int, epoch_cb_count);

/*
 * Initialization stage: 0 before epoch_init(), 1 once it has run, and 2
 * after epoch_init_smp() (only when EARLY_AP_STARTUP is not defined).
 */
static __read_mostly int inited;
/* Default epochs created by epoch_init(). */
__read_mostly epoch_t global_epoch;
__read_mostly epoch_t global_epoch_preempt;

static void epoch_call_task(void *context __unused);
/* UMA per-CPU zone from which the epoch records are allocated. */
static  uma_zone_t pcpu_zone_record;

/* Serializes epoch_alloc() and epoch_free(). */
static struct sx epoch_sx;

#define EPOCH_LOCK() sx_xlock(&epoch_sx)
#define EPOCH_UNLOCK() sx_xunlock(&epoch_sx)
 147
 148 #ifdef EPOCH_TRACE
/*
 * Node of the epoch_stacks tree: one saved call stack for which
 * epoch_trace_report() has already produced a report.
 */
struct stackentry {
        RB_ENTRY(stackentry) se_node;   /* tree linkage */
        struct stack se_stack;          /* saved stack; also the sort key */
};
 153
 154 static int
 155 stackentry_compare(struct stackentry *a, struct stackentry *b)
 156 {
 157
 158         if (a->se_stack.depth > b->se_stack.depth)
 159                 return (1);
 160         if (a->se_stack.depth < b->se_stack.depth)
 161                 return (-1);
 162         for (int i = 0; i < a->se_stack.depth; i++) {
 163                 if (a->se_stack.pcs[i] > b->se_stack.pcs[i])
 164                         return (1);
 165                 if (a->se_stack.pcs[i] < b->se_stack.pcs[i])
 166                         return (-1);
 167         }
 168
 169         return (0);
 170 }
 171
/* Set of call stacks that have already been reported. */
RB_HEAD(stacktree, stackentry) epoch_stacks = RB_INITIALIZER(&epoch_stacks);
RB_GENERATE_STATIC(stacktree, stackentry, se_node, stackentry_compare);

/* Held around insertions into epoch_stacks; lookups are done without it. */
static struct mtx epoch_stacks_lock;
MTX_SYSINIT(epochstacks, &epoch_stacks_lock, "epoch_stacks", MTX_DEF);

/* When true, epoch_trace_report() prints the stack after the message. */
static bool epoch_trace_stack_print = true;
SYSCTL_BOOL(_kern_epoch, OID_AUTO, trace_stack_print, CTLFLAG_RWTUN,
    &epoch_trace_stack_print, 0, "Print stack traces on epoch reports");
 181
 182 static void epoch_trace_report(const char *fmt, ...) __printflike(1, 2);
 183 static inline void
 184 epoch_trace_report(const char *fmt, ...)
 185 {
 186         va_list ap;
 187         struct stackentry se, *new;
 188
 189         stack_zero(&se.se_stack);       /* XXX: is it really needed? */
 190         stack_save(&se.se_stack);
 191
 192         /* Tree is never reduced - go lockless. */
 193         if (RB_FIND(stacktree, &epoch_stacks, &se) != NULL)
 194                 return;
 195
 196         new = malloc(sizeof(*new), M_STACK, M_NOWAIT);
 197         if (new != NULL) {
 198                 bcopy(&se.se_stack, &new->se_stack, sizeof(struct stack));
 199
 200                 mtx_lock(&epoch_stacks_lock);
 201                 new = RB_INSERT(stacktree, &epoch_stacks, new);
 202                 mtx_unlock(&epoch_stacks_lock);
 203                 if (new != NULL)
 204                         free(new, M_STACK);
 205         }
 206
 207         va_start(ap, fmt);
 208         (void)vprintf(fmt, ap);
 209         va_end(ap);
 210         if (epoch_trace_stack_print)
 211                 stack_print_ddb(&se.se_stack);
 212 }
 213
 214 static inline void
 215 epoch_trace_enter(struct thread *td, epoch_t epoch, epoch_tracker_t et,
 216     const char *file, int line)
 217 {
 218         epoch_tracker_t iet;
 219
 220         SLIST_FOREACH(iet, &td->td_epochs, et_tlink) {
 221                 if (iet->et_epoch != epoch)
 222                         continue;
 223                 epoch_trace_report("Recursively entering epoch %s "
 224                     "at %s:%d, previously entered at %s:%d\n",
 225                     epoch->e_name, file, line,
 226                     iet->et_file, iet->et_line);
 227         }
 228         et->et_epoch = epoch;
 229         et->et_file = file;
 230         et->et_line = line;
 231         SLIST_INSERT_HEAD(&td->td_epochs, et, et_tlink);
 232 }
 233
 234 static inline void
 235 epoch_trace_exit(struct thread *td, epoch_t epoch, epoch_tracker_t et,
 236     const char *file, int line)
 237 {
 238
 239         if (SLIST_FIRST(&td->td_epochs) != et) {
 240                 epoch_trace_report("Exiting epoch %s in a not nested order "
 241                     "at %s:%d. Most recently entered %s at %s:%d\n",
 242                     epoch->e_name,
 243                     file, line,
 244                     SLIST_FIRST(&td->td_epochs)->et_epoch->e_name,
 245                     SLIST_FIRST(&td->td_epochs)->et_file,
 246                     SLIST_FIRST(&td->td_epochs)->et_line);
 247                 /* This will panic if et is not anywhere on td_epochs. */
 248                 SLIST_REMOVE(&td->td_epochs, et, epoch_tracker, et_tlink);
 249         } else
 250                 SLIST_REMOVE_HEAD(&td->td_epochs, et_tlink);
 251 }
 252
 253 /* Used by assertions that check thread state before going to sleep. */
 254 void
 255 epoch_trace_list(struct thread *td)
 256 {
 257         epoch_tracker_t iet;
 258
 259         SLIST_FOREACH(iet, &td->td_epochs, et_tlink)
 260                 printf("Epoch %s entered at %s:%d\n", iet->et_epoch->e_name,
 261                     iet->et_file, iet->et_line);
 262 }
 263 #endif /* EPOCH_TRACE */
 264
 265 static void
 266 epoch_init(void *arg __unused)
 267 {
 268         int cpu;
 269
 270         block_count = counter_u64_alloc(M_WAITOK);
 271         migrate_count = counter_u64_alloc(M_WAITOK);
 272         turnstile_count = counter_u64_alloc(M_WAITOK);
 273         switch_count = counter_u64_alloc(M_WAITOK);
 274         epoch_call_count = counter_u64_alloc(M_WAITOK);
 275         epoch_call_task_count = counter_u64_alloc(M_WAITOK);
 276
 277         pcpu_zone_record = uma_zcreate("epoch_record pcpu",
 278             sizeof(struct epoch_record), NULL, NULL, NULL, NULL,
 279             UMA_ALIGN_PTR, UMA_ZONE_PCPU);
 280         CPU_FOREACH(cpu) {
 281                 GROUPTASK_INIT(DPCPU_ID_PTR(cpu, epoch_cb_task), 0,
 282                     epoch_call_task, NULL);
 283                 taskqgroup_attach_cpu(qgroup_softirq,
 284                     DPCPU_ID_PTR(cpu, epoch_cb_task), NULL, cpu, NULL, NULL,
 285                     "epoch call task");
 286         }
 287 #ifdef EPOCH_TRACE
 288         SLIST_INIT(&thread0.td_epochs);
 289 #endif
 290         sx_init(&epoch_sx, "epoch-sx");
 291         inited = 1;
 292         global_epoch = epoch_alloc("Global", 0);
 293         global_epoch_preempt = epoch_alloc("Global preemptible", EPOCH_PREEMPT);
 294 }
 295 SYSINIT(epoch, SI_SUB_EPOCH, SI_ORDER_FIRST, epoch_init, NULL);
 296
 297 #if !defined(EARLY_AP_STARTUP)
 298 static void
 299 epoch_init_smp(void *dummy __unused)
 300 {
 301         inited = 2;
 302 }
 303 SYSINIT(epoch_smp, SI_SUB_SMP + 1, SI_ORDER_FIRST, epoch_init_smp, NULL);
 304 #endif
 305
 306 static void
 307 epoch_ctor(epoch_t epoch)
 308 {
 309         epoch_record_t er;
 310         int cpu;
 311
 312         epoch->e_pcpu_record = uma_zalloc_pcpu(pcpu_zone_record, M_WAITOK);
 313         CPU_FOREACH(cpu) {
 314                 er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
 315                 bzero(er, sizeof(*er));
 316                 ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL);
 317                 TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist);
 318                 er->er_cpuid = cpu;
 319                 er->er_parent = epoch;
 320         }
 321 }
 322
 323 static void
 324 epoch_adjust_prio(struct thread *td, u_char prio)
 325 {
 326
 327         thread_lock(td);
 328         sched_prio(td, prio);
 329         thread_unlock(td);
 330 }
 331
 332 epoch_t
 333 epoch_alloc(const char *name, int flags)
 334 {
 335         epoch_t epoch;
 336         int i;
 337
 338         MPASS(name != NULL);
 339
 340         if (__predict_false(!inited))
 341                 panic("%s called too early in boot", __func__);
 342
 343         EPOCH_LOCK();
 344
 345         /*
 346          * Find a free index in the epoch array. If no free index is
 347          * found, try to use the index after the last one.
 348          */
 349         for (i = 0;; i++) {
 350                 /*
 351                  * If too many epochs are currently allocated,
 352                  * return NULL.
 353                  */
 354                 if (i == MAX_EPOCHS) {
 355                         epoch = NULL;
 356                         goto done;
 357                 }
 358                 if (epoch_array[i].e_in_use == 0)
 359                         break;
 360         }
 361
 362         epoch = epoch_array + i;
 363         ck_epoch_init(&epoch->e_epoch);
 364         epoch_ctor(epoch);
 365         epoch->e_flags = flags;
 366         epoch->e_name = name;
 367         sx_init(&epoch->e_drain_sx, "epoch-drain-sx");
 368         mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF);
 369
 370         /*
 371          * Set e_in_use last, because when this field is set the
 372          * epoch_call_task() function will start scanning this epoch
 373          * structure.
 374          */
 375         atomic_store_rel_int(&epoch->e_in_use, 1);
 376 done:
 377         EPOCH_UNLOCK();
 378         return (epoch);
 379 }
 380
 381 void
 382 epoch_free(epoch_t epoch)
 383 {
 384 #ifdef INVARIANTS
 385         int cpu;
 386 #endif
 387
 388         EPOCH_LOCK();
 389
 390         MPASS(epoch->e_in_use != 0);
 391
 392         epoch_drain_callbacks(epoch);
 393
 394         atomic_store_rel_int(&epoch->e_in_use, 0);
 395         /*
 396          * Make sure the epoch_call_task() function see e_in_use equal
 397          * to zero, by calling epoch_wait() on the global_epoch:
 398          */
 399         epoch_wait(global_epoch);
 400 #ifdef INVARIANTS
 401         CPU_FOREACH(cpu) {
 402                 epoch_record_t er;
 403
 404                 er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
 405
 406                 /*
 407                  * Sanity check: none of the records should be in use anymore.
 408                  * We drained callbacks above and freeing the pcpu records is
 409                  * imminent.
 410                  */
 411                 MPASS(er->er_td == NULL);
 412                 MPASS(TAILQ_EMPTY(&er->er_tdlist));
 413         }
 414 #endif
 415         uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record);
 416         mtx_destroy(&epoch->e_drain_mtx);
 417         sx_destroy(&epoch->e_drain_sx);
 418         memset(epoch, 0, sizeof(*epoch));
 419
 420         EPOCH_UNLOCK();
 421 }
 422
 423 static epoch_record_t
 424 epoch_currecord(epoch_t epoch)
 425 {
 426
 427         return (zpcpu_get(epoch->e_pcpu_record));
 428 }
 429
 430 #define INIT_CHECK(epoch)                                       \
 431         do {                                                    \
 432                 if (__predict_false((epoch) == NULL))           \
 433                         return;                                 \
 434         } while (0)
 435
/*
 * Enter a section of a preemptible epoch.
 *
 * 'et' is the caller's tracker for this section; it is asserted to live on
 * the current thread's kernel stack and is queued on the current CPU's
 * record until the matching _epoch_exit_preempt().  The call does nothing
 * when 'epoch' is NULL.
 */
void
_epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
{
        struct epoch_record *er;
        struct thread *td;

        MPASS(cold || epoch != NULL);
        td = curthread;
        MPASS(kstack_contains(td, (vm_offset_t)et, sizeof(*et)));

        INIT_CHECK(epoch);
        MPASS(epoch->e_flags & EPOCH_PREEMPT);

#ifdef EPOCH_TRACE
        epoch_trace_enter(td, epoch, et, file, line);
#endif
        et->et_td = td;
        THREAD_NO_SLEEPING();
        /*
         * The critical section covers the record lookup, the list
         * insertion and ck_epoch_begin(); the pin taken inside it is only
         * dropped again in _epoch_exit_preempt().
         */
        critical_enter();
        sched_pin();
        /*
         * Remember the priority at entry: epoch_block_handler_preempt()
         * may raise it, and _epoch_exit_preempt() restores it.
         */
        et->et_old_priority = td->td_priority;
        er = epoch_currecord(epoch);
        /* Record-level tracking is reserved for non-preemptible epochs. */
        MPASS(er->er_td == NULL);
        TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link);
        ck_epoch_begin(&er->er_record, &et->et_section);
        critical_exit();
}
 464
 465 void
 466 epoch_enter(epoch_t epoch)
 467 {
 468         epoch_record_t er;
 469
 470         MPASS(cold || epoch != NULL);
 471         INIT_CHECK(epoch);
 472         critical_enter();
 473         er = epoch_currecord(epoch);
 474 #ifdef INVARIANTS
 475         if (er->er_record.active == 0) {
 476                 MPASS(er->er_td == NULL);
 477                 er->er_td = curthread;
 478         } else {
 479                 /* We've recursed, just make sure our accounting isn't wrong. */
 480                 MPASS(er->er_td == curthread);
 481         }
 482 #endif
 483         ck_epoch_begin(&er->er_record, NULL);
 484 }
 485
/*
 * Leave a section of a preemptible epoch that was entered with
 * _epoch_enter_preempt() using the same tracker 'et'.  The call does
 * nothing when 'epoch' is NULL.
 */
void
_epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
{
        struct epoch_record *er;
        struct thread *td;

        INIT_CHECK(epoch);
        td = curthread;
        /* Undo the pin and the no-sleep marking taken on entry. */
        critical_enter();
        sched_unpin();
        THREAD_SLEEPING_OK();
        er = epoch_currecord(epoch);
        MPASS(epoch->e_flags & EPOCH_PREEMPT);
        MPASS(et != NULL);
        MPASS(et->et_td == td);
#ifdef INVARIANTS
        /* Poison the owner so a repeated exit trips the assertion above. */
        et->et_td = (void*)0xDEADBEEF;
        /* Record-level tracking is reserved for non-preemptible epochs. */
        MPASS(er->er_td == NULL);
#endif
        ck_epoch_end(&er->er_record, &et->et_section);
        TAILQ_REMOVE(&er->er_tdlist, et, et_link);
        /*
         * Signal progress to waiters polling er_gen in
         * epoch_block_handler_preempt().
         */
        er->er_gen++;
        /* Drop a priority change made while we were in the section. */
        if (__predict_false(et->et_old_priority != td->td_priority))
                epoch_adjust_prio(td, et->et_old_priority);
        critical_exit();
#ifdef EPOCH_TRACE
        epoch_trace_exit(td, epoch, et, file, line);
#endif
}
 516
 517 void
 518 epoch_exit(epoch_t epoch)
 519 {
 520         epoch_record_t er;
 521
 522         INIT_CHECK(epoch);
 523         er = epoch_currecord(epoch);
 524         ck_epoch_end(&er->er_record, NULL);
 525 #ifdef INVARIANTS
 526         MPASS(er->er_td == curthread);
 527         if (er->er_record.active == 0)
 528                 er->er_td = NULL;
 529 #endif
 530         critical_exit();
 531 }
 532
/*
 * epoch_block_handler_preempt() is a callback from the CK code when another
 * thread is currently in an epoch section.
 *
 * 'cr' is the CK record that is holding up the grace period; it is mapped
 * back to its enclosing epoch_record.  The handler is entered with the
 * current thread's lock held (taken in epoch_wait_preempt()) and returns
 * with it held on every path, although it is dropped temporarily inside.
 *
 * Depending on the situation the handler either
 *  - returns at once if the record's tracker list is already empty,
 *  - polls a record of another CPU for a while and then binds the current
 *    thread to that CPU,
 *  - waits on the turnstile that a thread in the section is blocked on, or
 *  - yields the CPU with mi_switch().
 */
static void
epoch_block_handler_preempt(struct ck_epoch *global __unused,
    ck_epoch_record_t *cr, void *arg __unused)
{
        epoch_record_t record;
        struct thread *td, *owner, *curwaittd;
        struct epoch_tracker *tdwait;
        struct turnstile *ts;
        struct lock_object *lock;
        int spincount, gen;
        int locksheld __unused;

        record = __containerof(cr, struct epoch_record, er_record);
        td = curthread;
        locksheld = td->td_locks;
        spincount = 0;
        counter_u64_add(block_count, 1);
        /*
         * We lost a race and there's no longer any threads
         * on the CPU in an epoch section.
         */
        if (TAILQ_EMPTY(&record->er_tdlist))
                return;

        if (record->er_cpuid != curcpu) {
                /*
                 * If the head of the list is running, we can wait for it
                 * to remove itself from the list and thus save us the
                 * overhead of a migration
                 */
                gen = record->er_gen;
                thread_unlock(td);
                /*
                 * We can't actually check if the waiting thread is running
                 * so we simply poll for it to exit before giving up and
                 * migrating.
                 */
                do {
                        cpu_spinwait();
                } while (!TAILQ_EMPTY(&record->er_tdlist) &&
                                 gen == record->er_gen &&
                                 spincount++ < MAX_ADAPTIVE_SPIN);
                thread_lock(td);
                /*
                 * If the generation has changed we can poll again
                 * otherwise we need to migrate.
                 */
                if (gen != record->er_gen)
                        return;
                /*
                 * Being on the same CPU as that of the record on which
                 * we need to wait allows us access to the thread
                 * list associated with that CPU. We can then examine the
                 * oldest thread in the queue and wait on its turnstile
                 * until it resumes and so on until a grace period
                 * elapses.
                 *
                 */
                counter_u64_add(migrate_count, 1);
                sched_bind(td, record->er_cpuid);
                /*
                 * At this point we need to return to the ck code
                 * to scan to see if a grace period has elapsed.
                 * We can't move on to check the thread list, because
                 * in the meantime new threads may have arrived that
                 * in fact belong to a different epoch.
                 */
                return;
        }
        /*
         * Try to find a thread in an epoch section on this CPU
         * waiting on a turnstile. Otherwise find the lowest
         * priority thread (highest prio value) and drop our priority
         * to match to allow it to run.
         */
        TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) {
                /*
                 * Propagate our priority to any other waiters to prevent us
                 * from starving them. They will have their original priority
                 * restored on exit from the epoch section, see
                 * _epoch_exit_preempt().
                 */
                curwaittd = tdwait->et_td;
                if (!TD_IS_INHIBITED(curwaittd) && curwaittd->td_priority > td->td_priority) {
                        critical_enter();
                        thread_unlock(td);
                        thread_lock(curwaittd);
                        sched_prio(curwaittd, td->td_priority);
                        thread_unlock(curwaittd);
                        thread_lock(td);
                        critical_exit();
                }
                if (TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd) &&
                    ((ts = curwaittd->td_blocked) != NULL)) {
                        /*
                         * We unlock td to allow turnstile_wait to reacquire
                         * the thread lock. Before unlocking it we enter a
                         * critical section to prevent preemption after we
                         * reenable interrupts by dropping the thread lock in
                         * order to prevent curwaittd from getting to run.
                         */
                        critical_enter();
                        thread_unlock(td);

                        if (turnstile_lock(ts, &lock, &owner)) {
                                /*
                                 * Re-check under the turnstile lock that the
                                 * thread is still blocked on this turnstile.
                                 */
                                if (ts == curwaittd->td_blocked) {
                                        MPASS(TD_IS_INHIBITED(curwaittd) &&
                                            TD_ON_LOCK(curwaittd));
                                        critical_exit();
                                        turnstile_wait(ts, owner,
                                            curwaittd->td_tsqueue);
                                        counter_u64_add(turnstile_count, 1);
                                        thread_lock(td);
                                        return;
                                }
                                turnstile_unlock(ts, lock);
                        }
                        thread_lock(td);
                        critical_exit();
                        KASSERT(td->td_locks == locksheld,
                            ("%d extra locks held", td->td_locks - locksheld));
                }
        }
        /*
         * We didn't find any threads actually blocked on a lock
         * so we have nothing to do except context switch away.
         */
        counter_u64_add(switch_count, 1);
        mi_switch(SW_VOL | SWT_RELINQUISH);
        /*
         * It is important the thread lock is dropped while yielding
         * to allow other threads to acquire the lock pointed to by
         * TDQ_LOCKPTR(td). Currently mi_switch() will unlock the
         * thread lock before returning. Else a deadlock like
         * situation might happen.
         */
        thread_lock(td);
}
 674
/*
 * Wait for a grace period of a preemptible epoch to elapse.
 *
 * Must not be called from inside a section of the same epoch (asserted
 * under INVARIANTS).  The call does nothing when 'epoch' is NULL.  The
 * caller's CPU binding, pin count and priority are saved up front and
 * restored before returning; the handler passed to CK
 * (epoch_block_handler_preempt()) rebinds the thread and switches away
 * while waiting.
 */
void
epoch_wait_preempt(epoch_t epoch)
{
        struct thread *td;
        int was_bound;
        int old_cpu;
        int old_pinned;
        u_char old_prio;
        int locks __unused;

        MPASS(cold || epoch != NULL);
        INIT_CHECK(epoch);
        td = curthread;
#ifdef INVARIANTS
        locks = curthread->td_locks;
        MPASS(epoch->e_flags & EPOCH_PREEMPT);
        if ((epoch->e_flags & EPOCH_LOCKED) == 0)
                WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
                    "epoch_wait() can be long running");
        KASSERT(!in_epoch(epoch), ("epoch_wait_preempt() called in the middle "
            "of an epoch section of the same epoch"));
#endif
        DROP_GIANT();
        /* The block handler expects to be called with the thread lock held. */
        thread_lock(td);

        /* Save the scheduling state that is modified below. */
        old_cpu = PCPU_GET(cpuid);
        old_pinned = td->td_pinned;
        old_prio = td->td_priority;
        was_bound = sched_is_bound(td);
        /*
         * Start from a known state: not pinned, bound to the current CPU.
         * The block handler rebinds the thread to other CPUs as needed.
         */
        sched_unbind(td);
        td->td_pinned = 0;
        sched_bind(td, old_cpu);

        ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt,
            NULL);

        /* restore CPU binding, if any */
        if (was_bound != 0) {
                sched_bind(td, old_cpu);
        } else {
                /* get thread back to initial CPU, if any */
                if (old_pinned != 0)
                        sched_bind(td, old_cpu);
                sched_unbind(td);
        }
        /* restore pinned after bind */
        td->td_pinned = old_pinned;

        /* restore thread priority */
        sched_prio(td, old_prio);
        thread_unlock(td);
        PICKUP_GIANT();
        /* 'locks' is only assigned under INVARIANTS, as is this check. */
        KASSERT(td->td_locks == locks,
            ("%d residual locks held", td->td_locks - locks));
}
 730
 731 static void
 732 epoch_block_handler(struct ck_epoch *g __unused, ck_epoch_record_t *c __unused,
 733     void *arg __unused)
 734 {
 735         cpu_spinwait();
 736 }
 737
 738 void
 739 epoch_wait(epoch_t epoch)
 740 {
 741
 742         MPASS(cold || epoch != NULL);
 743         INIT_CHECK(epoch);
 744         MPASS(epoch->e_flags == 0);
 745         critical_enter();
 746         ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL);
 747         critical_exit();
 748 }
 749
 750 void
 751 epoch_call(epoch_t epoch, epoch_callback_t callback, epoch_context_t ctx)
 752 {
 753         epoch_record_t er;
 754         ck_epoch_entry_t *cb;
 755
 756         cb = (void *)ctx;
 757
 758         MPASS(callback);
 759         /* too early in boot to have epoch set up */
 760         if (__predict_false(epoch == NULL))
 761                 goto boottime;
 762 #if !defined(EARLY_AP_STARTUP)
 763         if (__predict_false(inited < 2))
 764                 goto boottime;
 765 #endif
 766
 767         critical_enter();
 768         *DPCPU_PTR(epoch_cb_count) += 1;
 769         er = epoch_currecord(epoch);
 770         ck_epoch_call(&er->er_record, cb, (ck_epoch_cb_t *)callback);
 771         critical_exit();
 772         return;
 773 boottime:
 774         callback(ctx);
 775 }
 776
 777 static void
 778 epoch_call_task(void *arg __unused)
 779 {
 780         ck_stack_entry_t *cursor, *head, *next;
 781         ck_epoch_record_t *record;
 782         epoch_record_t er;
 783         epoch_t epoch;
 784         ck_stack_t cb_stack;
 785         int i, npending, total;
 786
 787         ck_stack_init(&cb_stack);
 788         critical_enter();
 789         epoch_enter(global_epoch);
 790         for (total = i = 0; i != MAX_EPOCHS; i++) {
 791                 epoch = epoch_array + i;
 792                 if (__predict_false(
 793                     atomic_load_acq_int(&epoch->e_in_use) == 0))
 794                         continue;
 795                 er = epoch_currecord(epoch);
 796                 record = &er->er_record;
 797                 if ((npending = record->n_pending) == 0)
 798                         continue;
 799                 ck_epoch_poll_deferred(record, &cb_stack);
 800                 total += npending - record->n_pending;
 801         }
 802         epoch_exit(global_epoch);
 803         *DPCPU_PTR(epoch_cb_count) -= total;
 804         critical_exit();
 805
 806         counter_u64_add(epoch_call_count, total);
 807         counter_u64_add(epoch_call_task_count, 1);
 808
 809         head = ck_stack_batch_pop_npsc(&cb_stack);
 810         for (cursor = head; cursor != NULL; cursor = next) {
 811                 struct ck_epoch_entry *entry =
 812                     ck_epoch_entry_container(cursor);
 813
 814                 next = CK_STACK_NEXT(cursor);
 815                 entry->function(entry);
 816         }
 817 }
 818
 819 static int
 820 in_epoch_verbose_preempt(epoch_t epoch, int dump_onfail)
 821 {
 822         epoch_record_t er;
 823         struct epoch_tracker *tdwait;
 824         struct thread *td;
 825
 826         MPASS(epoch != NULL);
 827         MPASS((epoch->e_flags & EPOCH_PREEMPT) != 0);
 828         td = curthread;
 829         if (THREAD_CAN_SLEEP())
 830                 return (0);
 831         critical_enter();
 832         er = epoch_currecord(epoch);
 833         TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
 834                 if (tdwait->et_td == td) {
 835                         critical_exit();
 836                         return (1);
 837                 }
 838 #ifdef INVARIANTS
 839         if (dump_onfail) {
 840                 MPASS(td->td_pinned);
 841                 printf("cpu: %d id: %d\n", curcpu, td->td_tid);
 842                 TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
 843                         printf("td_tid: %d ", tdwait->et_td->td_tid);
 844                 printf("\n");
 845         }
 846 #endif
 847         critical_exit();
 848         return (0);
 849 }
 850
 851 #ifdef INVARIANTS
 852 static void
 853 epoch_assert_nocpu(epoch_t epoch, struct thread *td)
 854 {
 855         epoch_record_t er;
 856         int cpu;
 857         bool crit;
 858
 859         crit = td->td_critnest > 0;
 860
 861         /* Check for a critical section mishap. */
 862         CPU_FOREACH(cpu) {
 863                 er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
 864                 KASSERT(er->er_td != td,
 865                     ("%s critical section in epoch '%s', from cpu %d",
 866                     (crit ? "exited" : "re-entered"), epoch->e_name, cpu));
 867         }
 868 }
 869 #else
 870 #define epoch_assert_nocpu(e, td) do {} while (0)
 871 #endif
 872
 873 int
 874 in_epoch_verbose(epoch_t epoch, int dump_onfail)
 875 {
 876         epoch_record_t er;
 877         struct thread *td;
 878
 879         if (__predict_false((epoch) == NULL))
 880                 return (0);
 881         if ((epoch->e_flags & EPOCH_PREEMPT) != 0)
 882                 return (in_epoch_verbose_preempt(epoch, dump_onfail));
 883
 884         /*
 885          * The thread being in a critical section is a necessary
 886          * condition to be correctly inside a non-preemptible epoch,
 887          * so it's definitely not in this epoch.
 888          */
 889         td = curthread;
 890         if (td->td_critnest == 0) {
 891                 epoch_assert_nocpu(epoch, td);
 892                 return (0);
 893         }
 894
 895         /*
 896          * The current cpu is in a critical section, so the epoch record will be
 897          * stable for the rest of this function.  Knowing that the record is not
 898          * active is sufficient for knowing whether we're in this epoch or not,
 899          * since it's a pcpu record.
 900          */
 901         er = epoch_currecord(epoch);
 902         if (er->er_record.active == 0) {
 903                 epoch_assert_nocpu(epoch, td);
 904                 return (0);
 905         }
 906
 907         MPASS(er->er_td == td);
 908         return (1);
 909 }
 910
 911 int
 912 in_epoch(epoch_t epoch)
 913 {
 914         return (in_epoch_verbose(epoch, 0));
 915 }
 916
 917 static void
 918 epoch_drain_cb(struct epoch_context *ctx)
 919 {
 920         struct epoch *epoch =
 921             __containerof(ctx, struct epoch_record, er_drain_ctx)->er_parent;
 922
 923         if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) {
 924                 mtx_lock(&epoch->e_drain_mtx);
 925                 wakeup(epoch);
 926                 mtx_unlock(&epoch->e_drain_mtx);
 927         }
 928 }
 929
 930 void
 931 epoch_drain_callbacks(epoch_t epoch)
 932 {
 933         epoch_record_t er;
 934         struct thread *td;
 935         int was_bound;
 936         int old_pinned;
 937         int old_cpu;
 938         int cpu;
 939
 940         WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 941             "epoch_drain_callbacks() may sleep!");
 942
 943         /* too early in boot to have epoch set up */
 944         if (__predict_false(epoch == NULL))
 945                 return;
 946 #if !defined(EARLY_AP_STARTUP)
 947         if (__predict_false(inited < 2))
 948                 return;
 949 #endif
 950         DROP_GIANT();
 951
 952         sx_xlock(&epoch->e_drain_sx);
 953         mtx_lock(&epoch->e_drain_mtx);
 954
 955         td = curthread;
 956         thread_lock(td);
 957         old_cpu = PCPU_GET(cpuid);
 958         old_pinned = td->td_pinned;
 959         was_bound = sched_is_bound(td);
 960         sched_unbind(td);
 961         td->td_pinned = 0;
 962
 963         CPU_FOREACH(cpu)
 964                 epoch->e_drain_count++;
 965         CPU_FOREACH(cpu) {
 966                 er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
 967                 sched_bind(td, cpu);
 968                 epoch_call(epoch, &epoch_drain_cb, &er->er_drain_ctx);
 969         }
 970
 971         /* restore CPU binding, if any */
 972         if (was_bound != 0) {
 973                 sched_bind(td, old_cpu);
 974         } else {
 975                 /* get thread back to initial CPU, if any */
 976                 if (old_pinned != 0)
 977                         sched_bind(td, old_cpu);
 978                 sched_unbind(td);
 979         }
 980         /* restore pinned after bind */
 981         td->td_pinned = old_pinned;
 982
 983         thread_unlock(td);
 984
 985         while (epoch->e_drain_count != 0)
 986                 msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0);
 987
 988         mtx_unlock(&epoch->e_drain_mtx);
 989         sx_xunlock(&epoch->e_drain_sx);
 990
 991         PICKUP_GIANT();
 992 }