sys/contrib/ck/src/ck_epoch.c

   1 /*
   2  * Copyright 2011-2015 Samy Al Bahra.
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26
  27 /*
  28  * The implementation here is inspired from the work described in:
  29  *   Fraser, K. 2004. Practical Lock-Freedom. PhD Thesis, University
  30  *   of Cambridge Computing Laboratory.
  31  */
  32
  33 #include <ck_backoff.h>
  34 #include <ck_cc.h>
  35 #include <ck_epoch.h>
  36 #include <ck_pr.h>
  37 #include <ck_stack.h>
  38 #include <ck_stdbool.h>
  39 #include <ck_string.h>
  40
  41 /*
  42  * Only three distinct values are used for reclamation, but reclamation occurs
  43  * at e+2 rather than e+1. Any thread in a "critical section" would have
  44  * acquired some snapshot (e) of the global epoch value (e_g) and set an active
  45  * flag. Any hazardous references will only occur after a full memory barrier.
  46  * For example, assume an initial e_g value of 1, e value of 0 and active value
  47  * of 0.
  48  *
  49  * ck_epoch_begin(...)
  50  *   e = e_g
  51  *   active = 1
  52  *   memory_barrier();
  53  *
  54  * Any serialized reads may observe e = 0 or e = 1 with active = 0, or e = 0 or
  55  * e = 1 with active = 1. The e_g value can only go from 1 to 2 if every thread
  56  * has already observed the value of "1" (or the value we are incrementing
  57  * from). This guarantees us that for any given value e_g, any threads with-in
  58  * critical sections (referred to as "active" threads from here on) would have
  59  * an e value of e_g-1 or e_g. This also means that hazardous references may be
  60  * shared in both e_g-1 and e_g even if they are logically deleted in e_g.
  61  *
  62  * For example, assume all threads have an e value of e_g. Another thread may
  63  * increment to e_g to e_g+1. Older threads may have a reference to an object
  64  * which is only deleted in e_g+1. It could be that reader threads are
  65  * executing some hash table look-ups, while some other writer thread (which
  66  * causes epoch counter tick) actually deletes the same items that reader
  67  * threads are looking up (this writer thread having an e value of e_g+1).
  68  * This is possible if the writer thread re-observes the epoch after the
  69  * counter tick.
  70  *
  71  * Psuedo-code for writer:
  72  *   ck_epoch_begin()
  73  *   ht_delete(x)
  74  *   ck_epoch_end()
  75  *   ck_epoch_begin()
  76  *   ht_delete(x)
  77  *   ck_epoch_end()
  78  *
  79  * Psuedo-code for reader:
  80  *   for (;;) {
  81  *      x = ht_lookup(x)
  82  *      ck_pr_inc(&x->value);
  83  *   }
  84  *
  85  * Of course, it is also possible for references logically deleted at e_g-1 to
  86  * still be accessed at e_g as threads are "active" at the same time
  87  * (real-world time) mutating shared objects.
  88  *
  89  * Now, if the epoch counter is ticked to e_g+1, then no new hazardous
  90  * references could exist to objects logically deleted at e_g-1. The reason for
  91  * this is that at e_g+1, all epoch read-side critical sections started at
  92  * e_g-1 must have been completed. If any epoch read-side critical sections at
  93  * e_g-1 were still active, then we would never increment to e_g+1 (active != 0
  94  * ^ e != e_g).  Additionally, e_g may still have hazardous references to
  95  * objects logically deleted at e_g-1 which means objects logically deleted at
  96  * e_g-1 cannot be deleted at e_g+1 unless all threads have observed e_g+1
  97  * (since it is valid for active threads to be at e_g and threads at e_g still
  98  * require safe memory accesses).
  99  *
 100  * However, at e_g+2, all active threads must be either at e_g+1 or e_g+2.
 101  * Though e_g+2 may share hazardous references with e_g+1, and e_g+1 shares
 102  * hazardous references to e_g, no active threads are at e_g or e_g-1. This
 103  * means no hazardous references could exist to objects deleted at e_g-1 (at
 104  * e_g+2).
 105  *
 106  * To summarize these important points,
 107  *   1) Active threads will always have a value of e_g or e_g-1.
 108  *   2) Items that are logically deleted e_g or e_g-1 cannot be physically
 109  *      deleted.
 110  *   3) Objects logically deleted at e_g-1 can be physically destroyed at e_g+2
 111  *      or at e_g+1 if no threads are at e_g.
 112  *
 113  * Last but not least, if we are at e_g+2, then no active thread is at e_g
 114  * which means it is safe to apply modulo-3 arithmetic to e_g value in order to
 115  * re-use e_g to represent the e_g+3 state. This means it is sufficient to
 116  * represent e_g using only the values 0, 1 or 2. Every time a thread re-visits
 117  * a e_g (which can be determined with a non-empty deferral list) it can assume
 118  * objects in the e_g deferral list involved at least three e_g transitions and
 119  * are thus, safe, for physical deletion.
 120  *
 121  * Blocking semantics for epoch reclamation have additional restrictions.
 122  * Though we only require three deferral lists, reasonable blocking semantics
 123  * must be able to more gracefully handle bursty write work-loads which could
 124  * easily cause e_g wrap-around if modulo-3 arithmetic is used. This allows for
 125  * easy-to-trigger live-lock situations. The work-around to this is to not
 126  * apply modulo arithmetic to e_g but only to deferral list indexing.
 127  */
 128 #define CK_EPOCH_GRACE 3U
 129
 130 /*
 131  * CK_EPOCH_LENGTH must be a power-of-2 (because (CK_EPOCH_LENGTH - 1) is used
 132  * as a mask, and it must be at least 3 (see comments above).
 133  */
 134 #if (CK_EPOCH_LENGTH < 3 || (CK_EPOCH_LENGTH & (CK_EPOCH_LENGTH - 1)) != 0)
 135 #error "CK_EPOCH_LENGTH must be a power of 2 and >= 3"
 136 #endif
 137
 138 enum {
 139         CK_EPOCH_STATE_USED = 0,
 140         CK_EPOCH_STATE_FREE = 1
 141 };
 142
 143 CK_STACK_CONTAINER(struct ck_epoch_record, record_next,
 144     ck_epoch_record_container)
 145 CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
 146     ck_epoch_entry_container)
 147
 148 #define CK_EPOCH_SENSE_MASK     (CK_EPOCH_SENSE - 1)
 149
 150 bool
 151 _ck_epoch_delref(struct ck_epoch_record *record,
 152     struct ck_epoch_section *section)
 153 {
 154         struct ck_epoch_ref *current, *other;
 155         unsigned int i = section->bucket;
 156
 157         current = &record->local.bucket[i];
 158         current->count--;
 159
 160         if (current->count > 0)
 161                 return false;
 162
 163         /*
 164          * If the current bucket no longer has any references, then
 165          * determine whether we have already transitioned into a newer
 166          * epoch. If so, then make sure to update our shared snapshot
 167          * to allow for forward progress.
 168          *
 169          * If no other active bucket exists, then the record will go
 170          * inactive in order to allow for forward progress.
 171          */
 172         other = &record->local.bucket[(i + 1) & CK_EPOCH_SENSE_MASK];
 173         if (other->count > 0 &&
 174             ((int)(current->epoch - other->epoch) < 0)) {
 175                 /*
 176                  * The other epoch value is actually the newest,
 177                  * transition to it.
 178                  */
 179                 ck_pr_store_uint(&record->epoch, other->epoch);
 180         }
 181
 182         return true;
 183 }
 184
 185 void
 186 _ck_epoch_addref(struct ck_epoch_record *record,
 187     struct ck_epoch_section *section)
 188 {
 189         struct ck_epoch *global = record->global;
 190         struct ck_epoch_ref *ref;
 191         unsigned int epoch, i;
 192
 193         epoch = ck_pr_load_uint(&global->epoch);
 194         i = epoch & CK_EPOCH_SENSE_MASK;
 195         ref = &record->local.bucket[i];
 196
 197         if (ref->count++ == 0) {
 198 #ifndef CK_MD_TSO
 199                 struct ck_epoch_ref *previous;
 200
 201                 /*
 202                  * The system has already ticked. If another non-zero bucket
 203                  * exists, make sure to order our observations with respect
 204                  * to it. Otherwise, it is possible to acquire a reference
 205                  * from the previous epoch generation.
 206                  *
 207                  * On TSO architectures, the monoticity of the global counter
 208                  * and load-{store, load} ordering are sufficient to guarantee
 209                  * this ordering.
 210                  */
 211                 previous = &record->local.bucket[(i + 1) &
 212                     CK_EPOCH_SENSE_MASK];
 213                 if (previous->count > 0)
 214                         ck_pr_fence_acqrel();
 215 #endif /* !CK_MD_TSO */
 216
 217                 /*
 218                  * If this is this is a new reference into the current
 219                  * bucket then cache the associated epoch value.
 220                  */
 221                 ref->epoch = epoch;
 222         }
 223
 224         section->bucket = i;
 225         return;
 226 }
 227
 228 void
 229 ck_epoch_init(struct ck_epoch *global)
 230 {
 231
 232         ck_stack_init(&global->records);
 233         global->epoch = 1;
 234         global->n_free = 0;
 235         ck_pr_fence_store();
 236         return;
 237 }
 238
 239 struct ck_epoch_record *
 240 ck_epoch_recycle(struct ck_epoch *global, void *ct)
 241 {
 242         struct ck_epoch_record *record;
 243         ck_stack_entry_t *cursor;
 244         unsigned int state;
 245
 246         if (ck_pr_load_uint(&global->n_free) == 0)
 247                 return NULL;
 248
 249         CK_STACK_FOREACH(&global->records, cursor) {
 250                 record = ck_epoch_record_container(cursor);
 251
 252                 if (ck_pr_load_uint(&record->state) == CK_EPOCH_STATE_FREE) {
 253                         /* Serialize with respect to deferral list clean-up. */
 254                         ck_pr_fence_load();
 255                         state = ck_pr_fas_uint(&record->state,
 256                             CK_EPOCH_STATE_USED);
 257                         if (state == CK_EPOCH_STATE_FREE) {
 258                                 ck_pr_dec_uint(&global->n_free);
 259                                 ck_pr_store_ptr(&record->ct, ct);
 260
 261                                 /*
 262                                  * The context pointer is ordered by a
 263                                  * subsequent protected section.
 264                                  */
 265                                 return record;
 266                         }
 267                 }
 268         }
 269
 270         return NULL;
 271 }
 272
 273 void
 274 ck_epoch_register(struct ck_epoch *global, struct ck_epoch_record *record,
 275     void *ct)
 276 {
 277         size_t i;
 278
 279         record->global = global;
 280         record->state = CK_EPOCH_STATE_USED;
 281         record->active = 0;
 282         record->epoch = 0;
 283         record->n_dispatch = 0;
 284         record->n_peak = 0;
 285         record->n_pending = 0;
 286         record->ct = ct;
 287         memset(&record->local, 0, sizeof record->local);
 288
 289         for (i = 0; i < CK_EPOCH_LENGTH; i++)
 290                 ck_stack_init(&record->pending[i]);
 291
 292         ck_pr_fence_store();
 293         ck_stack_push_upmc(&global->records, &record->record_next);
 294         return;
 295 }
 296
 297 void
 298 ck_epoch_unregister(struct ck_epoch_record *record)
 299 {
 300         struct ck_epoch *global = record->global;
 301         size_t i;
 302
 303         record->active = 0;
 304         record->epoch = 0;
 305         record->n_dispatch = 0;
 306         record->n_peak = 0;
 307         record->n_pending = 0;
 308         memset(&record->local, 0, sizeof record->local);
 309
 310         for (i = 0; i < CK_EPOCH_LENGTH; i++)
 311                 ck_stack_init(&record->pending[i]);
 312
 313         ck_pr_store_ptr(&record->ct, NULL);
 314         ck_pr_fence_store();
 315         ck_pr_store_uint(&record->state, CK_EPOCH_STATE_FREE);
 316         ck_pr_inc_uint(&global->n_free);
 317         return;
 318 }
 319
 320 static struct ck_epoch_record *
 321 ck_epoch_scan(struct ck_epoch *global,
 322     struct ck_epoch_record *cr,
 323     unsigned int epoch,
 324     bool *af)
 325 {
 326         ck_stack_entry_t *cursor;
 327
 328         if (cr == NULL) {
 329                 cursor = CK_STACK_FIRST(&global->records);
 330                 *af = false;
 331         } else {
 332                 cursor = &cr->record_next;
 333                 *af = true;
 334         }
 335
 336         while (cursor != NULL) {
 337                 unsigned int state, active;
 338
 339                 cr = ck_epoch_record_container(cursor);
 340
 341                 state = ck_pr_load_uint(&cr->state);
 342                 if (state & CK_EPOCH_STATE_FREE) {
 343                         cursor = CK_STACK_NEXT(cursor);
 344                         continue;
 345                 }
 346
 347                 active = ck_pr_load_uint(&cr->active);
 348                 *af |= active;
 349
 350                 if (active != 0 && ck_pr_load_uint(&cr->epoch) != epoch)
 351                         return cr;
 352
 353                 cursor = CK_STACK_NEXT(cursor);
 354         }
 355
 356         return NULL;
 357 }
 358
 359 static unsigned int
 360 ck_epoch_dispatch(struct ck_epoch_record *record, unsigned int e, ck_stack_t *deferred)
 361 {
 362         unsigned int epoch = e & (CK_EPOCH_LENGTH - 1);
 363         ck_stack_entry_t *head, *next, *cursor;
 364         unsigned int n_pending, n_peak;
 365         unsigned int i = 0;
 366
 367         head = ck_stack_batch_pop_upmc(&record->pending[epoch]);
 368         for (cursor = head; cursor != NULL; cursor = next) {
 369                 struct ck_epoch_entry *entry =
 370                     ck_epoch_entry_container(cursor);
 371
 372                 next = CK_STACK_NEXT(cursor);
 373                 if (deferred != NULL)
 374                         ck_stack_push_spnc(deferred, &entry->stack_entry);
 375                 else
 376                         entry->function(entry);
 377
 378                 i++;
 379         }
 380
 381         n_peak = ck_pr_load_uint(&record->n_peak);
 382         n_pending = ck_pr_load_uint(&record->n_pending);
 383
 384         /* We don't require accuracy around peak calculation. */
 385         if (n_pending > n_peak)
 386                 ck_pr_store_uint(&record->n_peak, n_peak);
 387
 388         if (i > 0) {
 389                 ck_pr_add_uint(&record->n_dispatch, i);
 390                 ck_pr_sub_uint(&record->n_pending, i);
 391         }
 392
 393         return i;
 394 }
 395
 396 /*
 397  * Reclaim all objects associated with a record.
 398  */
 399 void
 400 ck_epoch_reclaim(struct ck_epoch_record *record)
 401 {
 402         unsigned int epoch;
 403
 404         for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++)
 405                 ck_epoch_dispatch(record, epoch, NULL);
 406
 407         return;
 408 }
 409
 410 CK_CC_FORCE_INLINE static void
 411 epoch_block(struct ck_epoch *global, struct ck_epoch_record *cr,
 412     ck_epoch_wait_cb_t *cb, void *ct)
 413 {
 414
 415         if (cb != NULL)
 416                 cb(global, cr, ct);
 417
 418         return;
 419 }
 420
 421 /*
 422  * This function must not be called with-in read section.
 423  */
 424 void
 425 ck_epoch_synchronize_wait(struct ck_epoch *global,
 426     ck_epoch_wait_cb_t *cb, void *ct)
 427 {
 428         struct ck_epoch_record *cr;
 429         unsigned int delta, epoch, goal, i;
 430         bool active;
 431
 432         ck_pr_fence_memory();
 433
 434         /*
 435          * The observation of the global epoch must be ordered with respect to
 436          * all prior operations. The re-ordering of loads is permitted given
 437          * monoticity of global epoch counter.
 438          *
 439          * If UINT_MAX concurrent mutations were to occur then it is possible
 440          * to encounter an ABA-issue. If this is a concern, consider tuning
 441          * write-side concurrency.
 442          */
 443         delta = epoch = ck_pr_load_uint(&global->epoch);
 444         goal = epoch + CK_EPOCH_GRACE;
 445
 446         for (i = 0, cr = NULL; i < CK_EPOCH_GRACE - 1; cr = NULL, i++) {
 447                 bool r;
 448
 449                 /*
 450                  * Determine whether all threads have observed the current
 451                  * epoch with respect to the updates on invocation.
 452                  */
 453                 while (cr = ck_epoch_scan(global, cr, delta, &active),
 454                     cr != NULL) {
 455                         unsigned int e_d;
 456
 457                         ck_pr_stall();
 458
 459                         /*
 460                          * Another writer may have already observed a grace
 461                          * period.
 462                          */
 463                         e_d = ck_pr_load_uint(&global->epoch);
 464                         if (e_d == delta) {
 465                                 epoch_block(global, cr, cb, ct);
 466                                 continue;
 467                         }
 468
 469                         /*
 470                          * If the epoch has been updated, we may have already
 471                          * met our goal.
 472                          */
 473                         delta = e_d;
 474                         if ((goal > epoch) & (delta >= goal))
 475                                 goto leave;
 476
 477                         epoch_block(global, cr, cb, ct);
 478
 479                         /*
 480                          * If the epoch has been updated, then a grace period
 481                          * requires that all threads are observed idle at the
 482                          * same epoch.
 483                          */
 484                         cr = NULL;
 485                 }
 486
 487                 /*
 488                  * If we have observed all threads as inactive, then we assume
 489                  * we are at a grace period.
 490                  */
 491                 if (active == false)
 492                         break;
 493
 494                 /*
 495                  * Increment current epoch. CAS semantics are used to eliminate
 496                  * increment operations for synchronization that occurs for the
 497                  * same global epoch value snapshot.
 498                  *
 499                  * If we can guarantee there will only be one active barrier or
 500                  * epoch tick at a given time, then it is sufficient to use an
 501                  * increment operation. In a multi-barrier workload, however,
 502                  * it is possible to overflow the epoch value if we apply
 503                  * modulo-3 arithmetic.
 504                  */
 505                 r = ck_pr_cas_uint_value(&global->epoch, delta, delta + 1,
 506                     &delta);
 507
 508                 /* Order subsequent thread active checks. */
 509                 ck_pr_fence_atomic_load();
 510
 511                 /*
 512                  * If CAS has succeeded, then set delta to latest snapshot.
 513                  * Otherwise, we have just acquired latest snapshot.
 514                  */
 515                 delta = delta + r;
 516         }
 517
 518         /*
 519          * A majority of use-cases will not require full barrier semantics.
 520          * However, if non-temporal instructions are used, full barrier
 521          * semantics are necessary.
 522          */
 523 leave:
 524         ck_pr_fence_memory();
 525         return;
 526 }
 527
 528 void
 529 ck_epoch_synchronize(struct ck_epoch_record *record)
 530 {
 531
 532         ck_epoch_synchronize_wait(record->global, NULL, NULL);
 533         return;
 534 }
 535
 536 void
 537 ck_epoch_barrier(struct ck_epoch_record *record)
 538 {
 539
 540         ck_epoch_synchronize(record);
 541         ck_epoch_reclaim(record);
 542         return;
 543 }
 544
 545 void
 546 ck_epoch_barrier_wait(struct ck_epoch_record *record, ck_epoch_wait_cb_t *cb,
 547     void *ct)
 548 {
 549
 550         ck_epoch_synchronize_wait(record->global, cb, ct);
 551         ck_epoch_reclaim(record);
 552         return;
 553 }
 554
 555 /*
 556  * It may be worth it to actually apply these deferral semantics to an epoch
 557  * that was observed at ck_epoch_call time. The problem is that the latter
 558  * would require a full fence.
 559  *
 560  * ck_epoch_call will dispatch to the latest epoch snapshot that was observed.
 561  * There are cases where it will fail to reclaim as early as it could. If this
 562  * becomes a problem, we could actually use a heap for epoch buckets but that
 563  * is far from ideal too.
 564  */
 565 bool
 566 ck_epoch_poll_deferred(struct ck_epoch_record *record, ck_stack_t *deferred)
 567 {
 568         bool active;
 569         unsigned int epoch;
 570         struct ck_epoch_record *cr = NULL;
 571         struct ck_epoch *global = record->global;
 572         unsigned int n_dispatch;
 573
 574         epoch = ck_pr_load_uint(&global->epoch);
 575
 576         /* Serialize epoch snapshots with respect to global epoch. */
 577         ck_pr_fence_memory();
 578
 579         /*
 580          * At this point, epoch is the current global epoch value.
 581          * There may or may not be active threads which observed epoch - 1.
 582          * (ck_epoch_scan() will tell us that). However, there should be
 583          * no active threads which observed epoch - 2.
 584          *
 585          * Note that checking epoch - 2 is necessary, as race conditions can
 586          * allow another thread to increment the global epoch before this
 587          * thread runs.
 588          */
 589         n_dispatch = ck_epoch_dispatch(record, epoch - 2, deferred);
 590
 591         cr = ck_epoch_scan(global, cr, epoch, &active);
 592         if (cr != NULL)
 593                 return (n_dispatch > 0);
 594
 595         /* We are at a grace period if all threads are inactive. */
 596         if (active == false) {
 597                 record->epoch = epoch;
 598                 for (epoch = 0; epoch < CK_EPOCH_LENGTH; epoch++)
 599                         ck_epoch_dispatch(record, epoch, deferred);
 600
 601                 return true;
 602         }
 603
 604         /*
 605          * If an active thread exists, rely on epoch observation.
 606          *
 607          * All the active threads entered the epoch section during
 608          * the current epoch. Therefore, we can now run the handlers
 609          * for the immediately preceding epoch and attempt to
 610          * advance the epoch if it hasn't been already.
 611          */
 612         (void)ck_pr_cas_uint(&global->epoch, epoch, epoch + 1);
 613
 614         ck_epoch_dispatch(record, epoch - 1, deferred);
 615         return true;
 616 }
 617
 618 bool
 619 ck_epoch_poll(struct ck_epoch_record *record)
 620 {
 621
 622         return ck_epoch_poll_deferred(record, NULL);
 623 }