/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#define PROF_PREFIX_DEFAULT             "jeprof"
#define LG_PROF_SAMPLE_DEFAULT          19
#define LG_PROF_INTERVAL_DEFAULT        -1

/*
 * Hard limit on stack backtrace depth.  The version of prof_backtrace() that
 * is based on __builtin_return_address() necessarily has a hard-coded number
 * of backtrace frame handlers, and should be kept in sync with this setting.
 */
#define PROF_BT_MAX                     128

/* Maximum number of backtraces to store in each per thread LRU cache. */
#define PROF_TCMAX                      1024

/* Initial hash table size. */
#define PROF_CKH_MINITEMS               64

/* Size of memory buffer to use when writing dump files. */
#define PROF_DUMP_BUFSIZE               65536

/* Size of stack-allocated buffer used by prof_printf(). */
#define PROF_PRINTF_BUFSIZE             128

/*
 * Number of mutexes shared among all ctx's.  No space is allocated for these
 * unless profiling is enabled, so it's okay to over-provision.
 */
#define PROF_NCTX_LOCKS                 1024

/*
 * prof_tdata pointers close to NULL are used to encode state information that
 * is used for cleaning up during thread shutdown.
 */
#define PROF_TDATA_STATE_REINCARNATED   ((prof_tdata_t *)(uintptr_t)1)
#define PROF_TDATA_STATE_PURGATORY      ((prof_tdata_t *)(uintptr_t)2)
#define PROF_TDATA_STATE_MAX            PROF_TDATA_STATE_PURGATORY

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
        /* Backtrace, stored as len program counters. */
        void            **vec;
        unsigned        len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
        prof_bt_t       *bt;
        unsigned        nignore;
        unsigned        max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
        /*
         * Profiling counters.  An allocation/deallocation pair can operate on
         * different prof_thr_cnt_t objects that are linked into the same
         * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
         * negative.  In principle it is possible for the *bytes counters to
         * overflow/underflow, but a general solution would require something
         * like 128-bit counters; this implementation doesn't bother to solve
         * that problem.
         */
        int64_t         curobjs;
        int64_t         curbytes;
        uint64_t        accumobjs;
        uint64_t        accumbytes;
};

struct prof_thr_cnt_s {
        /* Linkage into prof_ctx_t's cnts_ql. */
        ql_elm(prof_thr_cnt_t)  cnts_link;

        /* Linkage into thread's LRU. */
        ql_elm(prof_thr_cnt_t)  lru_link;

        /*
         * Associated context.  If a thread frees an object that it did not
         * allocate, it is possible that the context is not cached in the
         * thread's hash table, in which case it must be able to look up the
         * context, insert a new prof_thr_cnt_t into the thread's hash table,
         * and link it into the prof_ctx_t's cnts_ql.
         */
        prof_ctx_t              *ctx;

        /*
         * Threads use memory barriers to update the counters.  Since there is
         * only ever one writer, the only challenge is for the reader to get a
         * consistent read of the counters.
         *
         * The writer uses this series of operations:
         *
         * 1) Increment epoch to an odd number.
         * 2) Update counters.
         * 3) Increment epoch to an even number.
         *
         * The reader must assure 1) that the epoch is even while it reads the
         * counters, and 2) that the epoch doesn't change between the time it
         * starts and finishes reading the counters.
         */
        unsigned                epoch;

        /* Profiling counters. */
        prof_cnt_t              cnts;
};
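
/*
 * Illustrative sketch (not part of the upstream header): a reader that wants
 * a consistent snapshot of the counters above could apply the epoch protocol
 * roughly as follows.  The read_barrier() name is a stand-in; only the
 * writer-side mb_write() appears in this file, and the actual dump code in
 * prof.c may implement the read side differently.
 *
 *      prof_cnt_t snap;
 *      unsigned e0, e1;
 *
 *      do {
 *              e0 = cnt->epoch;
 *              read_barrier();                 (hypothetical read barrier)
 *              snap = cnt->cnts;
 *              read_barrier();                 (hypothetical read barrier)
 *              e1 = cnt->epoch;
 *      } while ((e0 & 1U) != 0 || e0 != e1);
 *
 * An odd epoch means the writer is mid-update; a changed epoch means the
 * counters were overwritten while being copied, so the read is retried.
 */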

struct prof_ctx_s {
        /* Associated backtrace. */
        prof_bt_t               *bt;

        /* Protects nlimbo, cnt_merged, and cnts_ql. */
        malloc_mutex_t          *lock;

        /*
         * Number of threads that currently cause this ctx to be in a state of
         * limbo due to one of:
         *   - Initializing per thread counters associated with this ctx.
         *   - Preparing to destroy this ctx.
         * nlimbo must be 1 (single destroyer) in order to safely destroy the
         * ctx.
         */
        unsigned                nlimbo;

        /* Temporary storage for summation during dump. */
        prof_cnt_t              cnt_summed;

        /* When threads exit, they merge their stats into cnt_merged. */
        prof_cnt_t              cnt_merged;

        /*
         * List of profile counters, one for each thread that has allocated in
         * this context.
         */
        ql_head(prof_thr_cnt_t) cnts_ql;
};

struct prof_tdata_s {
        /*
         * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
         * cache of backtraces, with associated thread-specific prof_thr_cnt_t
         * objects.  Other threads may read the prof_thr_cnt_t contents, but no
         * others will ever write them.
         *
         * Upon thread exit, the thread must merge all the prof_thr_cnt_t
         * counter data into the associated prof_ctx_t objects, and unlink/free
         * the prof_thr_cnt_t objects.
         */
        ckh_t                   bt2cnt;

        /* LRU for contents of bt2cnt. */
        ql_head(prof_thr_cnt_t) lru_ql;

        /* Backtrace vector, used for calls to prof_backtrace(). */
        void                    **vec;

        /* Sampling state. */
        uint64_t                prng_state;
        uint64_t                threshold;
        uint64_t                accum;

        /* State used to avoid dumping while operating on prof internals. */
        bool                    enq;
        bool                    enq_idump;
        bool                    enq_gdump;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool     opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool     opt_prof_active;
extern size_t   opt_lg_prof_sample;   /* Mean bytes between samples. */
extern ssize_t  opt_lg_prof_interval; /* lg(prof_interval). */
extern bool     opt_prof_gdump;       /* High-water memory dumping. */
extern bool     opt_prof_final;       /* Final profile dumping. */
extern bool     opt_prof_leak;        /* Dump leak summary at exit. */
extern bool     opt_prof_accum;       /* Report cumulative bytes. */
extern char     opt_prof_prefix[PATH_MAX + 1];

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t prof_interval;

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool     prof_promote;

void    bt_init(prof_bt_t *bt, void **vec);
void    prof_backtrace(prof_bt_t *bt, unsigned nignore);
prof_thr_cnt_t  *prof_lookup(prof_bt_t *bt);
void    prof_idump(void);
bool    prof_mdump(const char *filename);
void    prof_gdump(void);
prof_tdata_t    *prof_tdata_init(void);
void    prof_tdata_cleanup(void *arg);
void    prof_boot0(void);
void    prof_boot1(void);
bool    prof_boot2(void);
void    prof_prefork(void);
void    prof_postfork_parent(void);
void    prof_postfork_child(void);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#define PROF_ALLOC_PREP(nignore, size, ret) do {                        \
        prof_tdata_t *prof_tdata;                                       \
        prof_bt_t bt;                                                   \
                                                                        \
        assert(size == s2u(size));                                      \
                                                                        \
        prof_tdata = prof_tdata_get(true);                              \
        if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) { \
                if (prof_tdata != NULL)                                 \
                        ret = (prof_thr_cnt_t *)(uintptr_t)1U;          \
                else                                                    \
                        ret = NULL;                                     \
                break;                                                  \
        }                                                               \
                                                                        \
        if (opt_prof_active == false) {                                 \
                /* Sampling is currently inactive, so avoid sampling. */\
                ret = (prof_thr_cnt_t *)(uintptr_t)1U;                  \
        } else if (opt_lg_prof_sample == 0) {                           \
                /* Don't bother with sampling logic, since sampling   */\
                /* interval is 1.                                     */\
                bt_init(&bt, prof_tdata->vec);                          \
                prof_backtrace(&bt, nignore);                           \
                ret = prof_lookup(&bt);                                 \
        } else {                                                        \
                if (prof_tdata->threshold == 0) {                       \
                        /* Initialize.  Seed the prng differently for */\
                        /* each thread.                               */\
                        prof_tdata->prng_state =                        \
                            (uint64_t)(uintptr_t)&size;                 \
                        prof_sample_threshold_update(prof_tdata);       \
                }                                                       \
                                                                        \
                /* Determine whether to capture a backtrace based on  */\
                /* whether size is enough for prof_accum to reach     */\
                /* prof_tdata->threshold.  However, delay updating    */\
                /* these variables until prof_{m,re}alloc(), because  */\
                /* we don't know for sure that the allocation will    */\
                /* succeed.                                           */\
                /*                                                    */\
                /* Use subtraction rather than addition to avoid      */\
                /* potential integer overflow.                        */\
                if (size >= prof_tdata->threshold -                     \
                    prof_tdata->accum) {                                \
                        bt_init(&bt, prof_tdata->vec);                  \
                        prof_backtrace(&bt, nignore);                   \
                        ret = prof_lookup(&bt);                         \
                } else                                                  \
                        ret = (prof_thr_cnt_t *)(uintptr_t)1U;          \
        }                                                               \
} while (0)
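
/*
 * Illustrative sketch (not part of the upstream header) of how a caller is
 * expected to use PROF_ALLOC_PREP().  The imalloc() name is only a stand-in
 * for whatever internal allocation path the caller uses; the real call sites
 * live in jemalloc.c.
 *
 *      prof_thr_cnt_t *cnt;
 *      size_t usize = s2u(size);
 *
 *      PROF_ALLOC_PREP(1, usize, cnt);
 *      if (cnt == NULL)
 *              return (NULL);                  (prof_tdata_init() failed)
 *      void *p = imalloc(usize);               (stand-in allocation call)
 *      if (p == NULL)
 *              return (NULL);
 *      prof_malloc(p, usize, cnt);             (commit the sample decision)
 *
 * ret/cnt ends up NULL on internal OOM, (prof_thr_cnt_t *)1U when the
 * allocation should not be sampled, or a real counter object whose backtrace
 * has already been captured.
 */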

#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)

prof_tdata_t    *prof_tdata_get(bool create);
void    prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t      *prof_ctx_get(const void *ptr);
void    prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
bool    prof_sample_accum_update(size_t size);
void    prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
void    prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx);
void    prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
malloc_tsd_externs(prof_tdata, prof_tdata_t *)
malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
    prof_tdata_cleanup)

JEMALLOC_INLINE prof_tdata_t *
prof_tdata_get(bool create)
{
        prof_tdata_t *prof_tdata;

        cassert(config_prof);

        prof_tdata = *prof_tdata_tsd_get();
        if (create && prof_tdata == NULL)
                prof_tdata = prof_tdata_init();

        return (prof_tdata);
}

JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
        uint64_t r;
        double u;

        cassert(config_prof);

        /*
         * Compute sample threshold as a geometrically distributed random
         * variable with mean (2^opt_lg_prof_sample).
         *
         *                         __        __
         *                         |  log(u)  |                     1
         * prof_tdata->threshold = | -------- |, where p = -------------------
         *                         | log(1-p) |             opt_lg_prof_sample
         *                                                 2
         *
         * For more information on the math, see:
         *
         *   Non-Uniform Random Variate Generation
         *   Luc Devroye
         *   Springer-Verlag, New York, 1986
         *   pp 500
         *   (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
         */
        prng64(r, 53, prof_tdata->prng_state,
            UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
        u = (double)r * (1.0/9007199254740992.0L);
        prof_tdata->threshold = (uint64_t)(log(u) /
            log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
            + (uint64_t)1U;
}
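
/*
 * Worked example (added for illustration; numbers are approximate): with the
 * default LG_PROF_SAMPLE_DEFAULT of 19, p = 2^-19, so the geometric
 * distribution above has mean 2^19 = 524288 bytes (512 KiB) of allocation
 * between samples.  For a particular draw u = 0.5, the computed threshold is
 * ceil(log(0.5) / log(1 - 2^-19)) ~= 363409 bytes.
 */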

JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
        prof_ctx_t *ret;
        arena_chunk_t *chunk;

        cassert(config_prof);
        assert(ptr != NULL);

        chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
        if (chunk != ptr) {
                /* Region. */
                ret = arena_prof_ctx_get(ptr);
        } else
                ret = huge_prof_ctx_get(ptr);

        return (ret);
}

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
        arena_chunk_t *chunk;

        cassert(config_prof);
        assert(ptr != NULL);

        chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
        if (chunk != ptr) {
                /* Region. */
                arena_prof_ctx_set(ptr, ctx);
        } else
                huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
        prof_tdata_t *prof_tdata;

        cassert(config_prof);
        /* Sampling logic is unnecessary if the interval is 1. */
        assert(opt_lg_prof_sample != 0);

        prof_tdata = prof_tdata_get(false);
        if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
                return (true);

        /* Take care to avoid integer overflow. */
        if (size >= prof_tdata->threshold - prof_tdata->accum) {
                prof_tdata->accum -= (prof_tdata->threshold - size);
                /* Compute new sample threshold. */
                prof_sample_threshold_update(prof_tdata);
                while (prof_tdata->accum >= prof_tdata->threshold) {
                        prof_tdata->accum -= prof_tdata->threshold;
                        prof_sample_threshold_update(prof_tdata);
                }
                return (false);
        } else {
                prof_tdata->accum += size;
                return (true);
        }
}

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
{

        cassert(config_prof);
        assert(ptr != NULL);
        assert(size == isalloc(ptr, true));

        if (opt_lg_prof_sample != 0) {
                if (prof_sample_accum_update(size)) {
                        /*
                         * Don't sample.  For malloc()-like allocation, it is
                         * always possible to tell in advance how large an
                         * object's usable size will be, so there should never
                         * be a difference between the size passed to
                         * PROF_ALLOC_PREP() and prof_malloc().
                         */
                        assert((uintptr_t)cnt == (uintptr_t)1U);
                }
        }

        if ((uintptr_t)cnt > (uintptr_t)1U) {
                prof_ctx_set(ptr, cnt->ctx);

                cnt->epoch++;
                /*********/
                mb_write();
                /*********/
                cnt->cnts.curobjs++;
                cnt->cnts.curbytes += size;
                if (opt_prof_accum) {
                        cnt->cnts.accumobjs++;
                        cnt->cnts.accumbytes += size;
                }
                /*********/
                mb_write();
                /*********/
                cnt->epoch++;
                /*********/
                mb_write();
                /*********/
        } else
                prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}

JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx)
{
        prof_thr_cnt_t *told_cnt;

        cassert(config_prof);
        assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

        if (ptr != NULL) {
                assert(size == isalloc(ptr, true));
                if (opt_lg_prof_sample != 0) {
                        if (prof_sample_accum_update(size)) {
                                /*
                                 * Don't sample.  The size passed to
                                 * PROF_ALLOC_PREP() was larger than what
                                 * actually got allocated, so a backtrace was
                                 * captured for this allocation, even though
                                 * its actual size was insufficient to cross
                                 * the sample threshold.
                                 */
                                cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
                        }
                }
        }

        if ((uintptr_t)old_ctx > (uintptr_t)1U) {
                told_cnt = prof_lookup(old_ctx->bt);
                if (told_cnt == NULL) {
                        /*
                         * It's too late to propagate OOM for this realloc(),
                         * so operate directly on old_cnt->ctx->cnt_merged.
                         */
                        malloc_mutex_lock(old_ctx->lock);
                        old_ctx->cnt_merged.curobjs--;
                        old_ctx->cnt_merged.curbytes -= old_size;
                        malloc_mutex_unlock(old_ctx->lock);
                        told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
                }
        } else
                told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

        if ((uintptr_t)told_cnt > (uintptr_t)1U)
                told_cnt->epoch++;
        if ((uintptr_t)cnt > (uintptr_t)1U) {
                prof_ctx_set(ptr, cnt->ctx);
                cnt->epoch++;
        } else if (ptr != NULL)
                prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
        /*********/
        mb_write();
        /*********/
        if ((uintptr_t)told_cnt > (uintptr_t)1U) {
                told_cnt->cnts.curobjs--;
                told_cnt->cnts.curbytes -= old_size;
        }
        if ((uintptr_t)cnt > (uintptr_t)1U) {
                cnt->cnts.curobjs++;
                cnt->cnts.curbytes += size;
                if (opt_prof_accum) {
                        cnt->cnts.accumobjs++;
                        cnt->cnts.accumbytes += size;
                }
        }
        /*********/
        mb_write();
        /*********/
        if ((uintptr_t)told_cnt > (uintptr_t)1U)
                told_cnt->epoch++;
        if ((uintptr_t)cnt > (uintptr_t)1U)
                cnt->epoch++;
        /*********/
        mb_write(); /* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
        prof_ctx_t *ctx = prof_ctx_get(ptr);

        cassert(config_prof);

        if ((uintptr_t)ctx > (uintptr_t)1) {
                prof_thr_cnt_t *tcnt;
                assert(size == isalloc(ptr, true));
                tcnt = prof_lookup(ctx->bt);

                if (tcnt != NULL) {
                        tcnt->epoch++;
                        /*********/
                        mb_write();
                        /*********/
                        tcnt->cnts.curobjs--;
                        tcnt->cnts.curbytes -= size;
                        /*********/
                        mb_write();
                        /*********/
                        tcnt->epoch++;
                        /*********/
                        mb_write();
                        /*********/
                } else {
                        /*
                         * OOM during free() cannot be propagated, so operate
                         * directly on cnt->ctx->cnt_merged.
                         */
                        malloc_mutex_lock(ctx->lock);
                        ctx->cnt_merged.curobjs--;
                        ctx->cnt_merged.curbytes -= size;
                        malloc_mutex_unlock(ctx->lock);
                }
        }
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/