/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#define PROF_PREFIX_DEFAULT             "jeprof"
#define LG_PROF_SAMPLE_DEFAULT          19
#define LG_PROF_INTERVAL_DEFAULT        -1

/*
 * Hard limit on stack backtrace depth.  The version of prof_backtrace() that
 * is based on __builtin_return_address() necessarily has a hard-coded number
 * of backtrace frame handlers, and should be kept in sync with this setting.
 */
#define PROF_BT_MAX                     128

/* Maximum number of backtraces to store in each per thread LRU cache. */
#define PROF_TCMAX                      1024

/* Initial hash table size. */
#define PROF_CKH_MINITEMS               64

/* Size of memory buffer to use when writing dump files. */
#define PROF_DUMP_BUFSIZE               65536

/* Size of stack-allocated buffer used by prof_printf(). */
#define PROF_PRINTF_BUFSIZE             128

/*
 * Number of mutexes shared among all ctx's.  No space is allocated for these
 * unless profiling is enabled, so it's okay to over-provision.
 */
#define PROF_NCTX_LOCKS                 1024

/*
 * prof_tdata pointers close to NULL are used to encode state information that
 * is used for cleaning up during thread shutdown.
 */
#define PROF_TDATA_STATE_REINCARNATED   ((prof_tdata_t *)(uintptr_t)1)
#define PROF_TDATA_STATE_PURGATORY      ((prof_tdata_t *)(uintptr_t)2)
#define PROF_TDATA_STATE_MAX            PROF_TDATA_STATE_PURGATORY

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
        /* Backtrace, stored as len program counters. */
        void            **vec;
        unsigned        len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
        prof_bt_t       *bt;
        unsigned        nignore;
        unsigned        max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
        /*
         * Profiling counters.  An allocation/deallocation pair can operate on
         * different prof_thr_cnt_t objects that are linked into the same
         * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
         * negative.  In principle it is possible for the *bytes counters to
         * overflow/underflow, but a general solution would require something
         * like 128-bit counters; this implementation doesn't bother to solve
         * that problem.
         */
        int64_t         curobjs;
        int64_t         curbytes;
        uint64_t        accumobjs;
        uint64_t        accumbytes;
};

struct prof_thr_cnt_s {
        /* Linkage into prof_ctx_t's cnts_ql. */
        ql_elm(prof_thr_cnt_t)  cnts_link;

        /* Linkage into thread's LRU. */
        ql_elm(prof_thr_cnt_t)  lru_link;

        /*
         * Associated context.  If a thread frees an object that it did not
         * allocate, it is possible that the context is not cached in the
         * thread's hash table, in which case it must be able to look up the
         * context, insert a new prof_thr_cnt_t into the thread's hash table,
         * and link it into the prof_ctx_t's cnts_ql.
         */
        prof_ctx_t              *ctx;

        /*
         * Threads use memory barriers to update the counters.  Since there is
         * only ever one writer, the only challenge is for the reader to get a
         * consistent read of the counters.
         *
         * The writer uses this series of operations:
         *
         * 1) Increment epoch to an odd number.
         * 2) Update counters.
         * 3) Increment epoch to an even number.
         *
         * The reader must assure 1) that the epoch is even while it reads the
         * counters, and 2) that the epoch doesn't change between the time it
         * starts and finishes reading the counters.
         */
        unsigned                epoch;

        /* Profiling counters. */
        prof_cnt_t              cnts;
};
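
/*
 * Illustrative sketch (not part of the upstream header): a reader that wants
 * a consistent snapshot of the counters above could apply the epoch protocol
 * roughly as follows.  The read_barrier() name is a stand-in; only the
 * writer-side mb_write() appears in this file, and the actual dump code in
 * prof.c may implement the read side differently.
 *
 *      prof_cnt_t snap;
 *      unsigned e0, e1;
 *
 *      do {
 *              e0 = cnt->epoch;
 *              read_barrier();                 (hypothetical read barrier)
 *              snap = cnt->cnts;
 *              read_barrier();                 (hypothetical read barrier)
 *              e1 = cnt->epoch;
 *      } while ((e0 & 1U) != 0 || e0 != e1);
 *
 * An odd epoch means the writer is mid-update; a changed epoch means the
 * counters were overwritten while being copied, so the read is retried.
 */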

struct prof_ctx_s {
        /* Associated backtrace. */
        prof_bt_t               *bt;

        /* Protects nlimbo, cnt_merged, and cnts_ql. */
        malloc_mutex_t          *lock;

        /*
         * Number of threads that currently cause this ctx to be in a state of
         * limbo due to one of:
         *   - Initializing per thread counters associated with this ctx.
         *   - Preparing to destroy this ctx.
         * nlimbo must be 1 (single destroyer) in order to safely destroy the
         * ctx.
         */
        unsigned                nlimbo;

        /* Temporary storage for summation during dump. */
        prof_cnt_t              cnt_summed;

        /* When threads exit, they merge their stats into cnt_merged. */
        prof_cnt_t              cnt_merged;

        /*
         * List of profile counters, one for each thread that has allocated in
         * this context.
         */
        ql_head(prof_thr_cnt_t) cnts_ql;
};

struct prof_tdata_s {
        /*
         * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
         * cache of backtraces, with associated thread-specific prof_thr_cnt_t
         * objects.  Other threads may read the prof_thr_cnt_t contents, but no
         * others will ever write them.
         *
         * Upon thread exit, the thread must merge all the prof_thr_cnt_t
         * counter data into the associated prof_ctx_t objects, and unlink/free
         * the prof_thr_cnt_t objects.
         */
        ckh_t                   bt2cnt;

        /* LRU for contents of bt2cnt. */
        ql_head(prof_thr_cnt_t) lru_ql;

        /* Backtrace vector, used for calls to prof_backtrace(). */
        void                    **vec;

        /* Sampling state. */
        uint64_t                prng_state;
        uint64_t                threshold;
        uint64_t                accum;

        /* State used to avoid dumping while operating on prof internals. */
        bool                    enq;
        bool                    enq_idump;
        bool                    enq_gdump;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool     opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool     opt_prof_active;
extern size_t   opt_lg_prof_sample;   /* Mean bytes between samples. */
extern ssize_t  opt_lg_prof_interval; /* lg(prof_interval). */
extern bool     opt_prof_gdump;       /* High-water memory dumping. */
extern bool     opt_prof_final;       /* Final profile dumping. */
extern bool     opt_prof_leak;        /* Dump leak summary at exit. */
extern bool     opt_prof_accum;       /* Report cumulative bytes. */
extern char     opt_prof_prefix[PATH_MAX + 1];

/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t prof_interval;

/*
 * If true, promote small sampled objects to large objects, since small run
 * headers do not have embedded profile context pointers.
 */
extern bool     prof_promote;

void    bt_init(prof_bt_t *bt, void **vec);
void    prof_backtrace(prof_bt_t *bt, unsigned nignore);
prof_thr_cnt_t  *prof_lookup(prof_bt_t *bt);
void    prof_idump(void);
bool    prof_mdump(const char *filename);
void    prof_gdump(void);
prof_tdata_t    *prof_tdata_init(void);
void    prof_tdata_cleanup(void *arg);
void    prof_boot0(void);
void    prof_boot1(void);
bool    prof_boot2(void);
void    prof_prefork(void);
void    prof_postfork_parent(void);
void    prof_postfork_child(void);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#define PROF_ALLOC_PREP(nignore, size, ret) do {                        \
        prof_tdata_t *prof_tdata;                                       \
        prof_bt_t bt;                                                   \
                                                                        \
        assert(size == s2u(size));                                      \
                                                                        \
        prof_tdata = prof_tdata_get(true);                              \
        if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) { \
                if (prof_tdata != NULL)                                 \
                        ret = (prof_thr_cnt_t *)(uintptr_t)1U;          \
                else                                                    \
                        ret = NULL;                                     \
                break;                                                  \
        }                                                               \
                                                                        \
        if (opt_prof_active == false) {                                 \
                /* Sampling is currently inactive, so avoid sampling. */\
                ret = (prof_thr_cnt_t *)(uintptr_t)1U;                  \
        } else if (opt_lg_prof_sample == 0) {                           \
                /* Don't bother with sampling logic, since sampling   */\
                /* interval is 1.                                     */\
                bt_init(&bt, prof_tdata->vec);                          \
                prof_backtrace(&bt, nignore);                           \
                ret = prof_lookup(&bt);                                 \
        } else {                                                        \
                if (prof_tdata->threshold == 0) {                       \
                        /* Initialize.  Seed the prng differently for */\
                        /* each thread.                               */\
                        prof_tdata->prng_state =                        \
                            (uint64_t)(uintptr_t)&size;                 \
                        prof_sample_threshold_update(prof_tdata);       \
                }                                                       \
                                                                        \
                /* Determine whether to capture a backtrace based on  */\
                /* whether size is enough for prof_accum to reach     */\
                /* prof_tdata->threshold.  However, delay updating    */\
                /* these variables until prof_{m,re}alloc(), because  */\
                /* we don't know for sure that the allocation will    */\
                /* succeed.                                           */\
                /*                                                    */\
                /* Use subtraction rather than addition to avoid      */\
                /* potential integer overflow.                        */\
                if (size >= prof_tdata->threshold -                     \
                    prof_tdata->accum) {                                \
                        bt_init(&bt, prof_tdata->vec);                  \
                        prof_backtrace(&bt, nignore);                   \
                        ret = prof_lookup(&bt);                         \
                } else                                                  \
                        ret = (prof_thr_cnt_t *)(uintptr_t)1U;          \
        }                                                               \
} while (0)
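
/*
 * Illustrative sketch (not part of the upstream header) of how a caller is
 * expected to use PROF_ALLOC_PREP().  The imalloc() name is only a stand-in
 * for whatever internal allocation path the caller uses; the real call sites
 * live in jemalloc.c.
 *
 *      prof_thr_cnt_t *cnt;
 *      size_t usize = s2u(size);
 *
 *      PROF_ALLOC_PREP(1, usize, cnt);
 *      if (cnt == NULL)
 *              return (NULL);                  (prof_tdata_init() failed)
 *      void *p = imalloc(usize);               (stand-in allocation call)
 *      if (p == NULL)
 *              return (NULL);
 *      prof_malloc(p, usize, cnt);             (commit the sample decision)
 *
 * ret/cnt ends up NULL on internal OOM, (prof_thr_cnt_t *)1U when the
 * allocation should not be sampled, or a real counter object whose backtrace
 * has already been captured.
 */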

#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)

prof_tdata_t    *prof_tdata_get(bool create);
void    prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t      *prof_ctx_get(const void *ptr);
void    prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
bool    prof_sample_accum_update(size_t size);
void    prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
void    prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx);
void    prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
malloc_tsd_externs(prof_tdata, prof_tdata_t *)
malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
    prof_tdata_cleanup)

JEMALLOC_INLINE prof_tdata_t *
prof_tdata_get(bool create)
{
        prof_tdata_t *prof_tdata;

        cassert(config_prof);

        prof_tdata = *prof_tdata_tsd_get();
        if (create && prof_tdata == NULL)
                prof_tdata = prof_tdata_init();

        return (prof_tdata);
}

JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
        uint64_t r;
        double u;

        cassert(config_prof);

        /*
         * Compute sample threshold as a geometrically distributed random
         * variable with mean (2^opt_lg_prof_sample).
         *
         *                         __        __
         *                         |  log(u)  |                     1
         * prof_tdata->threshold = | -------- |, where p = -------------------
         *                         | log(1-p) |             opt_lg_prof_sample
         *                                                 2
         *
         * For more information on the math, see:
         *
         *   Non-Uniform Random Variate Generation
         *   Luc Devroye
         *   Springer-Verlag, New York, 1986
         *   pp 500
         *   (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
         */
        prng64(r, 53, prof_tdata->prng_state,
            UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
        u = (double)r * (1.0/9007199254740992.0L);
        prof_tdata->threshold = (uint64_t)(log(u) /
            log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
            + (uint64_t)1U;
}
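
/*
 * Worked example (added for illustration; numbers are approximate): with the
 * default LG_PROF_SAMPLE_DEFAULT of 19, p = 2^-19, so the geometric
 * distribution above has mean 2^19 = 524288 bytes (512 KiB) of allocation
 * between samples.  For a particular draw u = 0.5, the computed threshold is
 * ceil(log(0.5) / log(1 - 2^-19)) ~= 363409 bytes.
 */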

JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
        prof_ctx_t *ret;
        arena_chunk_t *chunk;

        cassert(config_prof);
        assert(ptr != NULL);

        chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
        if (chunk != ptr) {
                /* Region. */
                ret = arena_prof_ctx_get(ptr);
        } else
                ret = huge_prof_ctx_get(ptr);

        return (ret);
}

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
        arena_chunk_t *chunk;

        cassert(config_prof);
        assert(ptr != NULL);

        chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
        if (chunk != ptr) {
                /* Region. */
                arena_prof_ctx_set(ptr, ctx);
        } else
                huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size)
{
        prof_tdata_t *prof_tdata;

        cassert(config_prof);
        /* Sampling logic is unnecessary if the interval is 1. */
        assert(opt_lg_prof_sample != 0);

        prof_tdata = prof_tdata_get(false);
        if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
                return (true);

        /* Take care to avoid integer overflow. */
        if (size >= prof_tdata->threshold - prof_tdata->accum) {
                prof_tdata->accum -= (prof_tdata->threshold - size);
                /* Compute new sample threshold. */
                prof_sample_threshold_update(prof_tdata);
                while (prof_tdata->accum >= prof_tdata->threshold) {
                        prof_tdata->accum -= prof_tdata->threshold;
                        prof_sample_threshold_update(prof_tdata);
                }
                return (false);
        } else {
                prof_tdata->accum += size;
                return (true);
        }
}

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
{

        cassert(config_prof);
        assert(ptr != NULL);
        assert(size == isalloc(ptr, true));

        if (opt_lg_prof_sample != 0) {
                if (prof_sample_accum_update(size)) {
                        /*
                         * Don't sample.  For malloc()-like allocation, it is
                         * always possible to tell in advance how large an
                         * object's usable size will be, so there should never
                         * be a difference between the size passed to
                         * PROF_ALLOC_PREP() and prof_malloc().
                         */
                        assert((uintptr_t)cnt == (uintptr_t)1U);
                }
        }

        if ((uintptr_t)cnt > (uintptr_t)1U) {
                prof_ctx_set(ptr, cnt->ctx);

                cnt->epoch++;
                /*********/
                mb_write();
                /*********/
                cnt->cnts.curobjs++;
                cnt->cnts.curbytes += size;
                if (opt_prof_accum) {
                        cnt->cnts.accumobjs++;
                        cnt->cnts.accumbytes += size;
                }
                /*********/
                mb_write();
                /*********/
                cnt->epoch++;
                /*********/
                mb_write();
                /*********/
        } else
                prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}

JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
    size_t old_size, prof_ctx_t *old_ctx)
{
        prof_thr_cnt_t *told_cnt;

        cassert(config_prof);
        assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

        if (ptr != NULL) {
                assert(size == isalloc(ptr, true));
                if (opt_lg_prof_sample != 0) {
                        if (prof_sample_accum_update(size)) {
                                /*
                                 * Don't sample.  The size passed to
                                 * PROF_ALLOC_PREP() was larger than what
                                 * actually got allocated, so a backtrace was
                                 * captured for this allocation, even though
                                 * its actual size was insufficient to cross
                                 * the sample threshold.
                                 */
                                cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
                        }
                }
        }

        if ((uintptr_t)old_ctx > (uintptr_t)1U) {
                told_cnt = prof_lookup(old_ctx->bt);
                if (told_cnt == NULL) {
                        /*
                         * It's too late to propagate OOM for this realloc(),
                         * so operate directly on old_cnt->ctx->cnt_merged.
                         */
                        malloc_mutex_lock(old_ctx->lock);
                        old_ctx->cnt_merged.curobjs--;
                        old_ctx->cnt_merged.curbytes -= old_size;
                        malloc_mutex_unlock(old_ctx->lock);
                        told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
                }
        } else
                told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

        if ((uintptr_t)told_cnt > (uintptr_t)1U)
                told_cnt->epoch++;
        if ((uintptr_t)cnt > (uintptr_t)1U) {
                prof_ctx_set(ptr, cnt->ctx);
                cnt->epoch++;
        } else if (ptr != NULL)
                prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
        /*********/
        mb_write();
        /*********/
        if ((uintptr_t)told_cnt > (uintptr_t)1U) {
                told_cnt->cnts.curobjs--;
                told_cnt->cnts.curbytes -= old_size;
        }
        if ((uintptr_t)cnt > (uintptr_t)1U) {
                cnt->cnts.curobjs++;
                cnt->cnts.curbytes += size;
                if (opt_prof_accum) {
                        cnt->cnts.accumobjs++;
                        cnt->cnts.accumbytes += size;
                }
        }
        /*********/
        mb_write();
        /*********/
        if ((uintptr_t)told_cnt > (uintptr_t)1U)
                told_cnt->epoch++;
        if ((uintptr_t)cnt > (uintptr_t)1U)
                cnt->epoch++;
        /*********/
        mb_write(); /* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
        prof_ctx_t *ctx = prof_ctx_get(ptr);

        cassert(config_prof);

        if ((uintptr_t)ctx > (uintptr_t)1) {
                prof_thr_cnt_t *tcnt;
                assert(size == isalloc(ptr, true));
                tcnt = prof_lookup(ctx->bt);

                if (tcnt != NULL) {
                        tcnt->epoch++;
                        /*********/
                        mb_write();
                        /*********/
                        tcnt->cnts.curobjs--;
                        tcnt->cnts.curbytes -= size;
                        /*********/
                        mb_write();
                        /*********/
                        tcnt->epoch++;
                        /*********/
                        mb_write();
                        /*********/
                } else {
                        /*
                         * OOM during free() cannot be propagated, so operate
                         * directly on cnt->ctx->cnt_merged.
                         */
                        malloc_mutex_lock(ctx->lock);
                        ctx->cnt_merged.curobjs--;
                        ctx->cnt_merged.curbytes -= size;
                        malloc_mutex_unlock(ctx->lock);
                }
        }
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/