1 #ifndef JEMALLOC_INTERNAL_TSD_H
2 #define JEMALLOC_INTERNAL_TSD_H
4 #include "jemalloc/internal/arena_types.h"
5 #include "jemalloc/internal/assert.h"
6 #include "jemalloc/internal/bin_types.h"
7 #include "jemalloc/internal/jemalloc_internal_externs.h"
8 #include "jemalloc/internal/prof_types.h"
9 #include "jemalloc/internal/ql.h"
10 #include "jemalloc/internal/rtree_tsd.h"
11 #include "jemalloc/internal/tcache_types.h"
12 #include "jemalloc/internal/tcache_structs.h"
13 #include "jemalloc/internal/util.h"
14 #include "jemalloc/internal/witness.h"
17 * Thread-Specific-Data layout
18 * --- data accessed on tcache fast path: state, rtree_ctx, stats, prof ---
21 * m: thread_allocated (config_stats)
22 * f: thread_deallocated (config_stats)
23 * p: prof_tdata (config_prof)
24 * c: rtree_ctx (rtree cache accessed on deallocation)
26 * --- data not accessed on tcache fast path: arena-related fields ---
27 * d: arenas_tdata_bypass
33 * Loading TSD data is on the critical path of basically all malloc operations.
34 * In particular, tcache and rtree_ctx rely on hot CPU cache to be effective.
35 * Use a compact layout to reduce cache footprint.
36 * +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+
37 * |---------------------------- 1st cacheline ----------------------------|
38 * | sedrxxxx mmmmmmmm ffffffff pppppppp [c * 32 ........ ........ .......] |
39 * |---------------------------- 2nd cacheline ----------------------------|
40 * | [c * 64 ........ ........ ........ ........ ........ ........ .......] |
41 * |---------------------------- 3rd cacheline ----------------------------|
42 * | [c * 32 ........ ........ .......] iiiiiiii aaaaaaaa oooooooo [t...... |
43 * +-------------------------------------------------------------------------+
44 * Note: the entire tcache is embedded into TSD and spans multiple cachelines.
46 * The last 3 members (i, a and o) before tcache aren't really needed on tcache
47 * fast path. However we have a number of unused tcache bins and witnesses
48 * (never touched unless config_debug) at the end of tcache, so we place them
49 * there to avoid breaking the cachelines and possibly paging in an extra page.
52 typedef void (*test_callback_t)(int *);
53 # define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10
54 # define MALLOC_TEST_TSD \
55 O(test_data, int, int) \
56 O(test_callback, test_callback_t, int)
57 # define MALLOC_TEST_TSD_INITIALIZER , MALLOC_TSD_TEST_DATA_INIT, NULL
59 # define MALLOC_TEST_TSD
60 # define MALLOC_TEST_TSD_INITIALIZER
63 /* O(name, type, nullable type) */
65 O(tcache_enabled, bool, bool) \
66 O(arenas_tdata_bypass, bool, bool) \
67 O(reentrancy_level, int8_t, int8_t) \
68 O(narenas_tdata, uint32_t, uint32_t) \
69 O(offset_state, uint64_t, uint64_t) \
70 O(thread_allocated, uint64_t, uint64_t) \
71 O(thread_deallocated, uint64_t, uint64_t) \
72 O(bytes_until_sample, int64_t, int64_t) \
73 O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
74 O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \
75 O(iarena, arena_t *, arena_t *) \
76 O(arena, arena_t *, arena_t *) \
77 O(arenas_tdata, arena_tdata_t *, arena_tdata_t *)\
78 O(binshards, tsd_binshards_t, tsd_binshards_t)\
79 O(tcache, tcache_t, tcache_t) \
80 O(witness_tsd, witness_tsd_t, witness_tsdn_t) \
83 #define TSD_INITIALIZER { \
84 ATOMIC_INIT(tsd_state_uninitialized), \
85 TCACHE_ENABLED_ZERO_INITIALIZER, \
94 RTREE_CTX_ZERO_INITIALIZER, \
98 TSD_BINSHARDS_ZERO_INITIALIZER, \
99 TCACHE_ZERO_INITIALIZER, \
100 WITNESS_TSD_INITIALIZER \
101 MALLOC_TEST_TSD_INITIALIZER \
104 void *malloc_tsd_malloc(size_t size);
105 void malloc_tsd_dalloc(void *wrapper);
106 void malloc_tsd_cleanup_register(bool (*f)(void));
107 tsd_t *malloc_tsd_boot0(void);
108 void malloc_tsd_boot1(void);
109 void tsd_cleanup(void *arg);
110 tsd_t *tsd_fetch_slow(tsd_t *tsd, bool internal);
111 void tsd_state_set(tsd_t *tsd, uint8_t new_state);
112 void tsd_slow_update(tsd_t *tsd);
113 void tsd_prefork(tsd_t *tsd);
114 void tsd_postfork_parent(tsd_t *tsd);
115 void tsd_postfork_child(tsd_t *tsd);
118 * Call ..._inc when your module wants to take all threads down the slow paths,
119 * and ..._dec when it no longer needs to.
121 void tsd_global_slow_inc(tsdn_t *tsdn);
122 void tsd_global_slow_dec(tsdn_t *tsdn);
123 bool tsd_global_slow();
126 /* Common case --> jnz. */
127 tsd_state_nominal = 0,
128 /* Initialized but on slow path. */
129 tsd_state_nominal_slow = 1,
131 * Some thread has changed global state in such a way that all nominal
132 * threads need to recompute their fast / slow status the next time they
135 * Any thread can change another thread's status *to* recompute, but
136 * threads are the only ones who can change their status *from*
139 tsd_state_nominal_recompute = 2,
141 * The above nominal states should be lower values. We use
142 * tsd_nominal_max to separate nominal states from threads in the
143 * process of being born / dying.
145 tsd_state_nominal_max = 2,
148 * A thread might free() during its death as its only allocator action;
149 * in such scenarios, we need tsd, but set up in such a way that no
150 * cleanup is necessary.
152 tsd_state_minimal_initialized = 3,
153 /* States during which we know we're in thread death. */
154 tsd_state_purgatory = 4,
155 tsd_state_reincarnated = 5,
157 * What it says on the tin; tsd that hasn't been initialized. Note
158 * that even when the tsd struct lives in TLS, we need to keep track
159 * of stuff like whether or not our pthread destructors have been
160 * scheduled, so this really truly is different than the nominal state.
162 tsd_state_uninitialized = 6
166 * Some TSD accesses can only be done in a nominal state. To enforce this, we
167 * wrap TSD member access in a function that asserts on TSD state, and mangle
168 * field names to prevent touching them accidentally.
170 #define TSD_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n
172 #ifdef JEMALLOC_U8_ATOMICS
173 # define tsd_state_t atomic_u8_t
174 # define tsd_atomic_load atomic_load_u8
175 # define tsd_atomic_store atomic_store_u8
176 # define tsd_atomic_exchange atomic_exchange_u8
178 # define tsd_state_t atomic_u32_t
179 # define tsd_atomic_load atomic_load_u32
180 # define tsd_atomic_store atomic_store_u32
181 # define tsd_atomic_exchange atomic_exchange_u32
184 /* The actual tsd. */
187 * The contents should be treated as totally opaque outside the tsd
188 * module. Access any thread-local state through the getters and
193 * We manually limit the state to just a single byte, unless the 8-bit
194 * atomics are unavailable (which is rare).
197 #define O(n, t, nt) \
201 /* AddressSanitizer requires TLS data to be aligned to at least 8 bytes. */
202 } JEMALLOC_ALIGNED(16);
204 JEMALLOC_ALWAYS_INLINE uint8_t
205 tsd_state_get(tsd_t *tsd) {
207 * This should be atomic. Unfortunately, compilers right now can't tell
208 * that this can be done as a memory comparison, and forces a load into
209 * a register that hurts fast-path performance.
211 /* return atomic_load_u8(&tsd->state, ATOMIC_RELAXED); */
212 return *(uint8_t *)&tsd->state;
216 * Wrapper around tsd_t that makes it possible to avoid implicit conversion
217 * between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be
218 * explicitly converted to tsd_t, which is non-nullable.
223 #define TSDN_NULL ((tsdn_t *)0)
224 JEMALLOC_ALWAYS_INLINE tsdn_t *
225 tsd_tsdn(tsd_t *tsd) {
226 return (tsdn_t *)tsd;
229 JEMALLOC_ALWAYS_INLINE bool
230 tsdn_null(const tsdn_t *tsdn) {
234 JEMALLOC_ALWAYS_INLINE tsd_t *
235 tsdn_tsd(tsdn_t *tsdn) {
236 assert(!tsdn_null(tsdn));
242 * We put the platform-specific data declarations and inlines into their own
243 * header files to avoid cluttering this file. They define tsd_boot0,
244 * tsd_boot1, tsd_boot, tsd_booted_get, tsd_get_allocates, tsd_get, and tsd_set.
246 #ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
247 #include "jemalloc/internal/tsd_malloc_thread_cleanup.h"
248 #elif (defined(JEMALLOC_TLS))
249 #include "jemalloc/internal/tsd_tls.h"
250 #elif (defined(_WIN32))
251 #include "jemalloc/internal/tsd_win.h"
253 #include "jemalloc/internal/tsd_generic.h"
257 * tsd_foop_get_unsafe(tsd) returns a pointer to the thread-local instance of
258 * foo. This omits some safety checks, and so can be used during tsd
259 * initialization and cleanup.
261 #define O(n, t, nt) \
262 JEMALLOC_ALWAYS_INLINE t * \
263 tsd_##n##p_get_unsafe(tsd_t *tsd) { \
264 return &tsd->TSD_MANGLE(n); \
269 /* tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo. */
270 #define O(n, t, nt) \
271 JEMALLOC_ALWAYS_INLINE t * \
272 tsd_##n##p_get(tsd_t *tsd) { \
274 * Because the state might change asynchronously if it's \
275 * nominal, we need to make sure that we only read it once. \
277 uint8_t state = tsd_state_get(tsd); \
278 assert(state == tsd_state_nominal || \
279 state == tsd_state_nominal_slow || \
280 state == tsd_state_nominal_recompute || \
281 state == tsd_state_reincarnated || \
282 state == tsd_state_minimal_initialized); \
283 return tsd_##n##p_get_unsafe(tsd); \
289 * tsdn_foop_get(tsdn) returns either the thread-local instance of foo (if tsdn
290 * isn't NULL), or NULL (if tsdn is NULL), cast to the nullable pointer type.
292 #define O(n, t, nt) \
293 JEMALLOC_ALWAYS_INLINE nt * \
294 tsdn_##n##p_get(tsdn_t *tsdn) { \
295 if (tsdn_null(tsdn)) { \
298 tsd_t *tsd = tsdn_tsd(tsdn); \
299 return (nt *)tsd_##n##p_get(tsd); \
304 /* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */
305 #define O(n, t, nt) \
306 JEMALLOC_ALWAYS_INLINE t \
307 tsd_##n##_get(tsd_t *tsd) { \
308 return *tsd_##n##p_get(tsd); \
313 /* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */
314 #define O(n, t, nt) \
315 JEMALLOC_ALWAYS_INLINE void \
316 tsd_##n##_set(tsd_t *tsd, t val) { \
317 assert(tsd_state_get(tsd) != tsd_state_reincarnated && \
318 tsd_state_get(tsd) != tsd_state_minimal_initialized); \
319 *tsd_##n##p_get(tsd) = val; \
324 JEMALLOC_ALWAYS_INLINE void
325 tsd_assert_fast(tsd_t *tsd) {
327 * Note that our fastness assertion does *not* include global slowness
328 * counters; it's not in general possible to ensure that they won't
329 * change asynchronously from underneath us.
331 assert(!malloc_slow && tsd_tcache_enabled_get(tsd) &&
332 tsd_reentrancy_level_get(tsd) == 0);
335 JEMALLOC_ALWAYS_INLINE bool
336 tsd_fast(tsd_t *tsd) {
337 bool fast = (tsd_state_get(tsd) == tsd_state_nominal);
339 tsd_assert_fast(tsd);
345 JEMALLOC_ALWAYS_INLINE tsd_t *
346 tsd_fetch_impl(bool init, bool minimal) {
347 tsd_t *tsd = tsd_get(init);
349 if (!init && tsd_get_allocates() && tsd == NULL) {
354 if (unlikely(tsd_state_get(tsd) != tsd_state_nominal)) {
355 return tsd_fetch_slow(tsd, minimal);
357 assert(tsd_fast(tsd));
358 tsd_assert_fast(tsd);
363 /* Get a minimal TSD that requires no cleanup. See comments in free(). */
364 JEMALLOC_ALWAYS_INLINE tsd_t *
365 tsd_fetch_min(void) {
366 return tsd_fetch_impl(true, true);
369 /* For internal background threads use only. */
370 JEMALLOC_ALWAYS_INLINE tsd_t *
371 tsd_internal_fetch(void) {
372 tsd_t *tsd = tsd_fetch_min();
373 /* Use reincarnated state to prevent full initialization. */
374 tsd_state_set(tsd, tsd_state_reincarnated);
379 JEMALLOC_ALWAYS_INLINE tsd_t *
381 return tsd_fetch_impl(true, false);
385 tsd_nominal(tsd_t *tsd) {
386 return (tsd_state_get(tsd) <= tsd_state_nominal_max);
389 JEMALLOC_ALWAYS_INLINE tsdn_t *
391 if (!tsd_booted_get()) {
395 return tsd_tsdn(tsd_fetch_impl(false, false));
398 JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
399 tsd_rtree_ctx(tsd_t *tsd) {
400 return tsd_rtree_ctxp_get(tsd);
403 JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
404 tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback) {
406 * If tsd cannot be accessed, initialize the fallback rtree_ctx and
407 * return a pointer to it.
409 if (unlikely(tsdn_null(tsdn))) {
410 rtree_ctx_data_init(fallback);
413 return tsd_rtree_ctx(tsdn_tsd(tsdn));
416 #endif /* JEMALLOC_INTERNAL_TSD_H */