1 //===-- xray_profile_collector.cc ------------------------------*- C++ -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file is a part of XRay, a dynamic runtime instrumentation system.
12 // This implements the interface for the profileCollectorService.
14 //===----------------------------------------------------------------------===//
15 #include "xray_profile_collector.h"
16 #include "sanitizer_common/sanitizer_common.h"
17 #include "xray_allocator.h"
18 #include "xray_defs.h"
19 #include "xray_profiling_flags.h"
20 #include "xray_segmented_array.h"
26 namespace profileCollectorService {
// Guards all of the collector's global state below: the thread-data array,
// its allocators, and the serialized ProfileBuffers array.
30 SpinMutex GlobalMutex;
// Raw aligned storage for a FunctionCallTrie; avoids running a dynamic
// initializer at startup (objects are placement-new'ed into it later).
33 typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
// One serialized per-thread profile block (fields elided in this view;
// Data/Size are used by serialize() and nextBuffer() below).
36 struct ProfileBuffer {
41 // Current version of the profile format.
42 constexpr u64 XRayProfilingVersion = 0x20180424;
44 // Identifier for XRay profiling files 'xrayprof' in hex.
45 constexpr u64 XRayMagicBytes = 0x7872617970726f66;
// On-disk file header handed out as the very first buffer by nextBuffer().
// Magic and version are fixed at compile time; Timestamp and PID are
// filled in when the header is actually served to a reader.
47 struct XRayProfilingFileHeader {
48 const u64 MagicBytes = XRayMagicBytes;
49 const u64 Version = XRayProfilingVersion;
50 u64 Timestamp = 0; // System time in nanoseconds.
51 u64 PID = 0; // Process ID.
// Per-thread bookkeeping moved into the ThreadData array by post(): the
// buffers backing the trie's allocators, plus the allocators themselves
// (the enclosing struct's opening line is not visible in this chunk).
62 FunctionCallTrie::Allocators::Buffers Buffers;
63 FunctionCallTrie::Allocators Allocators;
68 using ThreadDataArray = Array<ThreadData>;
69 using ThreadDataAllocator = ThreadDataArray::AllocatorType;
71 // We use a separate buffer queue for the backing store for the allocator used
72 // by the ThreadData array. This lets us host the buffers, allocators, and tries
73 // associated with a thread by moving the data into the array instead of
74 // attempting to copy the data to a separately backed set of tries.
75 static typename std::aligned_storage<
76 sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage;
// BQ points into BufferQueueStorage once reset() placement-news the queue.
77 static BufferQueue *BQ = nullptr;
// Single buffer obtained from BQ; backs TDAllocator (released in reset()).
78 static BufferQueue::Buffer Buffer;
79 static typename std::aligned_storage<sizeof(ThreadDataAllocator),
80 alignof(ThreadDataAllocator)>::type
81 ThreadDataAllocatorStorage;
82 static typename std::aligned_storage<sizeof(ThreadDataArray),
83 alignof(ThreadDataArray)>::type
84 ThreadDataArrayStorage;
// Both pointers alias the aligned storage above after reset() runs.
86 static ThreadDataAllocator *TDAllocator = nullptr;
87 static ThreadDataArray *TDArray = nullptr;
89 using ProfileBufferArray = Array<ProfileBuffer>;
90 using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;
92 // These need to be global aligned storage to avoid dynamic initialization. We
93 // need these to be aligned to allow us to placement new objects into the
94 // storage, and have pointers to those objects be appropriately aligned.
95 static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
96 ProfileBuffersStorage;
97 static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
98 ProfileBufferArrayAllocatorStorage;
100 static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
101 static ProfileBufferArray *ProfileBuffers = nullptr;
103 // Use a global flag to determine whether the collector implementation has been
// initialized; reset() publishes 1 with a release store, and post(),
// serialize(), and nextBuffer() read it with acquire loads.
105 static atomic_uint8_t CollectorInitialized{0};
// Takes ownership of a thread's FunctionCallTrie (plus its allocators and
// backing buffers), typically at thread exit, and stashes the bundle in
// TDArray for later serialization. On any failure path the trie is
// destroyed and every buffer is handed back to the queue Q so nothing
// leaks. (Several statements between the visible lines are elided in this
// chunk, including the early returns.)
109 void post(BufferQueue *Q, FunctionCallTrie &&T,
110 FunctionCallTrie::Allocators &&A,
111 FunctionCallTrie::Allocators::Buffers &&B,
112 tid_t TId) XRAY_NEVER_INSTRUMENT {
113 DCHECK_NE(Q, nullptr);
115 // Bail out early if the collector has not been initialized.
116 if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
// Destroy the trie first (it is backed by these allocators/buffers),
// then release each of the four backing buffers to the queue.
117 T.~FunctionCallTrie();
119 Q->releaseBuffer(B.NodeBuffer);
120 Q->releaseBuffer(B.RootsBuffer);
121 Q->releaseBuffer(B.ShadowStackBuffer);
122 Q->releaseBuffer(B.NodeIdPairBuffer);
// All mutation of TDArray happens under the global lock.
128 SpinMutexLock Lock(&GlobalMutex);
129 DCHECK_NE(TDAllocator, nullptr);
130 DCHECK_NE(TDArray, nullptr);
// Move the whole bundle into the array; presumably AppendEmplace returns
// nullptr on allocation failure (condition's tail is elided) -- TODO confirm.
132 if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
134 // If we fail to add the data to the array, we should destroy the objects
136 T.~FunctionCallTrie();
138 Q->releaseBuffer(B.NodeBuffer);
139 Q->releaseBuffer(B.RootsBuffer);
140 Q->releaseBuffer(B.ShadowStackBuffer);
141 Q->releaseBuffer(B.NodeIdPairBuffer);
147 // A PathArray represents the function id's representing a stack trace. In this
148 // context a path is almost always represented from the leaf function in a call
149 // stack to a root of the call trie.
150 using PathArray = Array<int32_t>;
// Intermediate record produced while flattening a FunctionCallTrie: one
// per trie node, pairing the node with its leaf-to-root path of FIds.
152 struct ProfileRecord {
153 using PathAllocator = typename PathArray::AllocatorType;
155 // The Path in this record is the function id's from the leaf to the root of
156 // the function call stack as represented from a FunctionCallTrie.
// Non-owning pointer back into the trie; serializeRecords() reads the
// node's CallCount and CumulativeLocalTime through it.
158 const FunctionCallTrie::Node *Node;
163 using ProfileRecordArray = Array<ProfileRecord>;
165 // Walk a depth-first traversal of each root of the FunctionCallTrie to generate
166 // the path(s) and the data associated with the path.
// (The function's return-type/storage-class line is not visible in this
// chunk; interior lines such as the initial stack push are also elided.)
168 populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
169 const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
// Explicit DFS stack, capped by the stack_allocator_max flag, so we never
// recurse on the native stack.
170 using StackArray = Array<const FunctionCallTrie::Node *>;
171 using StackAllocator = typename StackArray::AllocatorType;
172 StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
173 StackArray DFSStack(StackAlloc);
174 for (const auto *R : Trie.getRoots()) {
176 while (!DFSStack.empty()) {
177 auto *Node = DFSStack.back();
// One ProfileRecord per visited node, created with an empty path that is
// filled in by the parent walk below.
181 auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
182 if (Record == nullptr)
184 DCHECK_NE(Record, nullptr);
186 // Traverse the Node's parents and as we're doing so, get the FIds in
187 // the order they appear.
188 for (auto N = Node; N != nullptr; N = N->Parent)
189 Record->Path.Append(N->FId);
190 DCHECK(!Record->Path.empty());
// Push each callee so the DFS continues into deeper frames.
192 for (const auto C : Node->Callees)
193 DFSStack.Append(C.NodePtr);
// Writes one thread's block into Buffer->Data: the BlockHeader first, then
// for each record the leaf-to-root function ids, a zero sentinel id, and
// the node's call count and cumulative local time. Several of the
// cursor-advancing statements are elided between the visible lines.
198 static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
199 const ProfileRecordArray &ProfileRecords)
200 XRAY_NEVER_INSTRUMENT {
// internal_memcpy returns the destination pointer, so each copy's result
// plus the size copied yields the next write cursor.
201 auto NextPtr = static_cast<uint8_t *>(
202 internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
204 for (const auto &Record : ProfileRecords) {
205 // List of IDs follow:
206 for (const auto FId : Record.Path)
208 static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
211 // Add the sentinel here.
212 constexpr int32_t SentinelFId = 0;
213 NextPtr = static_cast<uint8_t *>(
214 internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
217 // Add the node data here.
219 static_cast<uint8_t *>(internal_memcpy(
220 NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
221 sizeof(Record.Node->CallCount);
222 NextPtr = static_cast<uint8_t *>(
223 internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
224 sizeof(Record.Node->CumulativeLocalTime))) +
225 sizeof(Record.Node->CumulativeLocalTime);
// Sanity check: the writes above must have filled the buffer exactly, per
// the size computed by serialize() when it allocated B->Data.
228 DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
// Flattens every thread's trie in TDArray into the global ProfileBuffers
// array, producing one serialized block per thread. All work happens
// under GlobalMutex; scratch arenas are freed on scope exit. (Early
// returns and the declaration of the block counter I are elided in this
// chunk.)
233 void serialize() XRAY_NEVER_INSTRUMENT {
234 if (!atomic_load(&CollectorInitialized, memory_order_acquire))
237 SpinMutexLock Lock(&GlobalMutex);
239 // Clear out the global ProfileBuffers, if it's not empty.
240 for (auto &B : *ProfileBuffers)
241 deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
242 ProfileBuffers->trim(ProfileBuffers->size());
244 DCHECK_NE(TDArray, nullptr);
245 if (TDArray->empty())
248 // Then repopulate the global ProfileBuffers.
250 auto MaxSize = profilingFlags()->global_allocator_max;
251 auto ProfileArena = allocateBuffer(MaxSize);
252 if (ProfileArena == nullptr)
// Both arenas live only for the duration of this function; at_scope_exit
// guarantees deallocation on every exit path.
255 auto ProfileArenaCleanup = at_scope_exit(
256 [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });
258 auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
259 if (PathArena == nullptr)
262 auto PathArenaCleanup = at_scope_exit(
263 [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });
265 for (const auto &ThreadTrie : *TDArray) {
266 using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
267 ProfileRecordAllocator PRAlloc(ProfileArena,
268 profilingFlags()->global_allocator_max);
269 ProfileRecord::PathAllocator PathAlloc(
270 PathArena, profilingFlags()->global_allocator_max);
271 ProfileRecordArray ProfileRecords(PRAlloc);
273 // First, we want to compute the amount of space we're going to need. We'll
274 // use a local allocator and an __xray::Array<...> to store the intermediary
275 // data, then compute the size as we're going along. Then we'll allocate the
276 // contiguous space to contain the thread buffer data.
277 if (ThreadTrie.FCT.getRoots().empty())
280 populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
281 DCHECK(!ThreadTrie.FCT.getRoots().empty());
282 DCHECK(!ProfileRecords.empty());
284 // Go through each record, to compute the sizes.
286 // header size = block size (4 bytes)
287 // + block number (4 bytes)
288 // + thread id (8 bytes)
289 // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
290 // + call count (8 bytes)
291 // + local time (8 bytes)
292 // + end of record (8 bytes)
293 u32 CumulativeSizes = 0;
294 for (const auto &Record : ProfileRecords)
295 CumulativeSizes += 20 + (4 * Record.Path.size());
// 16 matches the header-size breakdown above; I is the running block
// number (declared on a line not visible here) used by nextBuffer() to
// chain blocks together.
297 BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
298 auto B = ProfileBuffers->Append({});
299 B->Size = sizeof(Header) + CumulativeSizes;
300 B->Data = allocateBuffer(B->Size);
301 DCHECK_NE(B->Data, nullptr);
302 serializeRecords(B, Header, ProfileRecords);
// Tears down all collector state and rebuilds it in the global aligned
// storage, finally publishing CollectorInitialized = 1. The leading store
// of 0 makes concurrent post()/serialize() calls bail out while the
// rebuild is in progress. (Closing braces and some error-handling lines
// are elided in this chunk.)
306 void reset() XRAY_NEVER_INSTRUMENT {
307 atomic_store(&CollectorInitialized, 0, memory_order_release);
308 SpinMutexLock Lock(&GlobalMutex);
310 if (ProfileBuffers != nullptr) {
311 // Clear out the profile buffers that have been serialized.
312 for (auto &B : *ProfileBuffers)
313 deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
314 ProfileBuffers->trim(ProfileBuffers->size());
315 ProfileBuffers = nullptr;
318 if (TDArray != nullptr) {
319 // Release the resources as required.
320 for (auto &TD : *TDArray) {
321 TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
322 TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
323 TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
324 TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
326 // We don't bother destroying the array here because we've already
327 // potentially freed the backing store for the array. Instead we're going to
328 // reset the pointer to nullptr, and re-use the storage later instead
329 // (placement-new'ing into the storage as-is).
333 if (TDAllocator != nullptr) {
334 TDAllocator->~Allocator();
335 TDAllocator = nullptr;
338 if (Buffer.Data != nullptr) {
339 BQ->releaseBuffer(Buffer);
// Rebuild the buffer queue in its aligned storage; Success is filled in
// by the constructor -- presumably checked on a line not visible here.
343 bool Success = false;
344 new (&BufferQueueStorage)
345 BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
348 BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
352 if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
353 BufferQueue::ErrorCode::Ok)
357 if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
// Placement-new the allocator/array objects into their aligned storage
// and point the module-level pointers at them.
360 new (&ProfileBufferArrayAllocatorStorage)
361 ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
362 ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
363 &ProfileBufferArrayAllocatorStorage);
365 new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
367 reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);
// The ThreadData allocator is backed by the single Buffer taken from BQ.
369 new (&ThreadDataAllocatorStorage)
370 ThreadDataAllocator(Buffer.Data, Buffer.Size);
372 reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
373 new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
374 TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);
// Publish the rebuilt state; readers observe it with acquire loads.
376 atomic_store(&CollectorInitialized, 1, memory_order_release);
// Iterator-style reader API: called with an empty XRayBuffer it returns
// the file header; called with the previously returned buffer it returns
// the next serialized block, chained via the BlockNum stored in each
// block's header. (The terminal "no more buffers" return and some closing
// braces are elided in this chunk.)
379 XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
380 SpinMutexLock Lock(&GlobalMutex);
382 if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
// The file header lives in function-local aligned storage and is
// constructed exactly once via pthread_once, avoiding both a dynamic
// initializer and a racy first call.
385 static pthread_once_t Once = PTHREAD_ONCE_INIT;
386 static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
389 &Once, +[]() XRAY_NEVER_INSTRUMENT {
390 new (&FileHeaderStorage) XRayProfilingFileHeader{};
393 if (UNLIKELY(B.Data == nullptr)) {
394 // The first buffer should always contain the file header information.
396 *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
// Stamp the header with the current time and PID at read time.
397 FileHeader.Timestamp = NanoTime();
398 FileHeader.PID = internal_getpid();
399 return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
// Once the caller has consumed the header, the first profile block
// follows; after that, the previous block's BlockNum selects the next.
402 if (UNLIKELY(B.Data == &FileHeaderStorage))
403 return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};
406 internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
407 auto NextBlock = Header.BlockNum + 1;
408 if (NextBlock < ProfileBuffers->size())
409 return {(*ProfileBuffers)[NextBlock].Data,
410 (*ProfileBuffers)[NextBlock].Size};
414 } // namespace profileCollectorService
415 } // namespace __xray