//===-- xray_function_call_trie.h ------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This file defines the interface for a function call trie.
//
//===----------------------------------------------------------------------===//
#ifndef XRAY_FUNCTION_CALL_TRIE_H
#define XRAY_FUNCTION_CALL_TRIE_H

#include "xray_buffer_queue.h"
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <limits>
#include <memory> // For placement new.
#include <utility>

namespace __xray {
/// A FunctionCallTrie represents the stack traces of XRay instrumented
/// functions that we've encountered, where a node corresponds to a function and
/// the path from the root to the node its stack trace. Each node in the trie
/// will contain some useful values, including:
///
///   * The cumulative amount of time spent in this particular node/stack.
///   * The number of times this stack has appeared.
///   * A histogram of latencies for that particular node.
///
/// Each node in the trie will also contain a list of callees, represented using
/// a Array<NodeIdPair> -- each NodeIdPair instance will contain the function
/// ID of the callee, and a pointer to the node.
///
/// If we visualise this data structure, we'll find the following potential
/// representation:
///
///   [function id node] -> [callees] [cumulative time]
///                         [call counter] [latency histogram]
///
/// As an example, when we have a function in this pseudocode:
///
///   func f(N) {
///     g()
///     h()
///     for i := 1..N { j() }
///   }
///
/// We may end up with a trie of the following form:
///
///   f -> [ g, h, j ] [...] [1] [...]
///   g -> [ ... ] [...] [1] [...]
///   h -> [ ... ] [...] [1] [...]
///   j -> [ ... ] [...] [N] [...]
///
/// If for instance the function g() called j() like so:
///
///   func g() {
///     for i := 1..10 { j() }
///   }
///
/// We'll find the following updated trie:
///
///   f -> [ g, h, j ] [...] [1] [...]
///   g -> [ j' ] [...] [1] [...]
///   h -> [ ... ] [...] [1] [...]
///   j -> [ ... ] [...] [N] [...]
///   j' -> [ ... ] [...] [10] [...]
///
/// Note that we'll have a new node representing the path `f -> g -> j'` with
/// isolated data. This isolation gives us a means of representing the stack
/// traces as a path, as opposed to a key in a table. The alternative
/// implementation here would be to use a separate table for the path, and use
/// hashes of the path as an identifier to accumulate the information. We've
/// moved away from this approach as it takes a lot of time to compute the hash
/// every time we need to update a function's call information as we're handling
/// the entry and exit events.
///
/// This approach allows us to maintain a shadow stack, which represents the
/// currently executing path, and on function exits quickly compute the amount
/// of time elapsed from the entry, then update the counters for the node
/// already represented in the trie. This necessitates an efficient
/// representation of the various data structures (the list of callees must be
/// cache-aware and efficient to look up, and the histogram must be compact and
/// quick to update) to enable us to keep the overheads of this implementation
/// to the minimum.
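///
/// A rough usage sketch (illustrative only; the profiling runtime typically
/// manages per-thread tries and their allocators itself, and the FId/TSC/CPU
/// values below are made up):
///
///   auto A = FunctionCallTrie::InitAllocators();
///   FunctionCallTrie T(A);
///   T.enterFunction(/*FId=*/1, /*TSC=*/100, /*CPU=*/0);
///   T.enterFunction(/*FId=*/2, /*TSC=*/120, /*CPU=*/0);
///   T.exitFunction(/*FId=*/2, /*TSC=*/170, /*CPU=*/0);
///   T.exitFunction(/*FId=*/1, /*TSC=*/200, /*CPU=*/0);
///   // T.getRoots() now holds a single root for function 1, which has
///   // function 2 as its only callee.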
class FunctionCallTrie {
public:
  struct Node;

  // We use a NodeIdPair type instead of a std::pair<...> to not rely on the
  // standard library types in this header.
  struct NodeIdPair {
    Node *NodePtr;
    int32_t FId;
  };

  using NodeIdPairArray = Array<NodeIdPair>;
  using NodeIdPairAllocatorType = NodeIdPairArray::AllocatorType;

  // A Node in the FunctionCallTrie gives us a list of callees, the cumulative
  // number of times this node actually appeared, the cumulative amount of time
  // for this particular node including its children call times, and just the
  // local time spent on this node. Each Node will have the ID of the XRay
  // instrumented function that it is associated to.
  struct Node {
    Node *Parent;
    NodeIdPairArray Callees;
    uint64_t CallCount;
    uint64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
    int32_t FId;

    // TODO: Include the compact histogram.
  };

private:
  struct ShadowStackEntry {
    uint64_t EntryTSC;
    Node *NodePtr;
    uint16_t EntryCPU;
  };

  using NodeArray = Array<Node>;
  using RootArray = Array<Node *>;
  using ShadowStackArray = Array<ShadowStackEntry>;
public:
  // We collate the allocators we need into a single struct, as a convenience to
  // allow us to initialize these as a group.
  struct Allocators {
    using NodeAllocatorType = NodeArray::AllocatorType;
    using RootAllocatorType = RootArray::AllocatorType;
    using ShadowStackAllocatorType = ShadowStackArray::AllocatorType;

    // Use hosted aligned storage members to allow for trivial move and init.
    // This also allows us to sidestep the potential-failing allocation issue.
    typename std::aligned_storage<sizeof(NodeAllocatorType),
                                  alignof(NodeAllocatorType)>::type
        NodeAllocatorStorage;
    typename std::aligned_storage<sizeof(RootAllocatorType),
                                  alignof(RootAllocatorType)>::type
        RootAllocatorStorage;
    typename std::aligned_storage<sizeof(ShadowStackAllocatorType),
                                  alignof(ShadowStackAllocatorType)>::type
        ShadowStackAllocatorStorage;
    typename std::aligned_storage<sizeof(NodeIdPairAllocatorType),
                                  alignof(NodeIdPairAllocatorType)>::type
        NodeIdPairAllocatorStorage;

    NodeAllocatorType *NodeAllocator = nullptr;
    RootAllocatorType *RootAllocator = nullptr;
    ShadowStackAllocatorType *ShadowStackAllocator = nullptr;
    NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;

    Allocators() = default;
    Allocators(const Allocators &) = delete;
    Allocators &operator=(const Allocators &) = delete;

    struct Buffers {
      BufferQueue::Buffer NodeBuffer;
      BufferQueue::Buffer RootsBuffer;
      BufferQueue::Buffer ShadowStackBuffer;
      BufferQueue::Buffer NodeIdPairBuffer;
    };
    explicit Allocators(Buffers &B) XRAY_NEVER_INSTRUMENT {
      new (&NodeAllocatorStorage)
          NodeAllocatorType(B.NodeBuffer.Data, B.NodeBuffer.Size);
      NodeAllocator =
          reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);

      new (&RootAllocatorStorage)
          RootAllocatorType(B.RootsBuffer.Data, B.RootsBuffer.Size);
      RootAllocator =
          reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);

      new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(
          B.ShadowStackBuffer.Data, B.ShadowStackBuffer.Size);
      ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
          &ShadowStackAllocatorStorage);

      new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(
          B.NodeIdPairBuffer.Data, B.NodeIdPairBuffer.Size);
      NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
          &NodeIdPairAllocatorStorage);
    }
    explicit Allocators(uptr Max) XRAY_NEVER_INSTRUMENT {
      new (&NodeAllocatorStorage) NodeAllocatorType(Max);
      NodeAllocator =
          reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);

      new (&RootAllocatorStorage) RootAllocatorType(Max);
      RootAllocator =
          reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);

      new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(Max);
      ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
          &ShadowStackAllocatorStorage);

      new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(Max);
      NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
          &NodeIdPairAllocatorStorage);
    }
    Allocators(Allocators &&O) XRAY_NEVER_INSTRUMENT {
      // Here we rely on the safety of memcpy'ing contents of the storage
      // members, and then pointing the source pointers to nullptr.
      internal_memcpy(&NodeAllocatorStorage, &O.NodeAllocatorStorage,
                      sizeof(NodeAllocatorType));
      internal_memcpy(&RootAllocatorStorage, &O.RootAllocatorStorage,
                      sizeof(RootAllocatorType));
      internal_memcpy(&ShadowStackAllocatorStorage,
                      &O.ShadowStackAllocatorStorage,
                      sizeof(ShadowStackAllocatorType));
      internal_memcpy(&NodeIdPairAllocatorStorage,
                      &O.NodeIdPairAllocatorStorage,
                      sizeof(NodeIdPairAllocatorType));

      NodeAllocator =
          reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);
      RootAllocator =
          reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);
      ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
          &ShadowStackAllocatorStorage);
      NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
          &NodeIdPairAllocatorStorage);

      O.NodeAllocator = nullptr;
      O.RootAllocator = nullptr;
      O.ShadowStackAllocator = nullptr;
      O.NodeIdPairAllocator = nullptr;
    }
    Allocators &operator=(Allocators &&O) XRAY_NEVER_INSTRUMENT {
      // When moving into an existing instance, we ensure that we clean up the
      // current allocators.
      if (NodeAllocator)
        NodeAllocator->~NodeAllocatorType();
      if (O.NodeAllocator) {
        new (&NodeAllocatorStorage)
            NodeAllocatorType(std::move(*O.NodeAllocator));
        NodeAllocator =
            reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);
        O.NodeAllocator = nullptr;
      } else {
        NodeAllocator = nullptr;
      }

      if (RootAllocator)
        RootAllocator->~RootAllocatorType();
      if (O.RootAllocator) {
        new (&RootAllocatorStorage)
            RootAllocatorType(std::move(*O.RootAllocator));
        RootAllocator =
            reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);
        O.RootAllocator = nullptr;
      } else {
        RootAllocator = nullptr;
      }

      if (ShadowStackAllocator)
        ShadowStackAllocator->~ShadowStackAllocatorType();
      if (O.ShadowStackAllocator) {
        new (&ShadowStackAllocatorStorage)
            ShadowStackAllocatorType(std::move(*O.ShadowStackAllocator));
        ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
            &ShadowStackAllocatorStorage);
        O.ShadowStackAllocator = nullptr;
      } else {
        ShadowStackAllocator = nullptr;
      }

      if (NodeIdPairAllocator)
        NodeIdPairAllocator->~NodeIdPairAllocatorType();
      if (O.NodeIdPairAllocator) {
        new (&NodeIdPairAllocatorStorage)
            NodeIdPairAllocatorType(std::move(*O.NodeIdPairAllocator));
        NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
            &NodeIdPairAllocatorStorage);
        O.NodeIdPairAllocator = nullptr;
      } else {
        NodeIdPairAllocator = nullptr;
      }

      return *this;
    }
    ~Allocators() XRAY_NEVER_INSTRUMENT {
      if (NodeAllocator != nullptr)
        NodeAllocator->~NodeAllocatorType();
      if (RootAllocator != nullptr)
        RootAllocator->~RootAllocatorType();
      if (ShadowStackAllocator != nullptr)
        ShadowStackAllocator->~ShadowStackAllocatorType();
      if (NodeIdPairAllocator != nullptr)
        NodeIdPairAllocator->~NodeIdPairAllocatorType();
    }
  };

  static Allocators InitAllocators() XRAY_NEVER_INSTRUMENT {
    return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max);
  }

  static Allocators InitAllocatorsCustom(uptr Max) XRAY_NEVER_INSTRUMENT {
    Allocators A(Max);
    return A;
  }

  static Allocators
  InitAllocatorsFromBuffers(Allocators::Buffers &Bufs) XRAY_NEVER_INSTRUMENT {
    Allocators A(Bufs);
    return A;
  }

private:
  NodeArray Nodes;
  RootArray Roots;
  ShadowStackArray ShadowStack;
  NodeIdPairAllocatorType *NodeIdPairAllocator;
  uint32_t OverflowedFunctions;

public:
  explicit FunctionCallTrie(const Allocators &A) XRAY_NEVER_INSTRUMENT
      : Nodes(*A.NodeAllocator),
        Roots(*A.RootAllocator),
        ShadowStack(*A.ShadowStackAllocator),
        NodeIdPairAllocator(A.NodeIdPairAllocator),
        OverflowedFunctions(0) {}

  FunctionCallTrie() = delete;
  FunctionCallTrie(const FunctionCallTrie &) = delete;
  FunctionCallTrie &operator=(const FunctionCallTrie &) = delete;

  FunctionCallTrie(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT
      : Nodes(std::move(O.Nodes)),
        Roots(std::move(O.Roots)),
        ShadowStack(std::move(O.ShadowStack)),
        NodeIdPairAllocator(O.NodeIdPairAllocator),
        OverflowedFunctions(O.OverflowedFunctions) {}

  FunctionCallTrie &operator=(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT {
    Nodes = std::move(O.Nodes);
    Roots = std::move(O.Roots);
    ShadowStack = std::move(O.ShadowStack);
    NodeIdPairAllocator = O.NodeIdPairAllocator;
    OverflowedFunctions = O.OverflowedFunctions;
    return *this;
  }

  ~FunctionCallTrie() XRAY_NEVER_INSTRUMENT {}
  void enterFunction(const int32_t FId, uint64_t TSC,
                     uint16_t CPU) XRAY_NEVER_INSTRUMENT {
    DCHECK_NE(FId, 0);

    // If we've already overflowed the function call stack, do not bother
    // attempting to record any more function entries.
    if (UNLIKELY(OverflowedFunctions)) {
      ++OverflowedFunctions;
      return;
    }

    // If this is the first function we've encountered, we want to set up the
    // node(s) and treat it as a root.
    if (UNLIKELY(ShadowStack.empty())) {
      auto *NewRoot = Nodes.AppendEmplace(
          nullptr, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId);
      if (UNLIKELY(NewRoot == nullptr))
        return;
      if (Roots.AppendEmplace(NewRoot) == nullptr) {
        Nodes.trim(1);
        return;
      }
      if (ShadowStack.AppendEmplace(TSC, NewRoot, CPU) == nullptr) {
        Nodes.trim(1);
        Roots.trim(1);
        ++OverflowedFunctions;
        return;
      }
      return;
    }

    // From this point on, we require that the stack is not empty.
    DCHECK(!ShadowStack.empty());
    auto TopNode = ShadowStack.back().NodePtr;
    DCHECK_NE(TopNode, nullptr);

    // If we've seen this callee before, then we access that node and place that
    // on the top of the stack.
    auto *Callee = TopNode->Callees.find_element(
        [FId](const NodeIdPair &NR) { return NR.FId == FId; });
    if (Callee != nullptr) {
      CHECK_NE(Callee->NodePtr, nullptr);
      if (ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU) == nullptr)
        ++OverflowedFunctions;
      return;
    }

    // This means we've never seen this stack before, create a new node here.
    auto *NewNode = Nodes.AppendEmplace(
        TopNode, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId);
    if (UNLIKELY(NewNode == nullptr))
      return;
    DCHECK_NE(NewNode, nullptr);
    TopNode->Callees.AppendEmplace(NewNode, FId);
    if (ShadowStack.AppendEmplace(TSC, NewNode, CPU) == nullptr)
      ++OverflowedFunctions;
    return;
  }
  void exitFunction(int32_t FId, uint64_t TSC,
                    uint16_t CPU) XRAY_NEVER_INSTRUMENT {
    // If we're exiting functions that have "overflowed" or don't fit into the
    // stack due to allocator constraints, we then decrement that count first.
    if (OverflowedFunctions) {
      --OverflowedFunctions;
      return;
    }

    // When we exit a function, we look up the ShadowStack to see whether we've
    // entered this function before. We do as little processing here as we can,
    // since most of the hard work would have already been done at function
    // entry.
    uint64_t CumulativeTreeTime = 0;

    while (!ShadowStack.empty()) {
      const auto &Top = ShadowStack.back();
      auto TopNode = Top.NodePtr;
      DCHECK_NE(TopNode, nullptr);

      // We may encounter overflow on the TSC we're provided, which may end up
      // being less than the TSC when we first entered the function.
      //
      // To get the accurate measurement of cycles, we need to check whether
      // we've overflowed (TSC < Top.EntryTSC) and then account the difference
      // between the entry TSC and the max for the TSC counter (max of uint64_t)
      // then add the value of TSC. We can prove that the maximum delta we will
      // get is at most the 64-bit unsigned value, since the difference between
      // a TSC of 0 and a Top.EntryTSC of 1 is (numeric_limits<uint64_t>::max()
      // - 1) + 0.
      //
      // NOTE: This assumes that TSCs are synchronised across CPUs.
      // TODO: Count the number of times we've seen CPU migrations.
      uint64_t LocalTime =
          TSC < Top.EntryTSC
              ? (std::numeric_limits<uint64_t>::max() - Top.EntryTSC) + TSC
              : TSC - Top.EntryTSC;
      TopNode->CallCount++;
      TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime;
      CumulativeTreeTime += LocalTime;
      ShadowStack.trim(1);

      // TODO: Update the histogram for the node.
      if (TopNode->FId == FId)
        break;
    }
  }

  const RootArray &getRoots() const XRAY_NEVER_INSTRUMENT { return Roots; }
  // The deepCopyInto operation will update the provided FunctionCallTrie by
  // re-creating the contents of this particular FunctionCallTrie in the other
  // FunctionCallTrie. It will do this using a Depth First Traversal from the
  // roots, and while doing so recreating the traversal in the provided
  // FunctionCallTrie.
  //
  // This operation will *not* destroy the state in `O`, and thus may cause some
  // duplicate entries in `O` if it is not empty.
  //
  // This function is *not* thread-safe, and may require external
  // synchronisation of both "this" and |O|.
  //
  // This function must *not* be called with a non-empty FunctionCallTrie |O|.
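  //
  // A minimal sketch of the intended use (hypothetical names; typically done
  // when copying a thread's trie into freshly allocated storage):
  //
  //   auto CopyAllocators = FunctionCallTrie::InitAllocators();
  //   FunctionCallTrie Copy(CopyAllocators);
  //   ThreadLocalTrie.deepCopyInto(Copy);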
  void deepCopyInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT {
    DCHECK(O.getRoots().empty());

    // We then push the root into a stack, to use as the parent marker for new
    // nodes we push in as we're traversing depth-first down the call tree.
    struct NodeAndParent {
      FunctionCallTrie::Node *Node;
      FunctionCallTrie::Node *NewNode;
    };
    using Stack = Array<NodeAndParent>;

    typename Stack::AllocatorType StackAllocator(
        profilingFlags()->stack_allocator_max);
    Stack DFSStack(StackAllocator);

    for (const auto Root : getRoots()) {
      // Add a node in O for this root.
      auto NewRoot = O.Nodes.AppendEmplace(
          nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), Root->CallCount,
          Root->CumulativeLocalTime, Root->FId);

      // Because we cannot allocate more memory we should bail out right away.
      if (UNLIKELY(NewRoot == nullptr))
        return;

      if (UNLIKELY(O.Roots.Append(NewRoot) == nullptr))
        return;

      // TODO: Figure out what to do if we fail to allocate any more stack
      // space. Maybe warn or report once?
      if (DFSStack.AppendEmplace(Root, NewRoot) == nullptr)
        return;
      while (!DFSStack.empty()) {
        NodeAndParent NP = DFSStack.back();
        DCHECK_NE(NP.Node, nullptr);
        DCHECK_NE(NP.NewNode, nullptr);
        DFSStack.trim(1);
        for (const auto Callee : NP.Node->Callees) {
          auto NewNode = O.Nodes.AppendEmplace(
              NP.NewNode, NodeIdPairArray(*O.NodeIdPairAllocator),
              Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime,
              Callee.FId);
          if (UNLIKELY(NewNode == nullptr))
            return;
          if (UNLIKELY(NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId) ==
                       nullptr))
            return;
          if (UNLIKELY(DFSStack.AppendEmplace(Callee.NodePtr, NewNode) ==
                       nullptr))
            return;
        }
      }
    }
  }
  // The mergeInto operation will update the provided FunctionCallTrie by
  // traversing the current trie's roots and updating (i.e. merging) the data in
  // the nodes with the data in the target's nodes. If the node doesn't exist in
  // the provided trie, we add a new one in the right position, and inherit the
  // data from the original (current) trie, along with all its callees.
  //
  // This function is *not* thread-safe, and may require external
  // synchronisation of both "this" and |O|.
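  //
  // A minimal sketch of the intended use (hypothetical names; e.g. combining
  // several per-thread tries into a single aggregate trie):
  //
  //   FunctionCallTrie Aggregate(AggregateAllocators);
  //   ThreadTrieA.mergeInto(Aggregate);
  //   ThreadTrieB.mergeInto(Aggregate);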
  void mergeInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT {
    struct NodeAndTarget {
      FunctionCallTrie::Node *OrigNode;
      FunctionCallTrie::Node *TargetNode;
    };
    using Stack = Array<NodeAndTarget>;
    typename Stack::AllocatorType StackAllocator(
        profilingFlags()->stack_allocator_max);
    Stack DFSStack(StackAllocator);

    for (const auto Root : getRoots()) {
      Node *TargetRoot = nullptr;
      auto R = O.Roots.find_element(
          [&](const Node *Node) { return Node->FId == Root->FId; });
      if (R == nullptr) {
        TargetRoot = O.Nodes.AppendEmplace(
            nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u,
            Root->FId);
        if (UNLIKELY(TargetRoot == nullptr))
          return;

        O.Roots.Append(TargetRoot);
      } else {
        TargetRoot = *R;
      }

      DFSStack.AppendEmplace(Root, TargetRoot);
      while (!DFSStack.empty()) {
        NodeAndTarget NT = DFSStack.back();
        DCHECK_NE(NT.OrigNode, nullptr);
        DCHECK_NE(NT.TargetNode, nullptr);
        DFSStack.trim(1);
        // TODO: Update the histogram as well when we have it ready.
        NT.TargetNode->CallCount += NT.OrigNode->CallCount;
        NT.TargetNode->CumulativeLocalTime += NT.OrigNode->CumulativeLocalTime;
        for (const auto Callee : NT.OrigNode->Callees) {
          auto TargetCallee = NT.TargetNode->Callees.find_element(
              [&](const FunctionCallTrie::NodeIdPair &C) {
                return C.FId == Callee.FId;
              });
          if (TargetCallee == nullptr) {
            auto NewTargetNode = O.Nodes.AppendEmplace(
                NT.TargetNode, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u,
                Callee.FId);

            if (UNLIKELY(NewTargetNode == nullptr))
              return;

            TargetCallee =
                NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId);
          }
          DFSStack.AppendEmplace(Callee.NodePtr, TargetCallee->NodePtr);
        }
      }
    }
  }
};

} // namespace __xray

#endif // XRAY_FUNCTION_CALL_TRIE_H