//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is AMDGPU specific replacement of the standard inliner.
/// The main purpose is to account for the fact that calls are not only
/// expensive on the AMDGPU, but much more expensive if a private memory
/// pointer is passed to a function as an argument. In this situation, we are
/// unable to eliminate private memory in the caller unless inlined and end up
/// with slow and expensive scratch access. Thus, we boost the inline threshold
/// for such functions.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/Inliner.h"
38 #define DEBUG_TYPE "inline"
41 ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000),
42 cl::desc("Cost of alloca argument"));
44 // If the amount of scratch memory to eliminate exceeds our ability to allocate
45 // it into registers we gain nothing by aggressively inlining functions for that
47 static cl::opt<unsigned>
48 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
49 cl::desc("Maximum alloca size to use for inline cost"));
51 // Inliner constraint to achieve reasonable compilation time
52 static cl::opt<size_t>
53 MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
54 cl::desc("Maximum BB number allowed in a function after inlining"
55 " (compile time constraint)"));
59 class AMDGPUInliner : public LegacyInlinerBase {
62 AMDGPUInliner() : LegacyInlinerBase(ID) {
63 initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry());
64 Params = getInlineParams();
67 static char ID; // Pass identification, replacement for typeid
69 unsigned getInlineThreshold(CallBase &CB) const;
71 InlineCost getInlineCost(CallBase &CB) override;
73 bool runOnSCC(CallGraphSCC &SCC) override;
75 void getAnalysisUsage(AnalysisUsage &AU) const override;
78 TargetTransformInfoWrapperPass *TTIWP;
83 } // end anonymous namespace
85 char AMDGPUInliner::ID = 0;
86 INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline",
87 "AMDGPU Function Integration/Inlining", false, false)
88 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
89 INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
90 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
91 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
92 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
93 INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline",
94 "AMDGPU Function Integration/Inlining", false, false)
96 Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); }
98 bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) {
99 TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
100 return LegacyInlinerBase::runOnSCC(SCC);
103 void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const {
104 AU.addRequired<TargetTransformInfoWrapperPass>();
105 LegacyInlinerBase::getAnalysisUsage(AU);
108 unsigned AMDGPUInliner::getInlineThreshold(CallBase &CB) const {
109 int Thres = Params.DefaultThreshold;
111 Function *Caller = CB.getCaller();
112 // Listen to the inlinehint attribute when it would increase the threshold
113 // and the caller does not need to minimize its size.
114 Function *Callee = CB.getCalledFunction();
115 bool InlineHint = Callee && !Callee->isDeclaration() &&
116 Callee->hasFnAttribute(Attribute::InlineHint);
117 if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres
118 && !Caller->hasFnAttribute(Attribute::MinSize))
119 Thres = Params.HintThreshold.getValue() *
120 TTIWP->getTTI(*Callee).getInliningThresholdMultiplier();
122 const DataLayout &DL = Caller->getParent()->getDataLayout();
124 return (unsigned)Thres;
126 // If we have a pointer to private array passed into a function
127 // it will not be optimized out, leaving scratch usage.
128 // Increase the inline threshold to allow inliniting in this case.
129 uint64_t AllocaSize = 0;
130 SmallPtrSet<const AllocaInst *, 8> AIVisited;
131 for (Value *PtrArg : CB.args()) {
132 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
133 if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
134 Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
137 PtrArg = GetUnderlyingObject(PtrArg, DL);
138 if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
139 if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
141 AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
142 // If the amount of stack memory is excessive we will not be able
143 // to get rid of the scratch anyway, bail out.
144 if (AllocaSize > ArgAllocaCutoff) {
151 Thres += ArgAllocaCost;
153 return (unsigned)Thres;
156 // Check if call is just a wrapper around another call.
157 // In this case we only have call and ret instructions.
158 static bool isWrapperOnlyCall(CallBase &CB) {
159 Function *Callee = CB.getCalledFunction();
160 if (!Callee || Callee->size() != 1)
162 const BasicBlock &BB = Callee->getEntryBlock();
163 if (const Instruction *I = BB.getFirstNonPHI()) {
164 if (!isa<CallInst>(I)) {
167 if (isa<ReturnInst>(*std::next(I->getIterator()))) {
168 LLVM_DEBUG(dbgs() << " Wrapper only call detected: "
169 << Callee->getName() << '\n');
176 InlineCost AMDGPUInliner::getInlineCost(CallBase &CB) {
177 Function *Callee = CB.getCalledFunction();
178 Function *Caller = CB.getCaller();
180 if (!Callee || Callee->isDeclaration())
181 return llvm::InlineCost::getNever("undefined callee");
184 return llvm::InlineCost::getNever("noinline");
186 TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
187 if (!TTI.areInlineCompatible(Caller, Callee))
188 return llvm::InlineCost::getNever("incompatible");
190 if (CB.hasFnAttr(Attribute::AlwaysInline)) {
191 auto IsViable = isInlineViable(*Callee);
192 if (IsViable.isSuccess())
193 return llvm::InlineCost::getAlways("alwaysinline viable");
194 return llvm::InlineCost::getNever(IsViable.getFailureReason());
197 if (isWrapperOnlyCall(CB))
198 return llvm::InlineCost::getAlways("wrapper-only call");
200 InlineParams LocalParams = Params;
201 LocalParams.DefaultThreshold = (int)getInlineThreshold(CB);
202 bool RemarksEnabled = false;
203 const auto &BBs = Caller->getBasicBlockList();
205 auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
207 RemarksEnabled = true;
210 OptimizationRemarkEmitter ORE(Caller);
211 auto GetAssumptionCache = [this](Function &F) -> AssumptionCache & {
212 return ACT->getAssumptionCache(F);
215 auto IC = llvm::getInlineCost(CB, Callee, LocalParams, TTI,
216 GetAssumptionCache, GetTLI, nullptr, PSI,
217 RemarksEnabled ? &ORE : nullptr);
219 if (IC && !IC.isAlways() && !Callee->hasFnAttribute(Attribute::InlineHint)) {
220 // Single BB does not increase total BB amount, thus subtract 1
221 size_t Size = Caller->size() + Callee->size() - 1;
222 if (MaxBB && Size > MaxBB)
223 return llvm::InlineCost::getNever("max number of bb exceeded");