1 //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// This pass does misc. AMDGPU optimizations on IR before instruction
14 //===----------------------------------------------------------------------===//
17 #include "AMDGPUIntrinsicInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "AMDGPUTargetMachine.h"
21 #include "llvm/Analysis/DivergenceAnalysis.h"
22 #include "llvm/CodeGen/Passes.h"
23 #include "llvm/IR/InstVisitor.h"
24 #include "llvm/IR/IRBuilder.h"
25 #include "llvm/Support/Debug.h"
26 #include "llvm/Support/raw_ostream.h"
28 #define DEBUG_TYPE "amdgpu-codegenprepare"
// Legacy-PM FunctionPass performing misc. AMDGPU-specific IR optimizations
// before instruction selection. Uses InstVisitor<..., bool>: each visit
// method returns true iff it mutated the IR.
34 class AMDGPUCodeGenPrepare : public FunctionPass,
35 public InstVisitor<AMDGPUCodeGenPrepare, bool> {
// Target machine; may be null when the pass is default-constructed
// (runOnFunction checks for this and bails out).
36 const GCNTargetMachine *TM;
// Per-function subtarget, fetched in runOnFunction.
37 const SISubtarget *ST;
// Required analysis (see getAnalysisUsage); cached in runOnFunction.
38 DivergenceAnalysis *DA;
// TM defaults to nullptr so the pass can also be created by the generic
// pass machinery without a target machine.
44 AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
46 TM(static_cast<const GCNTargetMachine *>(TM)),
50 HasUnsafeFPMath(false) { }
// Rewrites eligible f32 fdiv into the fast-division intrinsic; returns
// true when the IR was changed.
52 bool visitFDiv(BinaryOperator &I);
// Fallback for all instruction kinds this pass does not handle.
54 bool visitInstruction(Instruction &I) {
58 bool doInitialization(Module &M) override;
59 bool runOnFunction(Function &F) override;
61 const char *getPassName() const override {
62 return "AMDGPU IR optimizations";
65 void getAnalysisUsage(AnalysisUsage &AU) const override {
66 AU.addRequired<DivergenceAnalysis>();
71 } // End anonymous namespace
// Returns true when an f32 fdiv with numerator Num should be kept as a plain
// fdiv rather than lowered to the fast intrinsic: under unsafe math, or when
// the numerator is exactly +1.0 (a pure reciprocal, which gets its own
// lowering).
73 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
74 const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
// NOTE(review): CNum is dereferenced below; an elided line between these two
// presumably handles the null (non-constant numerator) case — confirm.
78 // Reciprocal f32 is handled separately without denormals.
79 return UnsafeDiv || CNum->isExactlyValue(+1.0);
82 // Insert an intrinsic for fast fdiv for safe math situations where we can
83 // reduce precision. Leave fdiv for situations where the generic node is
84 // expected to be optimized.
85 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
86 Type *Ty = FDiv.getType();
// Only scalar or vector f32 divisions are candidates.
89 if (!Ty->getScalarType()->isFloatTy())
// !fpmath metadata carries the allowed error (in ULPs) for this division.
92 MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
96 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
// NOTE(review): ULP is presumably compared against the intrinsic's accuracy
// threshold on an elided line — confirm before relying on this path.
97 float ULP = FPOp->getFPAccuracy();
101 FastMathFlags FMF = FPOp->getFastMathFlags();
// "Unsafe" division: function-level unsafe-fp-math, or per-instruction
// fast-math flags that already permit reduced precision.
102 bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
103 FMF.allowReciprocal();
// With f32 denormals enabled, only proceed when unsafe math allows it.
104 if (ST->hasFP32Denormals() && !UnsafeDiv)
// Build the replacement right after the fdiv, propagating fast-math flags,
// !fpmath metadata, and the debug location.
107 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
108 Builder.setFastMathFlags(FMF);
109 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
111 const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
113 = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
115 Value *Num = FDiv.getOperand(0);
116 Value *Den = FDiv.getOperand(1);
118 Value *NewFDiv = nullptr;
// Vector case: the intrinsic is scalar, so extract/insert element-wise.
120 if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
121 NewFDiv = UndefValue::get(VT);
123 // FIXME: Doesn't do the right thing for cases where the vector is partially
124 // constant. This works when the scalarizer pass is run first.
125 for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
126 Value *NumEltI = Builder.CreateExtractElement(Num, I);
127 Value *DenEltI = Builder.CreateExtractElement(Den, I);
// Keep a plain fdiv for elements expected to optimize better generically
// (unsafe math, or exact +1.0 numerators).
130 if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
131 NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
133 NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
136 NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
// Scalar case: call the intrinsic unless the plain fdiv should be kept.
139 if (!shouldKeepFDivF32(Num, UnsafeDiv))
140 NewFDiv = Builder.CreateCall(Decl, { Num, Den });
// Swap in the replacement, preserve the original's name, and erase it.
144 FDiv.replaceAllUsesWith(NewFDiv);
145 NewFDiv->takeName(&FDiv);
146 FDiv.eraseFromParent();
// Returns true when the function carries the "unsafe-fp-math"="true"
// string attribute (set from the frontend / -enable-unsafe-fp-math).
152 static bool hasUnsafeFPMath(const Function &F) {
153 Attribute Attr = F.getFnAttribute("unsafe-fp-math");
154 return Attr.getValueAsString() == "true";
// Module-level setup hook of the legacy pass framework.
// NOTE(review): body not visible in this view — presumably caches the Module
// for use by visit methods; confirm against the full file.
157 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
162 bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
// No target machine (default-constructed pass) or function skipped by the
// pass manager (e.g. optnone/opt-bisect): do nothing.
163 if (!TM || skipFunction(F))
// Cache per-function state used by the visit methods.
166 ST = &TM->getSubtarget<SISubtarget>(F);
167 DA = &getAnalysis<DivergenceAnalysis>();
168 HasUnsafeFPMath = hasUnsafeFPMath(F);
170 bool MadeChange = false;
172 for (BasicBlock &BB : F) {
173 BasicBlock::iterator Next;
// Next is advanced (on an elided line) before visiting, so a visit method
// erasing the current instruction (as visitFDiv does) cannot invalidate
// the iteration.
174 for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
176 MadeChange |= visit(*I);
// Register the pass (TargetMachine-aware variant) along with its
// DivergenceAnalysis dependency, and define the pass identifier.
183 INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
184 "AMDGPU IR optimizations", false, false)
185 INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
186 INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
187 "AMDGPU IR optimizations", false, false)
// Address of ID (not its value) uniquely identifies the pass.
189 char AMDGPUCodeGenPrepare::ID = 0;
// Factory used by the AMDGPU target to add this IR pass to its codegen
// pipeline; TM may be null (the pass then no-ops per runOnFunction).
191 FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
192 return new AMDGPUCodeGenPrepare(TM);