1 //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// This pass does misc. AMDGPU optimizations on IR before instruction
14 //===----------------------------------------------------------------------===//
17 #include "AMDGPUIntrinsicInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "AMDGPUTargetMachine.h"
21 #include "llvm/Analysis/DivergenceAnalysis.h"
22 #include "llvm/CodeGen/Passes.h"
23 #include "llvm/IR/InstVisitor.h"
24 #include "llvm/IR/IRBuilder.h"
25 #include "llvm/Support/Debug.h"
26 #include "llvm/Support/raw_ostream.h"
28 #define DEBUG_TYPE "amdgpu-codegenprepare"
// Legacy-PM FunctionPass performing misc. AMDGPU-specific IR optimizations
// before instruction selection. Uses InstVisitor<..., bool>: each visit
// method returns true iff it mutated the IR.
34 class AMDGPUCodeGenPrepare : public FunctionPass,
35 public InstVisitor<AMDGPUCodeGenPrepare, bool> {
// Target machine; may be null when the pass is default-constructed
// (runOnFunction checks for this and bails out).
36 const GCNTargetMachine *TM;
// Per-function subtarget, fetched in runOnFunction.
37 const SISubtarget *ST;
// Required analysis (see getAnalysisUsage); cached in runOnFunction.
38 DivergenceAnalysis *DA;
// TM defaults to nullptr so the pass can also be created by the generic
// pass machinery without a target machine.
44 AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
46 TM(static_cast<const GCNTargetMachine *>(TM)),
50 HasUnsafeFPMath(false) { }
// Rewrites eligible f32 fdiv into the fast-division intrinsic; returns
// true when the IR was changed.
52 bool visitFDiv(BinaryOperator &I);
// Fallback for all instruction kinds this pass does not handle.
54 bool visitInstruction(Instruction &I) {
58 bool doInitialization(Module &M) override;
59 bool runOnFunction(Function &F) override;
61 const char *getPassName() const override {
62 return "AMDGPU IR optimizations";
65 void getAnalysisUsage(AnalysisUsage &AU) const override {
66 AU.addRequired<DivergenceAnalysis>();
71 } // End anonymous namespace
// Returns true when an f32 fdiv with numerator Num should be kept as a plain
// fdiv rather than lowered to the fast intrinsic: under unsafe math, or when
// the numerator is exactly +1.0 (a pure reciprocal, which gets its own
// lowering).
73 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
74 const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
// NOTE(review): CNum is dereferenced below; an elided line between these two
// presumably handles the null (non-constant numerator) case — confirm.
78 // Reciprocal f32 is handled separately without denormals.
79 return UnsafeDiv || CNum->isExactlyValue(+1.0);
82 // Insert an intrinsic for fast fdiv for safe math situations where we can
83 // reduce precision. Leave fdiv for situations where the generic node is
84 // expected to be optimized.
85 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
86 Type *Ty = FDiv.getType();
// Only scalar or vector f32 divisions are candidates.
89 if (!Ty->getScalarType()->isFloatTy())
// !fpmath metadata carries the allowed error (in ULPs) for this division.
92 MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
96 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
// NOTE(review): ULP is presumably compared against the intrinsic's accuracy
// threshold on an elided line — confirm before relying on this path.
97 float ULP = FPOp->getFPAccuracy();
101 FastMathFlags FMF = FPOp->getFastMathFlags();
// "Unsafe" division: function-level unsafe-fp-math, or per-instruction
// fast-math flags that already permit reduced precision.
102 bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
103 FMF.allowReciprocal();
// With f32 denormals enabled, only proceed when unsafe math allows it.
104 if (ST->hasFP32Denormals() && !UnsafeDiv)
// Build the replacement right after the fdiv, propagating fast-math flags,
// !fpmath metadata, and the debug location.
107 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
108 Builder.setFastMathFlags(FMF);
109 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
111 const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
113 = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
115 Value *Num = FDiv.getOperand(0);
116 Value *Den = FDiv.getOperand(1);
118 Value *NewFDiv = nullptr;
// Vector case: the intrinsic is scalar, so extract/insert element-wise.
120 if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
121 NewFDiv = UndefValue::get(VT);
123 // FIXME: Doesn't do the right thing for cases where the vector is partially
124 // constant. This works when the scalarizer pass is run first.
125 for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
126 Value *NumEltI = Builder.CreateExtractElement(Num, I);
127 Value *DenEltI = Builder.CreateExtractElement(Den, I);
// Keep a plain fdiv for elements expected to optimize better generically
// (unsafe math, or exact +1.0 numerators).
130 if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
131 NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
133 NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
136 NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
// Scalar case: call the intrinsic unless the plain fdiv should be kept.
139 if (!shouldKeepFDivF32(Num, UnsafeDiv))
140 NewFDiv = Builder.CreateCall(Decl, { Num, Den });
// Swap in the replacement, preserve the original's name, and erase it.
144 FDiv.replaceAllUsesWith(NewFDiv);
145 NewFDiv->takeName(&FDiv);
146 FDiv.eraseFromParent();
// Returns true when the function carries the "unsafe-fp-math"="true"
// string attribute (set from the frontend / -enable-unsafe-fp-math).
152 static bool hasUnsafeFPMath(const Function &F) {
153 Attribute Attr = F.getFnAttribute("unsafe-fp-math");
154 return Attr.getValueAsString() == "true";
// Module-level setup hook of the legacy pass framework.
// NOTE(review): body not visible in this view — presumably caches the Module
// for use by visit methods; confirm against the full file.
157 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
162 bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
// No target machine (default-constructed pass) or function skipped by the
// pass manager (e.g. optnone/opt-bisect): do nothing.
163 if (!TM || skipFunction(F))
// Cache per-function state used by the visit methods.
166 ST = &TM->getSubtarget<SISubtarget>(F);
167 DA = &getAnalysis<DivergenceAnalysis>();
168 HasUnsafeFPMath = hasUnsafeFPMath(F);
170 bool MadeChange = false;
172 for (BasicBlock &BB : F) {
173 BasicBlock::iterator Next;
// Next is advanced (on an elided line) before visiting, so a visit method
// erasing the current instruction (as visitFDiv does) cannot invalidate
// the iteration.
174 for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
176 MadeChange |= visit(*I);
// Register the pass (TargetMachine-aware variant) along with its
// DivergenceAnalysis dependency, and define the pass identifier.
183 INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
184 "AMDGPU IR optimizations", false, false)
185 INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
186 INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
187 "AMDGPU IR optimizations", false, false)
// Address of ID (not its value) uniquely identifies the pass.
189 char AMDGPUCodeGenPrepare::ID = 0;
// Factory used by the AMDGPU target to add this IR pass to its codegen
// pipeline; TM may be null (the pass then no-ops per runOnFunction).
191 FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
192 return new AMDGPUCodeGenPrepare(TM);