//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST;
  DivergenceAnalysis *DA;
  Module *Mod;
  bool HasUnsafeFPMath;

  /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
  /// binary operation \p V.
  ///
  /// \returns Binary operation \p V.
  Value *copyFlags(const BinaryOperator &I, Value *V) const;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
  /// truncating the result of 32 bit binary operation back to \p I's original
  /// type. Division operation is not promoted.
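  ///
  /// For example (an illustrative sketch of the rewrite described above, not
  /// output copied from this pass; value names are hypothetical), a uniform
  /// unsigned i16 'add' such as
  ///   %r = add i16 %a, %b
  /// is rewritten as
  ///   %ext0 = zext i16 %a to i32
  ///   %ext1 = zext i16 %b to i32
  ///   %r32 = add i32 %ext0, %ext1
  ///   %r = trunc i32 %r32 to i16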
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
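  ///
  /// For example (an illustrative sketch; value names are hypothetical), a
  /// uniform unsigned i16 compare such as
  ///   %c = icmp ult i16 %a, %b
  /// is rewritten as
  ///   %ext0 = zext i16 %a to i32
  ///   %ext1 = zext i16 %b to i32
  ///   %c = icmp ult i32 %ext0, %ext1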
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
  /// result of 32 bit 'select' operation back to \p I's original type.
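  ///
  /// For example (an illustrative sketch; value names are hypothetical), a
  /// uniform i16 'select' whose condition comes from an unsigned 'icmp', such
  /// as
  ///   %r = select i1 %cond, i16 %a, i16 %b
  /// is rewritten as
  ///   %ext1 = zext i16 %a to i32
  ///   %ext2 = zext i16 %b to i32
  ///   %r32 = select i1 %cond, i32 %ext1, i32 %ext2
  ///   %r = trunc i32 %r32 to i16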
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
  /// shift amount is 32 minus \p I's base element bit width), and truncating
  /// the result of the shift operation back to \p I's original type.
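  ///
  /// For example (an illustrative sketch; value names are hypothetical), a
  /// uniform i16 'bitreverse' such as
  ///   %r = call i16 @llvm.bitreverse.i16(i16 %x)
  /// is rewritten as
  ///   %ext = zext i16 %x to i32
  ///   %rev = call i32 @llvm.bitreverse.i32(i32 %ext)
  ///   %shr = lshr i32 %rev, 16
  ///   %r = trunc i32 %shr to i16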
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

public:
  static char ID;
  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID),
    TM(static_cast<const GCNTargetMachine *>(TM)),
    ST(nullptr),
    DA(nullptr),
    Mod(nullptr),
    HasUnsafeFPMath(false) { }

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // End anonymous namespace

Value *AMDGPUCodeGenPrepare::copyFlags(
    const BinaryOperator &I, Value *V) const {
  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
  if (!BinOp) // Possibly constant expression.
    return V;

  if (isa<OverflowingBinaryOperator>(BinOp)) {
    BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
    BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
  } else if (isa<PossiblyExactOperator>(BinOp))
    BinOp->setIsExact(I.isExact());

  return V;
}

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 &&
      T->getIntegerBitWidth() <= 16)
    return true;
  if (!T->isVectorTy())
    return false;
  return needsPromotionToI32(cast<VectorType>(T)->getElementType());
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
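//
// For example (an illustrative sketch; value names are hypothetical), an f32
// division carrying reduced-precision !fpmath metadata, such as
//   %d = fdiv float %x, %y, !fpmath !0
//   !0 = !{float 2.500000e+00}
// may be replaced with a call to the fast fdiv intrinsic:
//   %d = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)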
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  // TODO: Handle half.
  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                                      FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl
    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else { // Scalar FDiv.
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I); // Advance first: visit() may erase *I.
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}