//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-codegenprepare"

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and
  /// truncating the result of 32 bit binary operation back to \p I's original
  /// type. Division operation is not promoted.
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;
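  //
  // As an added illustration (not part of the original documentation), a
  // uniform 16 bit add such as
  //   %r = add i16 %a, %b
  // is rewritten roughly as
  //   %a32 = zext i16 %a to i32
  //   %b32 = zext i16 %b to i32
  //   %r32 = add i32 %a32, %b32
  //   %r   = trunc i32 %r32 to i16
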
  /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, and replacing \p I with 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by sign or zero extending operands to
  /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the
  /// result of 32 bit 'select' operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal 16. Promotion is done by zero extending the operand to 32
  /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
  /// shift amount is 32 minus \p I's base element bit width), and truncating
  /// the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
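  //
  // Added illustration: for a uniform i16 operand %x this becomes roughly
  //   %x32 = zext i16 %x to i32
  //   %r32 = call i32 @llvm.bitreverse.i32(i32 %x32)
  //   %s   = lshr i32 %r32, 16   ; 32 minus the 16 bit base element width
  //   %r   = trunc i32 %s to i16
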
public:
  static char ID;

  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID), TM(static_cast<const GCNTargetMachine *>(TM)) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}
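
// Added note on the two helpers below: because the promoted operands come from
// types of at most 16 bits, an add, sub, or shl of the extended values cannot
// overflow the signed 32 bit range, and a zero extended 16 x 16 bit multiply
// cannot overflow the unsigned 32 bit range. The remaining combinations (nsw
// on mul, nuw on sub) are only safe when the original instruction already
// carried the no-unsigned-wrap flag.
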
// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
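//
// Added note: the !fpmath metadata read below gives the error, in ULPs, that
// the frontend allows for this division (OpenCL, for example, permits 2.5 ULP
// for single precision divide), so the fast expansion is only emitted when at
// least that much error is acceptable.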
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                                      FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}
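
// Added note: the "unsafe-fp-math" string attribute is the per-function
// counterpart of the global unsafe FP math setting (e.g. -ffast-math /
// -enable-unsafe-fp-math), complementing the per-instruction fast-math flags
// checked in visitFDiv.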

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      // Grab the next iterator first; visiting may erase the current
      // instruction.
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}