//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST;
  DivergenceAnalysis *DA;
  Module *Mod;
  bool HasUnsafeFPMath;

  /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
  /// binary operation \p V.
  ///
  /// \returns Binary operation \p V.
  Value *copyFlags(const BinaryOperator &I, Value *V) const;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// \brief Promotes uniform binary operation \p I to an equivalent 32 bit
  /// binary operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with an equivalent 32 bit binary operation, and
  /// truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// \brief Promotes uniform 'icmp' operation \p I to a 32 bit 'icmp'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 'select' operation \p I to a 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
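  ///
  /// For example, a uniform 'select' on i16 operands becomes a 'select' on
  /// the operands extended to i32, followed by a 'trunc' of the result back
  /// to i16.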
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to the 32 bit
  /// 'bitreverse' intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting
  /// the result of the 32 bit 'bitreverse' intrinsic to the right with zero
  /// fill (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original
  /// type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

public:
  static char ID;
  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID),
    TM(static_cast<const GCNTargetMachine *>(TM)),
    ST(nullptr),
    DA(nullptr),
    Mod(nullptr),
    HasUnsafeFPMath(false) { }

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // End anonymous namespace

Value *AMDGPUCodeGenPrepare::copyFlags(
    const BinaryOperator &I, Value *V) const {
  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
  if (!BinOp) // Possibly constant expression.
    return V;

  if (isa<OverflowingBinaryOperator>(BinOp)) {
    BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
    BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
  } else if (isa<PossiblyExactOperator>(BinOp))
    BinOp->setIsExact(I.isExact());

  return V;
}

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(),
                         cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 &&
      T->getIntegerBitWidth() <= 16)
    return true;
  if (!T->isVectorTy())
    return false;
  return needsPromotionToI32(cast<VectorType>(T)->getElementType());
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes = Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
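// Rough sketch of the bitreverse promotion above (value names are
// illustrative): for a uniform i16 operand,
//
//   %r = call i16 @llvm.bitreverse.i16(i16 %x)
//
// is rewritten as
//
//   %ext   = zext i16 %x to i32
//   %rev   = call i32 @llvm.bitreverse.i32(i32 %ext)
//   %shift = lshr i32 %rev, 16       ; 32 minus the base element bit width
//   %r     = trunc i32 %shift to i16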
static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl =
      II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return true;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
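  // Note: the per-block loop below caches the next iterator before visiting
  // each instruction, since the visit callbacks (e.g. visitBinaryOperator via
  // promoteUniformOpToI32, or visitFDiv) may erase the current instruction.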
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}
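// Illustrative usage sketch (assumed wiring, not shown in this file): the
// AMDGPU pass configuration would create this pass while building its IR
// pipeline, roughly:
//
//   addPass(createAMDGPUCodeGenPreparePass(
//       static_cast<const GCNTargetMachine *>(&TM)));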