//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

static cl::opt<bool> WidenLoads(
  "amdgpu-codegenprepare-widen-constant-loads",
  cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
  cl::ReallyHidden,
  cl::init(true));

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNSubtarget *ST = nullptr;
  AssumptionCache *AC = nullptr;
  LegacyDivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  bool HasUnsafeFPMath = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with equivalent 32 bit binary operation, and
  /// truncating the result of 32 bit binary operation back to \p I's original
  /// type. Division operation is not promoted.
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
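  ///
  /// Illustrative example (made-up IR, not from a test): a uniform
  ///   %r = add i16 %a, %b
  /// would be rewritten along the lines of
  ///   %ext0 = zext i16 %a to i32
  ///   %ext1 = zext i16 %b to i32
  ///   %res  = add nuw nsw i32 %ext0, %ext1
  ///   %r    = trunc i32 %res to i16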
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with 32 bit 'select' operation, and truncating
  /// the result of 32 bit 'select' operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
  /// shift amount is 32 minus \p I's base element bit width), and truncating
  /// the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
  unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
  bool isI24(Value *V, unsigned ScalarSize) const;
  bool isU24(Value *V, unsigned ScalarSize) const;

  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24
  /// when both operands are known to fit in 24 bits; matching this on the IR
  /// level catches cases that SelectionDAG misses.
  bool replaceMulWithMul24(BinaryOperator &I) const;

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  /// Widen a scalar load.
  ///
  /// \details Widens a uniform, sub-dword load from constant memory to a full
  /// 32 bits and then truncates the result, so that a scalar load can be used
  /// instead of a vector load.
  ///
  /// \returns True if the load can be widened.
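  ///
  /// Illustrative example (made-up IR, not from a test): a uniform
  ///   %v = load i8, i8 addrspace(4)* %p, align 4
  /// would become roughly
  ///   %w = load i32, i32 addrspace(4)* %p.cast, align 4
  ///   %v = trunc i32 %w to i8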
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<LegacyDivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}
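// Note on the two predicates above (informal sketch, not a proof): the
// promoted operands are zero or sign extended from at most 16 bits, so an i16
// add or shl evaluated in i32 cannot wrap in either sense, and an i16 sub
// cannot signed-wrap in i32. The remaining flags (nuw for sub, nsw for mul)
// are only safe when the original operation already carried nuw.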
bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
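// Illustrative example for the bitreverse promotion above (made-up IR, not
// from a test): a uniform
//   %r = call i16 @llvm.bitreverse.i16(i16 %a)
// becomes roughly
//   %ext = zext i16 %a to i32
//   %rev = call i32 @llvm.bitreverse.i32(i32 %ext)
//   %shr = lshr i32 %rev, 16
//   %r   = trunc i32 %shr to i16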
unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
                                               unsigned ScalarSize) const {
  KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
  return ScalarSize - Known.countMinLeadingZeros();
}

unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
                                             unsigned ScalarSize) const {
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
  return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
}

bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
  return ScalarSize >= 24 && // Types less than 24-bit should be treated
                             // as unsigned 24-bit values.
    numBitsSigned(V, ScalarSize) < 24;
}

bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
  return numBitsUnsigned(V, ScalarSize) <= 24;
}

static void extractValues(IRBuilder<> &Builder,
                          SmallVectorImpl<Value *> &Values, Value *V) {
  VectorType *VT = dyn_cast<VectorType>(V->getType());
  if (!VT) {
    Values.push_back(V);
    return;
  }

  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
    Values.push_back(Builder.CreateExtractElement(V, I));
}

static Value *insertValues(IRBuilder<> &Builder,
                           Type *Ty,
                           SmallVectorImpl<Value *> &Values) {
  if (Values.size() == 1)
    return Values[0];

  Value *NewVal = UndefValue::get(Ty);
  for (int I = 0, E = Values.size(); I != E; ++I)
    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);

  return NewVal;
}

bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  unsigned Size = Ty->getScalarSizeInBits();
  if (Size <= 16 && ST->has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32
  if (DA->isUniform(&I))
    return false;

  Value *LHS = I.getOperand(0);
  Value *RHS = I.getOperand(1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;

  // TODO: Should this try to match mulhi24?
  if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
    IntrID = Intrinsic::amdgcn_mul_u24;
  } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
    IntrID = Intrinsic::amdgcn_mul_i24;
  } else
    return false;

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, LHSVals, LHS);
  extractValues(Builder, RHSVals, RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
  FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS, *RHS;
    if (IntrID == Intrinsic::amdgcn_mul_u24) {
      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
    } else {
      LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
      RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
    }

    Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});

    if (IntrID == Intrinsic::amdgcn_mul_u24) {
      ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
                                                     LHSVals[I]->getType()));
    } else {
      ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
                                                     LHSVals[I]->getType()));
    }
  }

  I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals));
  I.eraseFromParent();

  return true;
}
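// Illustrative example for replaceMulWithMul24 (made-up IR, not from a test):
// for a divergent multiply whose operands are known to fit in 24 bits, e.g.
//   %a = and i32 %x, 65535
//   %m = mul i32 %a, 100
// the mul is rewritten as
//   %m = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 100)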
static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return HasDenormals;

  if (UnsafeDiv)
    return true;

  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);

  // Reciprocal f32 is handled separately without denormals.
  return HasDenormals ^ IsOne;
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() || FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  bool HasDenormals = ST->hasFP32Denormals();
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

static std::pair<Value *, Value *> getMul64(IRBuilder<> &Builder,
                                            Value *LHS, Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::make_pair(Lo, Hi);
}

static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
  return getMul64(Builder, LHS, RHS).second;
}
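// Quick sanity example for getMul64/getMulHu (informal): for LHS = 0xFFFFFFFF
// and RHS = 2 the full 64-bit product is 0x1FFFFFFFE, so getMul64 returns
// Lo = 0xFFFFFFFE and Hi = 0x1, and getMulHu returns just the Hi half.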
// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den,
                                            bool IsDiv, bool IsSigned) const {
  assert(Num->getType()->isIntegerTy(32));

  const DataLayout &DL = Mod->getDataLayout();
  unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
  if (LHSSignBits < 9)
    return nullptr;

  unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
  if (RHSSignBits < 9)
    return nullptr;

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = 32 - SignBits;
  if (IsSigned)
    ++DivBits;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib = (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB, F32Ty)
                       : Builder.CreateUIToFP(IB, F32Ty);

  Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  // Truncate to number of bits this divide really is.
  if (IsSigned) {
    Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
    Res = Builder.CreateSExt(Res, Ty);
  } else {
    ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
    Res = Builder.CreateAnd(Res, TruncMask);
  }

  return Res;
}
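// Applicability note (informal): the >= 9 sign-bit checks above mean both
// operands effectively fit in 24 bits, which an f32 can represent exactly.
// For example, a 16-bit unsigned divide that has been zero-extended to i32
// always qualifies, since its operands then have at least 16 known sign bits.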
Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
                                            BinaryOperator &I,
                                            Value *Num, Value *Den) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (isa<Constant>(Den))
    return nullptr; // Keep it for optimization

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = Num->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  if (Ty->getScalarSizeInBits() < 32) {
    if (IsSigned) {
      Num = Builder.CreateSExt(Num, I32Ty);
      Den = Builder.CreateSExt(Den, I32Ty);
    } else {
      Num = Builder.CreateZExt(Num, I32Ty);
      Den = Builder.CreateZExt(Den, I32Ty);
    }
  }

  if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
    Res = Builder.CreateTrunc(Res, Ty);
    return Res;
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);
  ConstantInt *MinusOne = Builder.getInt32(~0);

  Value *Sign = nullptr;
  if (IsSigned) {
    ConstantInt *K31 = Builder.getInt32(31);
    Value *LHSign = Builder.CreateAShr(Num, K31);
    Value *RHSign = Builder.CreateAShr(Den, K31);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;

    Num = Builder.CreateAdd(Num, LHSign);
    Den = Builder.CreateAdd(Den, RHSign);

    Num = Builder.CreateXor(Num, LHSign);
    Den = Builder.CreateXor(Den, RHSign);
  }

  // RCP = URECIP(Den) = 2^32 / Den + e
  // e is rounding error.
  Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
  Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
  Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
  Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
  Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);

  // RCP_LO, RCP_HI = mul(RCP, Den)
  Value *RCP_LO, *RCP_HI;
  std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);

  // NEG_RCP_LO = -RCP_LO
  Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);

  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
  Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
  Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);

  // Calculate the rounding error from the URECIP instruction
  // E = mulhu(ABS_RCP_LO, RCP)
  Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);

  // RCP_A_E = RCP + E
  Value *RCP_A_E = Builder.CreateAdd(RCP, E);

  // RCP_S_E = RCP - E
  Value *RCP_S_E = Builder.CreateSub(RCP, E);

  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
  Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);

  // Quotient = mulhu(Tmp0, Num)
  Value *Quotient = getMulHu(Builder, Tmp0, Num);

  // Num_S_Remainder = Quotient * Den
  Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);

  // Remainder = Num - Num_S_Remainder
  Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);

  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
  Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
  Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);

  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
  Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
  Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
                                                  MinusOne, Zero);
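  // The quotient estimate above can be off by one in either direction
  // (informal reasoning, matching the selects below): one is added back when
  // the computed remainder is still >= Den, and one is subtracted (or Den is
  // added back, for the rem case) when Num < Num_S_Remainder, i.e. when the
  // remainder went negative.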
  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
  Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
  Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);

  Value *Res;
  if (IsDiv) {
    // Quotient_A_One = Quotient + 1
    Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);

    // Quotient_S_One = Quotient - 1
    Value *Quotient_S_One = Builder.CreateSub(Quotient, One);

    // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
    Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);

    // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
  } else {
    // Remainder_S_Den = Remainder - Den
    Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);

    // Remainder_A_Den = Remainder + Den
    Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);

    // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
    Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);

    // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
    Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
  }

  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
  }

  Res = Builder.CreateTrunc(Res, Ty);

  return Res;
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I) && promoteUniformOpToI32(I))
    return true;

  if (replaceMulWithMul24(I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      Ty->getScalarSizeInBits() <= 32) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
      NewDiv = UndefValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);
        Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
        if (!NewElt)
          NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      NewDiv = expandDivRem32(Builder, I, Num, Den);
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();
      Changed = true;
    }
  }

  return Changed;
}
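// Illustrative note for visitBinaryOperator above (informal, made-up IR): a
//   %r = udiv i16 %a, %b
// has its operands zero-extended to i32, is handled by expandDivRem24 (both
// extended operands have at least 16 known sign bits), and the result is
// truncated back to i16.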
bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
        mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->getValue().isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  ST = &TM.getSubtarget<GCNSubtarget>(F);
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}
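// Usage note (informal sketch): the transforms above can be exercised in
// isolation with the legacy pass manager, e.g. roughly
//   opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare input.ll
// and constant-load widening can be toggled with
//   -amdgpu-codegenprepare-widen-constant-loads=<0|1>
// In the normal codegen pipeline the pass is added via
// createAMDGPUCodeGenPreparePass().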