//===-- PPCCTRLoops.cpp - Identify and generate CTR loops -----------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This pass identifies loops where we can generate the PPC branch instructions // that decrement and test the count register (CTR) (bdnz and friends). // // The pattern that defines the induction variable can changed depending on // prior optimizations. For example, the IndVarSimplify phase run by 'opt' // normalizes induction variables, and the Loop Strength Reduction pass // run by 'llc' may also make changes to the induction variable. // // Criteria for CTR loops: // - Countable loops (w/ ind. var for a trip count) // - Try inner-most loops first // - No nested CTR loops. // - No function calls in loops. // //===----------------------------------------------------------------------===// #include "PPC.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/ValueHandle.h" #include "llvm/PassSupport.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #ifndef NDEBUG #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #endif using namespace llvm; #define DEBUG_TYPE "ctrloops" #ifndef NDEBUG static cl::opt CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1)); #endif STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops"); namespace llvm { void initializePPCCTRLoopsPass(PassRegistry&); #ifndef NDEBUG void initializePPCCTRLoopsVerifyPass(PassRegistry&); #endif } namespace { struct PPCCTRLoops : public FunctionPass { #ifndef NDEBUG static int Counter; #endif public: static char ID; PPCCTRLoops() : FunctionPass(ID) { initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addPreserved(); AU.addRequired(); AU.addPreserved(); AU.addRequired(); } private: bool mightUseCTR(BasicBlock *BB); bool convertToCTRLoop(Loop *L); private: const PPCTargetMachine *TM; const PPCSubtarget *STI; const PPCTargetLowering *TLI; const DataLayout *DL; const TargetLibraryInfo *LibInfo; LoopInfo *LI; ScalarEvolution *SE; DominatorTree *DT; bool PreserveLCSSA; }; char PPCCTRLoops::ID = 0; #ifndef NDEBUG int PPCCTRLoops::Counter = 0; #endif #ifndef NDEBUG struct PPCCTRLoopsVerify : public MachineFunctionPass { public: static char ID; PPCCTRLoopsVerify() : MachineFunctionPass(ID) { initializePPCCTRLoopsVerifyPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } bool runOnMachineFunction(MachineFunction &MF) override; private: MachineDominatorTree *MDT; }; char PPCCTRLoopsVerify::ID = 0; #endif // NDEBUG } // end anonymous namespace INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", false, false) FunctionPass *llvm::createPPCCTRLoops() { return new PPCCTRLoops(); } #ifndef NDEBUG INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", "PowerPC CTR Loops Verify", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify", "PowerPC CTR Loops Verify", false, false) FunctionPass *llvm::createPPCCTRLoopsVerify() { return new PPCCTRLoopsVerify(); } #endif // NDEBUG bool PPCCTRLoops::runOnFunction(Function &F) { if (skipFunction(F)) return false; auto *TPC = getAnalysisIfAvailable(); if (!TPC) return false; TM = &TPC->getTM(); STI = TM->getSubtargetImpl(F); TLI = STI->getTargetLowering(); LI = &getAnalysis().getLoopInfo(); SE = &getAnalysis().getSE(); DT = &getAnalysis().getDomTree(); DL = &F.getParent()->getDataLayout(); auto *TLIP = getAnalysisIfAvailable(); LibInfo = TLIP ? &TLIP->getTLI() : nullptr; PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); bool MadeChange = false; for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) { Loop *L = *I; if (!L->getParentLoop()) MadeChange |= convertToCTRLoop(L); } return MadeChange; } static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) { if (IntegerType *ITy = dyn_cast(Ty)) return ITy->getBitWidth() > (Is32Bit ? 32U : 64U); return false; } // Determining the address of a TLS variable results in a function call in // certain TLS models. static bool memAddrUsesCTR(const PPCTargetMachine &TM, const Value *MemAddr) { const auto *GV = dyn_cast(MemAddr); if (!GV) { // Recurse to check for constants that refer to TLS global variables. if (const auto *CV = dyn_cast(MemAddr)) for (const auto &CO : CV->operands()) if (memAddrUsesCTR(TM, CO)) return true; return false; } if (!GV->isThreadLocal()) return false; TLSModel::Model Model = TM.getTLSModel(GV); return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic; } // Loop through the inline asm constraints and look for something that clobbers // ctr. static bool asmClobbersCTR(InlineAsm *IA) { InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints(); for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) { InlineAsm::ConstraintInfo &C = CIV[i]; if (C.Type != InlineAsm::isInput) for (unsigned j = 0, je = C.Codes.size(); j < je; ++j) if (StringRef(C.Codes[j]).equals_lower("{ctr}")) return true; } return false; } bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) { for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J) { if (CallInst *CI = dyn_cast(J)) { // Inline ASM is okay, unless it clobbers the ctr register. if (InlineAsm *IA = dyn_cast(CI->getCalledValue())) { if (asmClobbersCTR(IA)) return true; continue; } if (Function *F = CI->getCalledFunction()) { // Most intrinsics don't become function calls, but some might. // sin, cos, exp and log are always calls. unsigned Opcode = 0; if (F->getIntrinsicID() != Intrinsic::not_intrinsic) { switch (F->getIntrinsicID()) { default: continue; // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr // we're definitely using CTR. case Intrinsic::ppc_is_decremented_ctr_nonzero: case Intrinsic::ppc_mtctr: return true; // VisualStudio defines setjmp as _setjmp #if defined(_MSC_VER) && defined(setjmp) && \ !defined(setjmp_undefined_for_msvc) # pragma push_macro("setjmp") # undef setjmp # define setjmp_undefined_for_msvc #endif case Intrinsic::setjmp: #if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc) // let's return it to _setjmp state # pragma pop_macro("setjmp") # undef setjmp_undefined_for_msvc #endif case Intrinsic::longjmp: // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp // because, although it does clobber the counter register, the // control can't then return to inside the loop unless there is also // an eh_sjlj_setjmp. case Intrinsic::eh_sjlj_setjmp: case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset: case Intrinsic::powi: case Intrinsic::log: case Intrinsic::log2: case Intrinsic::log10: case Intrinsic::exp: case Intrinsic::exp2: case Intrinsic::pow: case Intrinsic::sin: case Intrinsic::cos: return true; case Intrinsic::copysign: if (CI->getArgOperand(0)->getType()->getScalarType()-> isPPC_FP128Ty()) return true; else continue; // ISD::FCOPYSIGN is never a library call. case Intrinsic::sqrt: Opcode = ISD::FSQRT; break; case Intrinsic::floor: Opcode = ISD::FFLOOR; break; case Intrinsic::ceil: Opcode = ISD::FCEIL; break; case Intrinsic::trunc: Opcode = ISD::FTRUNC; break; case Intrinsic::rint: Opcode = ISD::FRINT; break; case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break; case Intrinsic::round: Opcode = ISD::FROUND; break; case Intrinsic::minnum: Opcode = ISD::FMINNUM; break; case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break; case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break; case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break; } } // PowerPC does not use [US]DIVREM or other library calls for // operations on regular types which are not otherwise library calls // (i.e. soft float or atomics). If adapting for targets that do, // additional care is required here. LibFunc Func; if (!F->hasLocalLinkage() && F->hasName() && LibInfo && LibInfo->getLibFunc(F->getName(), Func) && LibInfo->hasOptimizedCodeGen(Func)) { // Non-read-only functions are never treated as intrinsics. if (!CI->onlyReadsMemory()) return true; // Conversion happens only for FP calls. if (!CI->getArgOperand(0)->getType()->isFloatingPointTy()) return true; switch (Func) { default: return true; case LibFunc_copysign: case LibFunc_copysignf: continue; // ISD::FCOPYSIGN is never a library call. case LibFunc_copysignl: return true; case LibFunc_fabs: case LibFunc_fabsf: case LibFunc_fabsl: continue; // ISD::FABS is never a library call. case LibFunc_sqrt: case LibFunc_sqrtf: case LibFunc_sqrtl: Opcode = ISD::FSQRT; break; case LibFunc_floor: case LibFunc_floorf: case LibFunc_floorl: Opcode = ISD::FFLOOR; break; case LibFunc_nearbyint: case LibFunc_nearbyintf: case LibFunc_nearbyintl: Opcode = ISD::FNEARBYINT; break; case LibFunc_ceil: case LibFunc_ceilf: case LibFunc_ceill: Opcode = ISD::FCEIL; break; case LibFunc_rint: case LibFunc_rintf: case LibFunc_rintl: Opcode = ISD::FRINT; break; case LibFunc_round: case LibFunc_roundf: case LibFunc_roundl: Opcode = ISD::FROUND; break; case LibFunc_trunc: case LibFunc_truncf: case LibFunc_truncl: Opcode = ISD::FTRUNC; break; case LibFunc_fmin: case LibFunc_fminf: case LibFunc_fminl: Opcode = ISD::FMINNUM; break; case LibFunc_fmax: case LibFunc_fmaxf: case LibFunc_fmaxl: Opcode = ISD::FMAXNUM; break; } } if (Opcode) { MVT VTy = TLI->getSimpleValueType( *DL, CI->getArgOperand(0)->getType(), true); if (VTy == MVT::Other) return true; if (TLI->isOperationLegalOrCustom(Opcode, VTy)) continue; else if (VTy.isVector() && TLI->isOperationLegalOrCustom(Opcode, VTy.getScalarType())) continue; return true; } } return true; } else if (isa(J) && J->getType()->getScalarType()->isPPC_FP128Ty()) { // Most operations on ppc_f128 values become calls. return true; } else if (isa(J) || isa(J) || isa(J) || isa(J)) { CastInst *CI = cast(J); if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() || CI->getDestTy()->getScalarType()->isPPC_FP128Ty() || isLargeIntegerTy(!TM->isPPC64(), CI->getSrcTy()->getScalarType()) || isLargeIntegerTy(!TM->isPPC64(), CI->getDestTy()->getScalarType())) return true; } else if (isLargeIntegerTy(!TM->isPPC64(), J->getType()->getScalarType()) && (J->getOpcode() == Instruction::UDiv || J->getOpcode() == Instruction::SDiv || J->getOpcode() == Instruction::URem || J->getOpcode() == Instruction::SRem)) { return true; } else if (!TM->isPPC64() && isLargeIntegerTy(false, J->getType()->getScalarType()) && (J->getOpcode() == Instruction::Shl || J->getOpcode() == Instruction::AShr || J->getOpcode() == Instruction::LShr)) { // Only on PPC32, for 128-bit integers (specifically not 64-bit // integers), these might be runtime calls. return true; } else if (isa(J) || isa(J)) { // On PowerPC, indirect jumps use the counter register. return true; } else if (SwitchInst *SI = dyn_cast(J)) { if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries()) return true; } if (STI->useSoftFloat()) { switch(J->getOpcode()) { case Instruction::FAdd: case Instruction::FSub: case Instruction::FMul: case Instruction::FDiv: case Instruction::FRem: case Instruction::FPTrunc: case Instruction::FPExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::UIToFP: case Instruction::SIToFP: case Instruction::FCmp: return true; } } for (Value *Operand : J->operands()) if (memAddrUsesCTR(*TM, Operand)) return true; } return false; } bool PPCCTRLoops::convertToCTRLoop(Loop *L) { bool MadeChange = false; // Process nested loops first. for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) { MadeChange |= convertToCTRLoop(*I); DEBUG(dbgs() << "Nested loop converted\n"); } // If a nested loop has been converted, then we can't convert this loop. if (MadeChange) return MadeChange; #ifndef NDEBUG // Stop trying after reaching the limit (if any). int Limit = CTRLoopLimit; if (Limit >= 0) { if (Counter >= CTRLoopLimit) return false; Counter++; } #endif // We don't want to spill/restore the counter register, and so we don't // want to use the counter register if the loop contains calls. for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); I != IE; ++I) if (mightUseCTR(*I)) return MadeChange; SmallVector ExitingBlocks; L->getExitingBlocks(ExitingBlocks); BasicBlock *CountedExitBlock = nullptr; const SCEV *ExitCount = nullptr; BranchInst *CountedExitBranch = nullptr; for (SmallVectorImpl::iterator I = ExitingBlocks.begin(), IE = ExitingBlocks.end(); I != IE; ++I) { const SCEV *EC = SE->getExitCount(L, *I); DEBUG(dbgs() << "Exit Count for " << *L << " from block " << (*I)->getName() << ": " << *EC << "\n"); if (isa(EC)) continue; if (const SCEVConstant *ConstEC = dyn_cast(EC)) { if (ConstEC->getValue()->isZero()) continue; } else if (!SE->isLoopInvariant(EC, L)) continue; if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 64 : 32)) continue; // We now have a loop-invariant count of loop iterations (which is not the // constant zero) for which we know that this loop will not exit via this // exisiting block. // We need to make sure that this block will run on every loop iteration. // For this to be true, we must dominate all blocks with backedges. Such // blocks are in-loop predecessors to the header block. bool NotAlways = false; for (pred_iterator PI = pred_begin(L->getHeader()), PIE = pred_end(L->getHeader()); PI != PIE; ++PI) { if (!L->contains(*PI)) continue; if (!DT->dominates(*I, *PI)) { NotAlways = true; break; } } if (NotAlways) continue; // Make sure this blocks ends with a conditional branch. Instruction *TI = (*I)->getTerminator(); if (!TI) continue; if (BranchInst *BI = dyn_cast(TI)) { if (!BI->isConditional()) continue; CountedExitBranch = BI; } else continue; // Note that this block may not be the loop latch block, even if the loop // has a latch block. CountedExitBlock = *I; ExitCount = EC; break; } if (!CountedExitBlock) return MadeChange; BasicBlock *Preheader = L->getLoopPreheader(); // If we don't have a preheader, then insert one. If we already have a // preheader, then we can use it (except if the preheader contains a use of // the CTR register because some such uses might be reordered by the // selection DAG after the mtctr instruction). if (!Preheader || mightUseCTR(Preheader)) Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); if (!Preheader) return MadeChange; DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName() << "\n"); // Insert the count into the preheader and replace the condition used by the // selected branch. MadeChange = true; SCEVExpander SCEVE(*SE, *DL, "loopcnt"); LLVMContext &C = SE->getContext(); Type *CountType = TM->isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C); if (!ExitCount->getType()->isPointerTy() && ExitCount->getType() != CountType) ExitCount = SE->getZeroExtendExpr(ExitCount, CountType); ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType)); Value *ECValue = SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator()); IRBuilder<> CountBuilder(Preheader->getTerminator()); Module *M = Preheader->getParent()->getParent(); Value *MTCTRFunc = Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr, CountType); CountBuilder.CreateCall(MTCTRFunc, ECValue); IRBuilder<> CondBuilder(CountedExitBranch); Value *DecFunc = Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero); Value *NewCond = CondBuilder.CreateCall(DecFunc, {}); Value *OldCond = CountedExitBranch->getCondition(); CountedExitBranch->setCondition(NewCond); // The false branch must exit the loop. if (!L->contains(CountedExitBranch->getSuccessor(0))) CountedExitBranch->swapSuccessors(); // The old condition may be dead now, and may have even created a dead PHI // (the original induction variable). RecursivelyDeleteTriviallyDeadInstructions(OldCond); DeleteDeadPHIs(CountedExitBlock); ++NumCTRLoops; return MadeChange; } #ifndef NDEBUG static bool clobbersCTR(const MachineInstr &MI) { for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI.getOperand(i); if (MO.isReg()) { if (MO.isDef() && (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8)) return true; } else if (MO.isRegMask()) { if (MO.clobbersPhysReg(PPC::CTR) || MO.clobbersPhysReg(PPC::CTR8)) return true; } } return false; } static bool verifyCTRBranch(MachineBasicBlock *MBB, MachineBasicBlock::iterator I) { MachineBasicBlock::iterator BI = I; SmallSet Visited; SmallVector Preds; bool CheckPreds; if (I == MBB->begin()) { Visited.insert(MBB); goto queue_preds; } else --I; check_block: Visited.insert(MBB); if (I == MBB->end()) goto queue_preds; CheckPreds = true; for (MachineBasicBlock::iterator IE = MBB->begin();; --I) { unsigned Opc = I->getOpcode(); if (Opc == PPC::MTCTRloop || Opc == PPC::MTCTR8loop) { CheckPreds = false; break; } if (I != BI && clobbersCTR(*I)) { DEBUG(dbgs() << "BB#" << MBB->getNumber() << " (" << MBB->getFullName() << ") instruction " << *I << " clobbers CTR, invalidating " << "BB#" << BI->getParent()->getNumber() << " (" << BI->getParent()->getFullName() << ") instruction " << *BI << "\n"); return false; } if (I == IE) break; } if (!CheckPreds && Preds.empty()) return true; if (CheckPreds) { queue_preds: if (MachineFunction::iterator(MBB) == MBB->getParent()->begin()) { DEBUG(dbgs() << "Unable to find a MTCTR instruction for BB#" << BI->getParent()->getNumber() << " (" << BI->getParent()->getFullName() << ") instruction " << *BI << "\n"); return false; } for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), PIE = MBB->pred_end(); PI != PIE; ++PI) Preds.push_back(*PI); } do { MBB = Preds.pop_back_val(); if (!Visited.count(MBB)) { I = MBB->getLastNonDebugInstr(); goto check_block; } } while (!Preds.empty()); return true; } bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) { MDT = &getAnalysis(); // Verify that all bdnz/bdz instructions are dominated by a loop mtctr before // any other instructions that might clobber the ctr register. for (MachineFunction::iterator I = MF.begin(), IE = MF.end(); I != IE; ++I) { MachineBasicBlock *MBB = &*I; if (!MDT->isReachableFromEntry(MBB)) continue; for (MachineBasicBlock::iterator MII = MBB->getFirstTerminator(), MIIE = MBB->end(); MII != MIIE; ++MII) { unsigned Opc = MII->getOpcode(); if (Opc == PPC::BDNZ8 || Opc == PPC::BDNZ || Opc == PPC::BDZ8 || Opc == PPC::BDZ) if (!verifyCTRBranch(MBB, MII)) llvm_unreachable("Invalid PPC CTR loop!"); } } return false; } #endif // NDEBUG