1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file implements the transformation that optimizes memory intrinsics
11 // such as memcpy using the size value profile. When memory intrinsic size
12 // value profile metadata is available, a single memory intrinsic is expanded
13 // to a sequence of guarded specialized versions that are called with the
14 // hottest size(s), for later expansion into more optimal inline sequences.
16 //===----------------------------------------------------------------------===//
18 #include "llvm/ADT/ArrayRef.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/ADT/StringRef.h"
21 #include "llvm/ADT/Twine.h"
22 #include "llvm/Analysis/BlockFrequencyInfo.h"
23 #include "llvm/Analysis/GlobalsModRef.h"
24 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
25 #include "llvm/IR/BasicBlock.h"
26 #include "llvm/IR/CallSite.h"
27 #include "llvm/IR/DerivedTypes.h"
28 #include "llvm/IR/DomTreeUpdater.h"
29 #include "llvm/IR/Dominators.h"
30 #include "llvm/IR/Function.h"
31 #include "llvm/IR/IRBuilder.h"
32 #include "llvm/IR/InstVisitor.h"
33 #include "llvm/IR/InstrTypes.h"
34 #include "llvm/IR/Instruction.h"
35 #include "llvm/IR/Instructions.h"
36 #include "llvm/IR/LLVMContext.h"
37 #include "llvm/IR/PassManager.h"
38 #include "llvm/IR/Type.h"
39 #include "llvm/Pass.h"
40 #include "llvm/PassRegistry.h"
41 #include "llvm/PassSupport.h"
42 #include "llvm/ProfileData/InstrProf.h"
43 #include "llvm/Support/Casting.h"
44 #include "llvm/Support/CommandLine.h"
45 #include "llvm/Support/Debug.h"
46 #include "llvm/Support/ErrorHandling.h"
47 #include "llvm/Support/MathExtras.h"
48 #include "llvm/Transforms/Instrumentation.h"
49 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
50 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
57 #define DEBUG_TYPE "pgo-memop-opt"
59 STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
60 STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
62 // The minimum call count to optimize memory intrinsic calls.
63 static cl::opt<unsigned>
64 MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
66 cl::desc("The minimum count to optimize memory "
69 // Command line option to disable memory intrinsic optimization. The default is
70 // false. This is for debug purpose.
71 static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
72 cl::Hidden, cl::desc("Disable optimize"));
74 // The percent threshold to optimize memory intrinsic calls.
75 static cl::opt<unsigned>
76 MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
77 cl::Hidden, cl::ZeroOrMore,
78 cl::desc("The percentage threshold for the "
79 "memory intrinsic calls optimization"));
81 // Maximum number of versions for optimizing memory intrinsic call.
82 static cl::opt<unsigned>
83 MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
85 cl::desc("The max version for the optimized memory "
88 // Scale the counts from the annotation using the BB count value.
90 MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
91 cl::desc("Scale the memop size counts using the basic "
92 " block count value"));
94 // This option sets the rangge of precise profile memop sizes.
95 extern cl::opt<std::string> MemOPSizeRange;
97 // This option sets the value that groups large memop sizes
98 extern cl::opt<unsigned> MemOPSizeLarge;
101 class PGOMemOPSizeOptLegacyPass : public FunctionPass {
105 PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
106 initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
109 StringRef getPassName() const override { return "PGOMemOPSize"; }
112 bool runOnFunction(Function &F) override;
113 void getAnalysisUsage(AnalysisUsage &AU) const override {
114 AU.addRequired<BlockFrequencyInfoWrapperPass>();
115 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
116 AU.addPreserved<GlobalsAAWrapperPass>();
117 AU.addPreserved<DominatorTreeWrapperPass>();
120 } // end anonymous namespace
122 char PGOMemOPSizeOptLegacyPass::ID = 0;
123 INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
124 "Optimize memory intrinsic using its size value profile",
126 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
127 INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
128 "Optimize memory intrinsic using its size value profile",
131 FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
132 return new PGOMemOPSizeOptLegacyPass();
136 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
138 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
139 OptimizationRemarkEmitter &ORE, DominatorTree *DT)
140 : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) {
142 llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
143 // Get the MemOPSize range information from option MemOPSizeRange,
144 getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
147 bool isChanged() const { return Changed; }
152 for (auto &MI : WorkList) {
153 ++NumOfPGOMemOPAnnotate;
157 LLVM_DEBUG(dbgs() << "MemOP call: "
158 << MI->getCalledFunction()->getName()
159 << "is Transformed.\n");
164 void visitMemIntrinsic(MemIntrinsic &MI) {
165 Value *Length = MI.getLength();
166 // Not perform on constant length calls.
167 if (dyn_cast<ConstantInt>(Length))
169 WorkList.push_back(&MI);
174 BlockFrequencyInfo &BFI;
175 OptimizationRemarkEmitter &ORE;
178 std::vector<MemIntrinsic *> WorkList;
179 // Start of the previse range.
180 int64_t PreciseRangeStart;
181 // Last value of the previse range.
182 int64_t PreciseRangeLast;
183 // The space to read the profile annotation.
184 std::unique_ptr<InstrProfValueData[]> ValueDataArray;
185 bool perform(MemIntrinsic *MI);
187 // This kind shows which group the value falls in. For PreciseValue, we have
188 // the profile count for that value. LargeGroup groups the values that are in
189 // range [LargeValue, +inf). NonLargeGroup groups the rest of values.
190 enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
192 MemOPSizeKind getMemOPSizeKind(int64_t Value) const {
193 if (Value == MemOPSizeLarge && MemOPSizeLarge != 0)
195 if (Value == PreciseRangeLast + 1)
196 return NonLargeGroup;
201 static const char *getMIName(const MemIntrinsic *MI) {
202 switch (MI->getIntrinsicID()) {
203 case Intrinsic::memcpy:
205 case Intrinsic::memmove:
207 case Intrinsic::memset:
214 static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
215 assert(Count <= TotalCount);
216 if (Count < MemOPCountThreshold)
218 if (Count < TotalCount * MemOPPercentThreshold / 100)
223 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
225 if (!MemOPScaleCount)
228 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
229 return ScaleCount / Denom;
232 bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
234 if (MI->getIntrinsicID() == Intrinsic::memmove)
237 uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
239 if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
240 ValueDataArray.get(), NumVals, TotalCount))
243 uint64_t ActualCount = TotalCount;
244 uint64_t SavedTotalCount = TotalCount;
245 if (MemOPScaleCount) {
246 auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
249 ActualCount = *BBEdgeCount;
252 ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
253 LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
254 << ActualCount << "\n");
257 : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; });
259 if (ActualCount < MemOPCountThreshold)
261 // Skip if the total value profiled count is 0, in which case we can't
262 // scale up the counts properly (and there is no profitable transformation).
266 TotalCount = ActualCount;
268 LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
269 << " denominator = " << SavedTotalCount << "\n");
271 // Keeping track of the count of the default case:
272 uint64_t RemainCount = TotalCount;
273 uint64_t SavedRemainCount = SavedTotalCount;
274 SmallVector<uint64_t, 16> SizeIds;
275 SmallVector<uint64_t, 16> CaseCounts;
276 uint64_t MaxCount = 0;
277 unsigned Version = 0;
278 // Default case is in the front -- save the slot here.
279 CaseCounts.push_back(0);
280 for (auto &VD : VDs) {
281 int64_t V = VD.Value;
282 uint64_t C = VD.Count;
284 C = getScaledCount(C, ActualCount, SavedTotalCount);
286 // Only care precise value here.
287 if (getMemOPSizeKind(V) != PreciseValue)
290 // ValueCounts are sorted on the count. Break at the first un-profitable
292 if (!isProfitable(C, RemainCount))
295 SizeIds.push_back(V);
296 CaseCounts.push_back(C);
300 assert(RemainCount >= C);
302 assert(SavedRemainCount >= VD.Count);
303 SavedRemainCount -= VD.Count;
305 if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
312 CaseCounts[0] = RemainCount;
313 if (RemainCount > MaxCount)
314 MaxCount = RemainCount;
316 uint64_t SumForOpt = TotalCount - RemainCount;
318 LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
319 << " Versions (covering " << SumForOpt << " out of "
320 << TotalCount << ")\n");
333 // mem_op(..., size);
338 BasicBlock *BB = MI->getParent();
339 LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
340 LLVM_DEBUG(dbgs() << *BB << "\n");
341 auto OrigBBFreq = BFI.getBlockFreq(BB);
343 BasicBlock *DefaultBB = SplitBlock(BB, MI, DT);
344 BasicBlock::iterator It(*MI);
346 assert(It != DefaultBB->end());
347 BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
348 MergeBB->setName("MemOP.Merge");
349 BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
350 DefaultBB->setName("MemOP.Default");
352 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
353 auto &Ctx = Func.getContext();
355 BB->getTerminator()->eraseFromParent();
356 Value *SizeVar = MI->getLength();
357 SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
359 // Clear the value profile data.
360 MI->setMetadata(LLVMContext::MD_prof, nullptr);
361 // If all promoted, we don't need the MD.prof metadata.
362 if (SavedRemainCount > 0 || Version != NumVals)
363 // Otherwise we need update with the un-promoted records back.
364 annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version),
365 SavedRemainCount, IPVK_MemOPSize, NumVals);
367 LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
369 std::vector<DominatorTree::UpdateType> Updates;
371 Updates.reserve(2 * SizeIds.size());
373 for (uint64_t SizeId : SizeIds) {
374 BasicBlock *CaseBB = BasicBlock::Create(
375 Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
376 Instruction *NewInst = MI->clone();
378 MemIntrinsic * MemI = dyn_cast<MemIntrinsic>(NewInst);
379 IntegerType *SizeType = dyn_cast<IntegerType>(MemI->getLength()->getType());
380 assert(SizeType && "Expected integer type size argument.");
381 ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
382 MemI->setLength(CaseSizeId);
383 CaseBB->getInstList().push_back(NewInst);
384 IRBuilder<> IRBCase(CaseBB);
385 IRBCase.CreateBr(MergeBB);
386 SI->addCase(CaseSizeId, CaseBB);
388 Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
389 Updates.push_back({DominatorTree::Insert, BB, CaseBB});
391 LLVM_DEBUG(dbgs() << *CaseBB << "\n");
393 DTU.applyUpdates(Updates);
396 setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
398 LLVM_DEBUG(dbgs() << *BB << "\n");
399 LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
400 LLVM_DEBUG(dbgs() << *MergeBB << "\n");
404 return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MI)
405 << "optimized " << NV("Intrinsic", StringRef(getMIName(MI)))
406 << " with count " << NV("Count", SumForOpt) << " out of "
407 << NV("Total", TotalCount) << " for " << NV("Versions", Version)
415 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
416 OptimizationRemarkEmitter &ORE,
421 if (F.hasFnAttribute(Attribute::OptimizeForSize))
423 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT);
424 MemOPSizeOpt.perform();
425 return MemOPSizeOpt.isChanged();
428 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
429 BlockFrequencyInfo &BFI =
430 getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
431 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
432 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
433 DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
434 return PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
438 char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
440 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
441 FunctionAnalysisManager &FAM) {
442 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
443 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
444 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
445 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
447 return PreservedAnalyses::all();
448 auto PA = PreservedAnalyses();
449 PA.preserve<GlobalsAA>();
450 PA.preserve<DominatorTreeAnalysis>();