1 //===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
// The machine combiner pass uses machine trace metrics to ensure that the
// combined instructions do not lengthen the critical path or the resource
// depth.
12 //===----------------------------------------------------------------------===//
14 #define DEBUG_TYPE "machine-combiner"
16 #include "llvm/ADT/DenseMap.h"
17 #include "llvm/ADT/Statistic.h"
18 #include "llvm/CodeGen/MachineDominators.h"
19 #include "llvm/CodeGen/MachineFunction.h"
20 #include "llvm/CodeGen/MachineFunctionPass.h"
21 #include "llvm/CodeGen/MachineInstrBuilder.h"
22 #include "llvm/CodeGen/MachineLoopInfo.h"
23 #include "llvm/CodeGen/MachineRegisterInfo.h"
24 #include "llvm/CodeGen/MachineTraceMetrics.h"
25 #include "llvm/CodeGen/Passes.h"
26 #include "llvm/CodeGen/TargetSchedule.h"
27 #include "llvm/Support/Debug.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include "llvm/Target/TargetInstrInfo.h"
30 #include "llvm/Target/TargetRegisterInfo.h"
31 #include "llvm/Target/TargetSubtargetInfo.h"
// Pass-wide counter, bumped once per successful substitution in
// combineInstructions(); reported by -stats.
STATISTIC(NumInstCombined, "Number of machineinst combined");
38 class MachineCombiner : public MachineFunctionPass {
39 const TargetInstrInfo *TII;
40 const TargetRegisterInfo *TRI;
41 MCSchedModel SchedModel;
42 MachineRegisterInfo *MRI;
43 MachineLoopInfo *MLI; // Current MachineLoopInfo
44 MachineTraceMetrics *Traces;
45 MachineTraceMetrics::Ensemble *MinInstr;
47 TargetSchedModel TSchedModel;
49 /// True if optimizing for code size.
54 MachineCombiner() : MachineFunctionPass(ID) {
55 initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
57 void getAnalysisUsage(AnalysisUsage &AU) const override;
58 bool runOnMachineFunction(MachineFunction &MF) override;
59 StringRef getPassName() const override { return "Machine InstCombiner"; }
62 bool doSubstitute(unsigned NewSize, unsigned OldSize);
63 bool combineInstructions(MachineBasicBlock *);
64 MachineInstr *getOperandDef(const MachineOperand &MO);
65 unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
66 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
67 MachineTraceMetrics::Trace BlockTrace);
68 unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
69 MachineTraceMetrics::Trace BlockTrace);
71 improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
72 MachineTraceMetrics::Trace BlockTrace,
73 SmallVectorImpl<MachineInstr *> &InsInstrs,
74 SmallVectorImpl<MachineInstr *> &DelInstrs,
75 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
76 MachineCombinerPattern Pattern);
77 bool preservesResourceLen(MachineBasicBlock *MBB,
78 MachineTraceMetrics::Trace BlockTrace,
79 SmallVectorImpl<MachineInstr *> &InsInstrs,
80 SmallVectorImpl<MachineInstr *> &DelInstrs);
81 void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
82 SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
// Pass identification: the *address* of ID is what uniquely identifies the
// pass to the pass manager; the value itself is unused.
char MachineCombiner::ID = 0;
// Public handle so code outside this file can refer to the pass by ID.
char &llvm::MachineCombinerID = MachineCombiner::ID;
89 INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
90 "Machine InstCombiner", false, false)
91 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
92 INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
93 INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
96 void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
98 AU.addPreserved<MachineDominatorTree>();
99 AU.addRequired<MachineLoopInfo>();
100 AU.addPreserved<MachineLoopInfo>();
101 AU.addRequired<MachineTraceMetrics>();
102 AU.addPreserved<MachineTraceMetrics>();
103 MachineFunctionPass::getAnalysisUsage(AU);
106 MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
107 MachineInstr *DefInstr = nullptr;
108 // We need a virtual register definition.
109 if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
110 DefInstr = MRI->getUniqueVRegDef(MO.getReg());
111 // PHI's have no depth etc.
112 if (DefInstr && DefInstr->isPHI())
117 /// Computes depth of instructions in vector \InsInstr.
119 /// \param InsInstrs is a vector of machine instructions
120 /// \param InstrIdxForVirtReg is a dense map of virtual register to index
121 /// of defining machine instruction in \p InsInstrs
122 /// \param BlockTrace is a trace of machine instructions
124 /// \returns Depth of last instruction in \InsInstrs ("NewRoot")
126 MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
127 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
128 MachineTraceMetrics::Trace BlockTrace) {
129 SmallVector<unsigned, 16> InstrDepth;
130 assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
131 "Missing machine model\n");
133 // For each instruction in the new sequence compute the depth based on the
134 // operands. Use the trace information when possible. For new operands which
135 // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
136 for (auto *InstrPtr : InsInstrs) { // for each Use
138 DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(TII); dbgs() << "\n";);
139 for (const MachineOperand &MO : InstrPtr->operands()) {
140 // Check for virtual register operand.
141 if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
145 unsigned DepthOp = 0;
146 unsigned LatencyOp = 0;
147 DenseMap<unsigned, unsigned>::iterator II =
148 InstrIdxForVirtReg.find(MO.getReg());
149 if (II != InstrIdxForVirtReg.end()) {
150 // Operand is new virtual register not in trace
151 assert(II->second < InstrDepth.size() && "Bad Index");
152 MachineInstr *DefInstr = InsInstrs[II->second];
154 "There must be a definition for a new virtual register");
155 DepthOp = InstrDepth[II->second];
156 LatencyOp = TSchedModel.computeOperandLatency(
157 DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
158 InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
160 MachineInstr *DefInstr = getOperandDef(MO);
162 DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
163 LatencyOp = TSchedModel.computeOperandLatency(
164 DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
165 InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
168 IDepth = std::max(IDepth, DepthOp + LatencyOp);
170 InstrDepth.push_back(IDepth);
172 unsigned NewRootIdx = InsInstrs.size() - 1;
173 return InstrDepth[NewRootIdx];
176 /// Computes instruction latency as max of latency of defined operands.
178 /// \param Root is a machine instruction that could be replaced by NewRoot.
179 /// It is used to compute a more accurate latency information for NewRoot in
180 /// case there is a dependent instruction in the same trace (\p BlockTrace)
181 /// \param NewRoot is the instruction for which the latency is computed
182 /// \param BlockTrace is a trace of machine instructions
184 /// \returns Latency of \p NewRoot
185 unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
186 MachineTraceMetrics::Trace BlockTrace) {
187 assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
188 "Missing machine model\n");
190 // Check each definition in NewRoot and compute the latency
191 unsigned NewRootLatency = 0;
193 for (const MachineOperand &MO : NewRoot->operands()) {
194 // Check for virtual register operand.
195 if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
199 // Get the first instruction that uses MO
200 MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
202 MachineInstr *UseMO = RI->getParent();
203 unsigned LatencyOp = 0;
204 if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
205 LatencyOp = TSchedModel.computeOperandLatency(
206 NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
207 UseMO->findRegisterUseOperandIdx(MO.getReg()));
209 LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
211 NewRootLatency = std::max(NewRootLatency, LatencyOp);
213 return NewRootLatency;
/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth, // The data dependency chain must be improved.
  Default          // The critical path must not be lengthened.
};
223 static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
224 // TODO: If C++ ever gets a real enum class, make this part of the
225 // MachineCombinerPattern class.
227 case MachineCombinerPattern::REASSOC_AX_BY:
228 case MachineCombinerPattern::REASSOC_AX_YB:
229 case MachineCombinerPattern::REASSOC_XA_BY:
230 case MachineCombinerPattern::REASSOC_XA_YB:
231 return CombinerObjective::MustReduceDepth;
233 return CombinerObjective::Default;
237 /// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
238 /// The new code sequence ends in MI NewRoot. A necessary condition for the new
239 /// sequence to replace the old sequence is that it cannot lengthen the critical
240 /// path. The definition of "improve" may be restricted by specifying that the
241 /// new path improves the data dependency chain (MustReduceDepth).
242 bool MachineCombiner::improvesCriticalPathLen(
243 MachineBasicBlock *MBB, MachineInstr *Root,
244 MachineTraceMetrics::Trace BlockTrace,
245 SmallVectorImpl<MachineInstr *> &InsInstrs,
246 SmallVectorImpl<MachineInstr *> &DelInstrs,
247 DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
248 MachineCombinerPattern Pattern) {
249 assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
250 "Missing machine model\n");
251 // NewRoot is the last instruction in the \p InsInstrs vector.
252 unsigned NewRootIdx = InsInstrs.size() - 1;
253 MachineInstr *NewRoot = InsInstrs[NewRootIdx];
255 // Get depth and latency of NewRoot and Root.
256 unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
257 unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
259 DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
260 dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
261 dbgs() << " RootDepth: " << RootDepth << "\n");
263 // For a transform such as reassociation, the cost equation is
264 // conservatively calculated so that we must improve the depth (data
265 // dependency cycles) in the critical path to proceed with the transform.
266 // Being conservative also protects against inaccuracies in the underlying
267 // machine trace metrics and CPU models.
268 if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth)
269 return NewRootDepth < RootDepth;
271 // A more flexible cost calculation for the critical path includes the slack
272 // of the original code sequence. This may allow the transform to proceed
273 // even if the instruction depths (data dependency cycles) become worse.
275 unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
276 unsigned RootLatency = 0;
278 for (auto I : DelInstrs)
279 RootLatency += TSchedModel.computeInstrLatency(I);
281 unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
283 DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
284 dbgs() << " RootLatency: " << RootLatency << "\n";
285 dbgs() << " RootSlack: " << RootSlack << "\n";
286 dbgs() << " NewRootDepth + NewRootLatency = "
287 << NewRootDepth + NewRootLatency << "\n";
288 dbgs() << " RootDepth + RootLatency + RootSlack = "
289 << RootDepth + RootLatency + RootSlack << "\n";);
291 unsigned NewCycleCount = NewRootDepth + NewRootLatency;
292 unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
294 return NewCycleCount <= OldCycleCount;
297 /// helper routine to convert instructions into SC
298 void MachineCombiner::instr2instrSC(
299 SmallVectorImpl<MachineInstr *> &Instrs,
300 SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
301 for (auto *InstrPtr : Instrs) {
302 unsigned Opc = InstrPtr->getOpcode();
303 unsigned Idx = TII->get(Opc).getSchedClass();
304 const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
305 InstrsSC.push_back(SC);
309 /// True when the new instructions do not increase resource length
310 bool MachineCombiner::preservesResourceLen(
311 MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
312 SmallVectorImpl<MachineInstr *> &InsInstrs,
313 SmallVectorImpl<MachineInstr *> &DelInstrs) {
314 if (!TSchedModel.hasInstrSchedModel())
317 // Compute current resource length
319 //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
320 SmallVector <const MachineBasicBlock *, 1> MBBarr;
321 MBBarr.push_back(MBB);
322 unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);
324 // Deal with SC rather than Instructions.
325 SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
326 SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;
328 instr2instrSC(InsInstrs, InsInstrsSC);
329 instr2instrSC(DelInstrs, DelInstrsSC);
331 ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
332 ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);
334 // Compute new resource length.
335 unsigned ResLenAfterCombine =
336 BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);
338 DEBUG(dbgs() << "RESOURCE DATA: \n";
339 dbgs() << " resource len before: " << ResLenBeforeCombine
340 << " after: " << ResLenAfterCombine << "\n";);
342 return ResLenAfterCombine <= ResLenBeforeCombine;
345 /// \returns true when new instruction sequence should be generated
346 /// independent if it lengthens critical path or not
347 bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
348 if (OptSize && (NewSize < OldSize))
350 if (!TSchedModel.hasInstrSchedModelOrItineraries())
355 /// Substitute a slow code sequence with a faster one by
356 /// evaluating instruction combining pattern.
357 /// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
358 /// combining based on machine trace metrics. Only combine a sequence of
359 /// instructions when this neither lengthens the critical path nor increases
360 /// resource pressure. When optimizing for codesize always combine when the new
361 /// sequence is shorter.
362 bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
363 bool Changed = false;
364 DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
366 auto BlockIter = MBB->begin();
367 // Check if the block is in a loop.
368 const MachineLoop *ML = MLI->getLoopFor(MBB);
370 while (BlockIter != MBB->end()) {
371 auto &MI = *BlockIter++;
373 DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
374 SmallVector<MachineCombinerPattern, 16> Patterns;
375 // The motivating example is:
377 // MUL Other MUL_op1 MUL_op2 Other
379 // ADD/SUB => MADD/MSUB
380 // (=Root) (=NewRoot)
382 // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
383 // usually beneficial for code size it unfortunately can hurt performance
384 // when the ADD is on the critical path, but the MUL is not. With the
385 // substitution the MUL becomes part of the critical path (in form of the
386 // MADD) and can lengthen it on architectures where the MADD latency is
387 // longer than the ADD latency.
389 // For each instruction we check if it can be the root of a combiner
390 // pattern. Then for each pattern the new code sequence in form of MI is
391 // generated and evaluated. When the efficiency criteria (don't lengthen
392 // critical path, don't use more resources) is met the new sequence gets
393 // hooked up into the basic block before the old sequence is removed.
395 // The algorithm does not try to evaluate all patterns and pick the best.
396 // This is only an artificial restriction though. In practice there is
397 // mostly one pattern, and getMachineCombinerPatterns() can order patterns
398 // based on an internal cost heuristic.
400 if (!TII->getMachineCombinerPatterns(MI, Patterns))
403 for (auto P : Patterns) {
404 SmallVector<MachineInstr *, 16> InsInstrs;
405 SmallVector<MachineInstr *, 16> DelInstrs;
406 DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
408 MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
409 MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
410 Traces->verifyAnalysis();
411 TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
413 unsigned NewInstCount = InsInstrs.size();
414 unsigned OldInstCount = DelInstrs.size();
415 // Found pattern, but did not generate alternative sequence.
416 // This can happen e.g. when an immediate could not be materialized
417 // in a single instruction.
421 bool SubstituteAlways = false;
422 if (ML && TII->isThroughputPattern(P))
423 SubstituteAlways = true;
425 // Substitute when we optimize for codesize and the new sequence has
426 // fewer instructions OR
427 // the new sequence neither lengthens the critical path nor increases
428 // resource pressure.
429 if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
430 (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
431 DelInstrs, InstrIdxForVirtReg, P) &&
432 preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
433 for (auto *InstrPtr : InsInstrs)
434 MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
435 for (auto *InstrPtr : DelInstrs)
436 InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
441 Traces->invalidate(MBB);
442 Traces->verifyAnalysis();
443 // Eagerly stop after the first pattern fires.
446 // Cleanup instructions of the alternative code sequence. There is no
448 MachineFunction *MF = MBB->getParent();
449 for (auto *InstrPtr : InsInstrs)
450 MF->DeleteMachineInstr(InstrPtr);
452 InstrIdxForVirtReg.clear();
459 bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
460 const TargetSubtargetInfo &STI = MF.getSubtarget();
461 TII = STI.getInstrInfo();
462 TRI = STI.getRegisterInfo();
463 SchedModel = STI.getSchedModel();
464 TSchedModel.init(SchedModel, &STI, TII);
465 MRI = &MF.getRegInfo();
466 MLI = &getAnalysis<MachineLoopInfo>();
467 Traces = &getAnalysis<MachineTraceMetrics>();
469 OptSize = MF.getFunction()->optForSize();
471 DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
472 if (!TII->useMachineCombiner()) {
473 DEBUG(dbgs() << " Skipping pass: Target does not support machine combiner\n");
477 bool Changed = false;
479 // Try to combine instructions.
481 Changed |= combineInstructions(&MBB);