//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for R600 and SI GPUs.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"

#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "R600MachineScheduler.h"
#include "SIMachineScheduler.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Vectorize.h"
#include <memory>

using namespace llvm;

static cl::opt<bool> EnableR600StructurizeCFG(
  "r600-ir-structurize",
  cl::desc("Use StructurizeCFG IR pass"),
  cl::init(true));

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool> EnableR600IfConvert(
  "r600-if-convert",
  cl::desc("Use if conversion pass"),
  cl::ReallyHidden,
  cl::init(true));

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to enable new waitcnt insertion pass.
static cl::opt<bool> EnableSIInsertWaitcntsPass(
  "enable-si-insert-waitcnts",
  cl::desc("Use new waitcnt insertion pass"),
  cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::init(false),
  cl::Hidden);

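// All of the cl::opts above are ordinary llc/opt command-line flags; an
// illustrative invocation that overrides one default:
//
//   llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 kernel.ll
//
// disables the SDWA peephole while leaving the other options at their
// defaults.
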
126 extern "C" void LLVMInitializeAMDGPUTarget() {
127 // Register the target
128 RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
129 RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
131 PassRegistry *PR = PassRegistry::getPassRegistry();
132 initializeSILowerI1CopiesPass(*PR);
133 initializeSIFixSGPRCopiesPass(*PR);
134 initializeSIFixVGPRCopiesPass(*PR);
135 initializeSIFoldOperandsPass(*PR);
136 initializeSIPeepholeSDWAPass(*PR);
137 initializeSIShrinkInstructionsPass(*PR);
138 initializeSIFixControlFlowLiveIntervalsPass(*PR);
139 initializeSILoadStoreOptimizerPass(*PR);
140 initializeAMDGPUAlwaysInlinePass(*PR);
141 initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
142 initializeAMDGPUAnnotateUniformValuesPass(*PR);
143 initializeAMDGPULowerIntrinsicsPass(*PR);
144 initializeAMDGPUPromoteAllocaPass(*PR);
145 initializeAMDGPUCodeGenPreparePass(*PR);
146 initializeAMDGPUUnifyMetadataPass(*PR);
147 initializeSIAnnotateControlFlowPass(*PR);
148 initializeSIInsertWaitsPass(*PR);
149 initializeSIInsertWaitcntsPass(*PR);
150 initializeSIWholeQuadModePass(*PR);
151 initializeSILowerControlFlowPass(*PR);
152 initializeSIInsertSkipsPass(*PR);
153 initializeSIDebuggerInsertNopsPass(*PR);
154 initializeSIOptimizeExecMaskingPass(*PR);
155 initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
156 initializeAMDGPUAAWrapperPassPass(*PR);
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return llvm::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
  return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
                  createR600MachineScheduler);

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
  "Run GCN scheduler to maximize occupancy (experimental)",
  createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
  "Run GCN iterative scheduler for minimal register usage (experimental)",
  createMinRegScheduler);

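// Each MachineSchedRegistry entry makes its scheduler selectable through the
// generic -misched flag; for example (illustrative):
//
//   llc -march=amdgcn -misched=gcn-minreg kernel.ll
//
// runs the iterative minimal-register scheduler instead of the default.
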
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat.
  if (TT.getEnvironmentName() == "amdgiz" ||
      TT.getEnvironmentName() == "amdgizcl")
    return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
           "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
}

static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // HSA only supports CI+, so change the default GPU to a CI for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return (TT.getOS() == Triple::AMDHSA) ? "kaveri" : "tahiti";

  return "r600";
}

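// "kaveri" is a Sea Islands (CI) part and "tahiti" a Southern Islands (SI)
// part, so each default matches the minimum generation the OS supports.
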
static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         CodeModel::Model CM,
                                         CodeGenOpt::Level OptLevel)
  : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                      FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
    TLOF(createTLOF(getTargetTriple())) {
  AS = AMDGPU::getAMDGPUAS(TT);
  initAsmInfo();
}

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.hasAttribute(Attribute::None) ?
    getTargetCPU() : GPUAttr.getValueAsString();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.hasAttribute(Attribute::None) ?
    getTargetFeatureString() :
    FSAttr.getValueAsString();
}

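// The "target-cpu" / "target-features" attributes consulted above come from
// the IR itself; an illustrative (not from this file) function carrying them:
//
//   define void @f() #0 { ret void }
//   attributes #0 = { "target-cpu"="fiji" "target-features"="+fp64" }
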
static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
  return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
      if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
        AAR.addAAResult(WrapperPass->getResult());
      });
}

void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool Internalize = InternalizeSymbols &&
                     (getOptLevel() > CodeGenOpt::None) &&
                     (getTargetTriple().getArch() == Triple::amdgcn);
  bool EarlyInline = EarlyInlineAll &&
                     (getOptLevel() > CodeGenOpt::None);
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && getOptLevel() > CodeGenOpt::None;

  Builder.addExtension(
    PassManagerBuilder::EP_ModuleOptimizerEarly,
    [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
                                         legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(createAMDGPUUnifyMetadataPass());
      if (Internalize) {
        PM.add(createInternalizePass([=](const GlobalValue &GV) -> bool {
          if (const Function *F = dyn_cast<Function>(&GV)) {
            if (F->isDeclaration())
              return true;
            switch (F->getCallingConv()) {
            default:
              return false;
            case CallingConv::AMDGPU_VS:
            case CallingConv::AMDGPU_HS:
            case CallingConv::AMDGPU_GS:
            case CallingConv::AMDGPU_PS:
            case CallingConv::AMDGPU_CS:
            case CallingConv::AMDGPU_KERNEL:
            case CallingConv::SPIR_KERNEL:
              return true;
            }
          }
          return !GV.use_empty();
        }));
        PM.add(createGlobalDCEPass());
      }
      if (EarlyInline)
        PM.add(createAMDGPUAlwaysInlinePass(false));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
    [AMDGPUAA](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
  });

  Builder.addExtension(
    PassManagerBuilder::EP_CGSCCOptimizerLate,
    [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
      // Add infer address spaces pass to the opt pipeline after inlining
      // but before SROA to increase SROA opportunities.
      PM.add(createInferAddressSpacesPass());
  });
}

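// Usage sketch (illustrative, assuming the legacy pass manager): a front end
// picks up the extension points registered above via PassManagerBuilder:
//
//   legacy::PassManager PM;
//   PassManagerBuilder PMB;
//   TM->adjustPassManager(PMB);
//   PMB.populateModulePassManager(PM);
//   PM.run(M);
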
//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
                                     StringRef CPU, StringRef FS,
                                     TargetOptions Options,
                                     Optional<Reloc::Model> RM,
                                     CodeModel::Model CM, CodeGenOpt::Level OL)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
  setRequiresStructuredCFG(true);
}

const R600Subtarget *R600TargetMachine::getSubtargetImpl(
  const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
  }

  return I.get();
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   CodeModel::Model CM, CodeGenOpt::Level OL)
  : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

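// Note the cache key concatenates the GPU name and feature string, so
// functions that share both also share a single SISubtarget instance.
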
//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

namespace {

class AMDGPUPassConfig : public TargetPassConfig {
public:
  AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
    // Exceptions and StackMaps are not supported, so these passes will never do
    // anything.
    disablePass(&StackMapLivenessID);
    disablePass(&FuncletLayoutID);
  }

  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
    return getTM<AMDGPUTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    return DAG;
  }

  void addEarlyCSEOrGVNPass();
  void addStraightLineScalarOptimizationPasses();
  void addIRPasses() override;
  void addCodeGenPrepare() override;
  bool addPreISel() override;
  bool addInstSelector() override;
  bool addGCPasses() override;
};

class R600PassConfig final : public AMDGPUPassConfig {
public:
  R600PassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {}

  ScheduleDAGInstrs *createMachineScheduler(
    MachineSchedContext *C) const override {
    return createR600MachineScheduler(C);
  }

  bool addPreISel() override;
  void addPreRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {}

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
#ifdef LLVM_BUILD_GLOBAL_ISEL
  bool addIRTranslator() override;
  bool addLegalizeMachineIR() override;
  bool addRegBankSelect() override;
  bool addGlobalInstructionSelect() override;
#endif
  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
  void addPreRegAlloc() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() {
  return TargetIRAnalysis([this](const Function &F) {
    return TargetTransformInfo(AMDGPUTTIImpl(this, F));
  });
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

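// Illustrative SLSR rewrite (not taken from the referenced test): given
//
//   %p0 = getelementptr i32, i32* %base, i32 %i
//   %p1 = getelementptr i32, i32* %base, i32 %j   ; %j = add i32 %i, 1
//
// SLSR re-expresses %p1 as a fixed offset from %p0, and the EarlyCSE/GVN
// run that follows cleans up the now-redundant index computations.
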
void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfigs passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
    // TODO: May want to move later or split into an early and late one.
    addPass(createAMDGPUCodeGenPreparePass());
  }

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  addPass(createAMDGPUOpenCLImageTypeLoweringPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createInferAddressSpacesPass());
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());

    addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
        }));
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = mul %a, 4
  //
  // but EarlyCSE can do neither of them.
  if (getOptLevel() != CodeGenOpt::None)
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  TargetPassConfig::addCodeGenPrepare();

  if (EnableLoadStoreVectorizer)
    addPass(createLoadStoreVectorizerPass());
}

bool AMDGPUPassConfig::addPreISel() {
  addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

//===----------------------------------------------------------------------===//
// R600 Pass Setup
//===----------------------------------------------------------------------===//

bool R600PassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (EnableR600StructurizeCFG)
    addPass(createStructurizeCFGPass());
  return false;
}

void R600PassConfig::addPreRegAlloc() {
  addPass(createR600VectorRegMerger());
}

void R600PassConfig::addPreSched2() {
  addPass(createR600EmitClauseMarkers(), false);
  if (EnableR600IfConvert)
    addPass(&IfConverterID, false);
  addPass(createR600ClauseMergePass(), false);
}

void R600PassConfig::addPreEmitPass() {
  addPass(createAMDGPUCFGStructurizerPass(), false);
  addPass(createR600ExpandSpecialInstrsPass(), false);
  addPass(&FinalizeMachineBundlesID, false);
  addPass(createR600Packetizer(), false);
  addPass(createR600ControlFlowFinalizer(), false);
}

TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) {
  return new R600PassConfig(*this, PM);
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  // FIXME: We need to run a pass to propagate the attributes when calls are
  // supported.
  addPass(createAMDGPUAnnotateKernelFeaturesPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
  }
  addPass(createSinkingPass());
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }

  return false;
}

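// Only divergent branches must be turned into structured control flow so the
// exec mask can be manipulated; SkipUniformRegions leaves regions whose
// branch condition is uniform as ordinary scalar branches.
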
void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  addPass(&DeadMachineInstructionElimID);
  addPass(&SILoadStoreOptimizerID);
  if (EnableSDWAPeephole) {
    addPass(&SIPeepholeSDWAID);
    addPass(&MachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
    addPass(&DeadMachineInstructionElimID);
  }
  addPass(createSIShrinkInstructionsPass());
}

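// SDWA (Sub-DWord Addressing) lets VALU instructions operate on byte or word
// slices of a 32-bit register; the peephole above folds shift/mask patterns
// into those operand modifiers, then the repeated fold/DCE passes clean up
// the copies this exposes.
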
bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(createSILowerI1CopiesPass());
  addPass(&SIFixSGPRCopiesID);
  return false;
}

#ifdef LLVM_BUILD_GLOBAL_ISEL
bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator());
  return false;
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect());
  return false;
}
#endif

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
  addPass(createSIWholeQuadModePass());
}

void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  // This needs to be run directly before register allocation because earlier
  // passes might recompute live intervals.
  insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
}

void GCNPassConfig::addPreEmitPass() {
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // hazards.
  addPass(&PostRAHazardRecognizerID);

  if (EnableSIInsertWaitcntsPass)
    addPass(createSIInsertWaitcntsPass());
  else
    addPass(createSIInsertWaitsPass());

  addPass(createSIShrinkInstructionsPass());
  addPass(&SIInsertSkipsPassID);
  addPass(createSIDebuggerInsertNopsPass());
  addPass(&BranchRelaxationPassID);
}

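// Either waitcnt pass enforces the same contract: GCN memory instructions
// complete asynchronously, so an s_waitcnt must execute before the result of
// an outstanding memory operation is read; the passes aim to insert as few
// of them as possible.
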
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}