//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPURegBankSelect.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUUnifyDivergentExitNodes.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
#include "R600.h"
#include "R600MachineFunctionInfo.h"
#include "R600TargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};
} // anonymous namespace

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

/// -{sgpr|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
                    "pick SGPR register allocator based on -O option",
                    useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for VGPRs"));
static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}

static SGPRRegisterRegAlloc basicRegAllocSGPR(
    "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
    "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
    "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
    "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
    "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
    "fast", "fast register allocator", createFastVGPRRegisterAllocator);

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
                 cl::init(true));

static cl::opt<bool>
LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
              cl::desc("Lower GPU ctor / dtors to globals on the device."),
              cl::init(true), cl::Hidden);

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> RemoveIncompatibleFunctions(
    "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
    cl::desc("Enable removal of functions when they "
             "use features not supported by the target GPU"),
    cl::init(true));

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

static cl::opt<bool> EnableDPPCombine(
  "amdgpu-dpp-combine",
  cl::desc("Enable DPP combiner"),
  cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableRegReassign(
  "amdgpu-reassign-regs",
  cl::desc("Enable register reassign optimizations on gfx10+"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
    "amdgpu-atomic-optimizer-strategy",
    cl::desc("Select DPP or Iterative strategy for scan"),
    cl::init(ScanOptions::Iterative),
    cl::values(
        clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
        clEnumValN(ScanOptions::Iterative, "Iterative",
                   "Use Iterative approach for scan"),
        clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
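
// For example (a usage sketch, not taken from this file):
//   llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP in.ll
// selects the DPP-based scan lowering instead of the iterative default.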
// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
  "amdgpu-mode-register",
  cl::desc("Enable mode register pass"),
  cl::init(true),
  cl::Hidden);

// Enable GFX11+ s_delay_alu insertion
static cl::opt<bool>
    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
                         cl::desc("Enable s_delay_alu insertion"),
                         cl::init(true), cl::Hidden);

// Enable GFX11+ VOPD
static cl::opt<bool>
    EnableVOPD("amdgpu-enable-vopd",
               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
               cl::init(true), cl::Hidden);

// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
              cl::init(true), cl::Hidden,
              cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
  "amdgpu-scalar-ir-passes",
  cl::desc("Enable scalar IR passes"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableStructurizerWorkarounds(
    "amdgpu-enable-structurizer-workarounds",
    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));

static cl::opt<bool> EnableMaxIlpSchedStrategy(
    "amdgpu-enable-max-ilp-scheduling-strategy",
    cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
    cl::Hidden, cl::init(false));

static cl::opt<bool> EnableRewritePartialRegUses(
    "amdgpu-enable-rewrite-partial-reg-uses",
    cl::desc("Enable rewrite partial reg uses pass"), cl::init(false),
    cl::Hidden);

extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
  // Register the target
  RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
  RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());

  PassRegistry *PR = PassRegistry::getPassRegistry();
  initializeR600ClauseMergePassPass(*PR);
  initializeR600ControlFlowFinalizerPass(*PR);
  initializeR600PacketizerPass(*PR);
  initializeR600ExpandSpecialInstrsPassPass(*PR);
  initializeR600VectorRegMergerPass(*PR);
  initializeGlobalISel(*PR);
  initializeAMDGPUDAGToDAGISelPass(*PR);
  initializeGCNDPPCombinePass(*PR);
  initializeSILowerI1CopiesPass(*PR);
  initializeSILowerWWMCopiesPass(*PR);
  initializeSILowerSGPRSpillsPass(*PR);
  initializeSIFixSGPRCopiesPass(*PR);
  initializeSIFixVGPRCopiesPass(*PR);
  initializeSIFoldOperandsPass(*PR);
  initializeSIPeepholeSDWAPass(*PR);
  initializeSIShrinkInstructionsPass(*PR);
  initializeSIOptimizeExecMaskingPreRAPass(*PR);
  initializeSIOptimizeVGPRLiveRangePass(*PR);
  initializeSILoadStoreOptimizerPass(*PR);
  initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
  initializeAMDGPUAlwaysInlinePass(*PR);
  initializeAMDGPUAttributorPass(*PR);
  initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
  initializeAMDGPUAnnotateUniformValuesPass(*PR);
  initializeAMDGPUArgumentUsageInfoPass(*PR);
  initializeAMDGPUAtomicOptimizerPass(*PR);
  initializeAMDGPULowerKernelArgumentsPass(*PR);
  initializeAMDGPUPromoteKernelArgumentsPass(*PR);
  initializeAMDGPULowerKernelAttributesPass(*PR);
  initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
  initializeAMDGPUPostLegalizerCombinerPass(*PR);
  initializeAMDGPUPreLegalizerCombinerPass(*PR);
  initializeAMDGPURegBankCombinerPass(*PR);
  initializeAMDGPURegBankSelectPass(*PR);
  initializeAMDGPUPromoteAllocaPass(*PR);
  initializeAMDGPUPromoteAllocaToVectorPass(*PR);
  initializeAMDGPUCodeGenPreparePass(*PR);
  initializeAMDGPULateCodeGenPreparePass(*PR);
  initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
  initializeAMDGPULowerModuleLDSPass(*PR);
  initializeAMDGPURewriteOutArgumentsPass(*PR);
  initializeAMDGPURewriteUndefForPHIPass(*PR);
  initializeAMDGPUUnifyMetadataPass(*PR);
  initializeSIAnnotateControlFlowPass(*PR);
  initializeAMDGPUInsertDelayAluPass(*PR);
  initializeSIInsertHardClausesPass(*PR);
  initializeSIInsertWaitcntsPass(*PR);
  initializeSIModeRegisterPass(*PR);
  initializeSIWholeQuadModePass(*PR);
  initializeSILowerControlFlowPass(*PR);
  initializeSIPreEmitPeepholePass(*PR);
  initializeSILateBranchLoweringPass(*PR);
  initializeSIMemoryLegalizerPass(*PR);
  initializeSIOptimizeExecMaskingPass(*PR);
  initializeSIPreAllocateWWMRegsPass(*PR);
  initializeSIFormMemoryClausesPass(*PR);
  initializeSIPostRABundlerPass(*PR);
  initializeGCNCreateVOPDPass(*PR);
  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
  initializeAMDGPUAAWrapperPassPass(*PR);
  initializeAMDGPUExternalAAWrapperPass(*PR);
  initializeAMDGPUUseNativeCallsPass(*PR);
  initializeAMDGPUSimplifyLibCallsPass(*PR);
  initializeAMDGPUPrintfRuntimeBindingPass(*PR);
  initializeAMDGPUResourceUsageAnalysisPass(*PR);
  initializeGCNNSAReassignPass(*PR);
  initializeGCNPreRAOptimizationsPass(*PR);
  initializeGCNPreRALongBranchRegPass(*PR);
  initializeGCNRewritePartialRegUsesPass(*PR);
}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation());
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG =
      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
  DAG->addMutation(createIGroupLPDAGMutation());
  return DAG;
}

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
                       createGCNMaxILPMachineScheduler);

static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
    "gcn-iterative-max-occupancy-experimental",
    "Run GCN scheduler to maximize occupancy (experimental)",
    createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry GCNMinRegSchedRegistry(
    "gcn-iterative-minreg",
    "Run GCN iterative scheduler for minimal register usage (experimental)",
    createMinRegScheduler);

static MachineSchedRegistry GCNILPSchedRegistry(
    "gcn-iterative-ilp",
    "Run GCN iterative scheduler for ILP scheduling (experimental)",
    createIterativeILPMachineScheduler);
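
// These registrations plug into the generic MachineScheduler framework; a
// specific variant is normally selected with the -misched option, e.g.
// (a usage sketch, not taken from this file):
//   llc -mtriple=amdgcn -misched=gcn-max-ilp in.ll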
static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
  // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
  // (address space 7), and 128-bit non-integral buffer resources (address
  // space 8) which cannot be non-trivially accessed by LLVM memory operations
  // like getelementptr.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:"
         "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
         "G1-ni:7:8";
}

static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         std::optional<Reloc::Model> RM,
                                         std::optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.getArch() == Triple::amdgcn) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().startswith("__asan_") ||
           F->getName().startswith("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}
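
// In other words: declarations, sanitizer runtime symbols, entry functions
// (kernels), and globals that still have uses are preserved; everything else
// may be internalized and later stripped by GlobalDCE.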
void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [](StringRef PassName, ModulePassManager &PM,
         ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-unify-metadata") {
          PM.addPass(AMDGPUUnifyMetadataPass());
          return true;
        }
        if (PassName == "amdgpu-printf-runtime-binding") {
          PM.addPass(AMDGPUPrintfRuntimeBindingPass());
          return true;
        }
        if (PassName == "amdgpu-always-inline") {
          PM.addPass(AMDGPUAlwaysInlinePass());
          return true;
        }
        if (PassName == "amdgpu-lower-module-lds") {
          PM.addPass(AMDGPULowerModuleLDSPass());
          return true;
        }
        if (PassName == "amdgpu-lower-ctor-dtor") {
          PM.addPass(AMDGPUCtorDtorLoweringPass());
          return true;
        }
        return false;
      });
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, FunctionPassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-simplifylib") {
          PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
          return true;
        }
        if (PassName == "amdgpu-usenative") {
          PM.addPass(AMDGPUUseNativeCallsPass());
          return true;
        }
        if (PassName == "amdgpu-promote-alloca") {
          PM.addPass(AMDGPUPromoteAllocaPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-alloca-to-vector") {
          PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
          return true;
        }
        if (PassName == "amdgpu-lower-kernel-attributes") {
          PM.addPass(AMDGPULowerKernelAttributesPass());
          return true;
        }
        if (PassName == "amdgpu-promote-kernel-arguments") {
          PM.addPass(AMDGPUPromoteKernelArgumentsPass());
          return true;
        }
        if (PassName == "amdgpu-unify-divergent-exit-nodes") {
          PM.addPass(AMDGPUUnifyDivergentExitNodesPass());
          return true;
        }
        if (PassName == "amdgpu-atomic-optimizer") {
          PM.addPass(
              AMDGPUAtomicOptimizerPass(*this, AMDGPUAtomicOptimizerStrategy));
          return true;
        }
        if (PassName == "amdgpu-codegenprepare") {
          PM.addPass(AMDGPUCodeGenPreparePass(*this));
          return true;
        }
        return false;
      });
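
  // The names registered above become usable in the new pass manager's textual
  // pipeline description, e.g. (a usage sketch, not taken from this file):
  //   opt -passes=amdgpu-promote-alloca,amdgpu-codegenprepare in.ll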
  PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
    FAM.registerPass([&] { return AMDGPUAA(); });
  });

  PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
    if (AAName == "amdgpu-aa") {
      AAM.registerFunctionAnalysis<AMDGPUAA>();
      return true;
    }
    return false;
  });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
          FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [](ModulePassManager &PM, OptimizationLevel Level) {
        PM.addPass(AMDGPUPrintfRuntimeBindingPass());

        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());

        if (InternalizeSymbols) {
          PM.addPass(InternalizePass(mustPreserveGV));
          PM.addPass(GlobalDCEPass());
        }

        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });
  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // rewriting.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });
}

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a generic pointer loaded.
  assert(V->getType()->isPointerTy() &&
         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}
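
// A sketch of the IR shape recognized above (hypothetical example; address
// space 4 is AMDGPU's constant address space):
//   %p = load ptr, ptr addrspace(4) %q
// Here %p is a flat pointer loaded from constant memory, so it is assumed to
// point to global memory.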
std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::pair(nullptr, -1);
  }
  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logic 'and' is commutative and
  // the order of 'is_shared' and 'is_private' is not significant.
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Value(Ptr))))))
    return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::pair(nullptr, -1);
}

unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   std::optional<Reloc::Model> RM,
                                   std::optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
      : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = new GCNPostScheduleDAGMILive(
        C, std::make_unique<PostGenericScheduler>(C),
        /*RemoveKillFlags=*/true);
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    if (ST.shouldClusterStores())
      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
    DAG->addMutation(createIGroupLPDAGMutation());
    if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
      DAG->addMutation(createVOPDPairingMutation());
    return DAG;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createSeparateConstOffsetFromGEPPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  if (LowerCtorDtor)
    addPass(createAMDGPUCtorDtorLoweringLegacyPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  // Runs before PromoteAlloca so the latter can account for function uses
  if (EnableLowerModuleLDS) {
    addPass(createAMDGPULowerModuleLDSPass());
  }

  // AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run
  // after their introduction
  if (TM.getOptLevel() > CodeGenOpt::None)
    addPass(createAMDGPUAttributorPass());

  if (TM.getOptLevel() > CodeGenOpt::None)
    addPass(createInferAddressSpacesPass());

  addPass(createAtomicExpandPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (isPassEnabled(EnableSROA))
      addPass(createSROAPass());
    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }

    // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
    // have expanded.
    if (TM.getOptLevel() > CodeGenOpt::Less)
      addPass(createLICMPass());
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = mul %a, 4
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    if (RemoveIncompatibleFunctions)
      addPass(createAMDGPURemoveIncompatibleFunctionsPass(TM));

    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
    // analysis, and should be removed.
    addPass(createAMDGPUAnnotateKernelFeaturesPass());
  }

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  // LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here means these
  // blocks will be cleaned up by the UnreachableBlockElim pass inserted next
  // in the pass flow.
  addPass(createLowerSwitchPass());
}

bool AMDGPUPassConfig::addPreISel() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

llvm::ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

MachineFunctionInfo *R600TargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return R600MachineFunctionInfo::create<R600MachineFunctionInfo>(
      Allocator, F, static_cast<const R600Subtarget *>(STI));
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
    MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);

  if (EnableMaxIlpSchedStrategy)
    return createGCNMaxILPMachineScheduler(C);

  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createAMDGPULateCodeGenPreparePass());

  if ((TM->getOptLevel() >= CodeGenOpt::Less) &&
      (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
    addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
  }

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSinkingPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    if (EnableStructurizerWorkarounds) {
      addPass(createFixIrreduciblePass());
      addPass(createUnifyLoopExitsPass());
    }
    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
  }
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
    // TODO: Move this right after structurizeCFG to avoid extra divergence
    // analysis. This depends on stopping SIAnnotateControlFlow from making
    // control flow modifications.
    addPass(createAMDGPURewriteUndefForPHIPass());
  }
  addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOpt::Less)
    addPass(&AMDGPUPerfHintAnalysisID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&SILoadStoreOptimizerID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new AMDGPURegBankSelect());
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.
  //
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
  insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);

  TargetPassConfig::addFastRegAlloc();
}

void GCNPassConfig::addOptimizedRegAlloc() {
  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
  insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  if (EnableRewritePartialRegUses)
    insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOpt::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  // FIXME: when an instruction has a Killed operand, and the instruction is
  // inside a bundle, seems only the BUNDLE instruction appears as the Kills of
  // the register in LiveVariables, this would trigger a failure in verifier,
  // we should fix it and enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  addPass(&SILowerWWMCopiesID);
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}

FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(false));

  addPass(&SILowerWWMCopiesID);
  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(&GCNPreRALongBranchRegID);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(true));

  addPass(&SILowerWWMCopiesID);
  addPass(&VirtRegRewriterID);
  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSIShrinkInstructionsPass());
  addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
  if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
    addPass(&GCNCreateVOPDID);
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

void GCNTargetMachine::registerMachineRegisterInfoCallback(
    MachineFunction &MF) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MF.getRegInfo().addDelegate(MFI);
}

MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
    BumpPtrAllocator &Allocator, const Function &F,
    const TargetSubtargetInfo *STI) const {
  return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
      Allocator, F, static_cast<const GCNSubtarget *>(STI));
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
    return true;

  if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
                            MFI->LongBranchReservedReg))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         std::nullopt, std::nullopt);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

  auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.LDSKernelId, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;

  // FIXME: Move proper support for denormal-fp-math into base MachineFunction
  MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
                                      ? DenormalMode::IEEE
                                      : DenormalMode::PreserveSign;
  MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
                                       ? DenormalMode::IEEE
                                       : DenormalMode::PreserveSign;

  MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
                                          ? DenormalMode::IEEE
                                          : DenormalMode::PreserveSign;
  MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
                                           ? DenormalMode::IEEE
                                           : DenormalMode::PreserveSign;

  return false;
}