//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// The AMDGPU target machine contains all of the hardware specific
/// information needed to emit code for SI+ GPUs.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Vectorize.h"

using namespace llvm;
using namespace llvm::PatternMatch;
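
// AMDGPU allocates SGPRs and VGPRs in two separate regalloc runs (see
// GCNPassConfig::addRegAssignAndRewriteOptimized below), so it keeps a
// dedicated RegisterRegAllocBase registry for each class of registers and
// exposes them through the -sgpr-regalloc and -vgpr-regalloc options.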

namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
  SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};

class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
  VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
      : RegisterRegAllocBase(N, D, C) {}
};
} // anonymous namespace

static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
                              const TargetRegisterClass &RC) {
  return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(&RC);
}

/// -{sgpr|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }

/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;

static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
                    "pick SGPR register allocator based on -O option",
                    useDefaultRegisterAllocator);

static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<SGPRRegisterRegAlloc>>
SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for SGPRs"));

static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
               RegisterPassParser<VGPRRegisterRegAlloc>>
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
             cl::desc("Register allocator to use for VGPRs"));

static void initializeDefaultSGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = SGPRRegAlloc;
    SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
  }
}

static void initializeDefaultVGPRRegisterAllocatorOnce() {
  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();

  if (!Ctor) {
    Ctor = VGPRRegAlloc;
    VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
  }
}

static FunctionPass *createBasicSGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createGreedySGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}

static FunctionPass *createFastSGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

static FunctionPass *createBasicVGPRRegisterAllocator() {
  return createBasicRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createGreedyVGPRRegisterAllocator() {
  return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}

static FunctionPass *createFastVGPRRegisterAllocator() {
  return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}
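
// Note: the boolean argument to createFastRegisterAllocator is ClearVirtRegs.
// The SGPR run passes false so the still-unallocated VGPR virtual registers
// survive into the later VGPR run; only the final VGPR run passes true to
// clear all remaining virtual registers.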

static SGPRRegisterRegAlloc basicRegAllocSGPR(
  "basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
  "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);

static SGPRRegisterRegAlloc fastRegAllocSGPR(
  "fast", "fast register allocator", createFastSGPRRegisterAllocator);

static VGPRRegisterRegAlloc basicRegAllocVGPR(
  "basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
  "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);

static VGPRRegisterRegAlloc fastRegAllocVGPR(
  "fast", "fast register allocator", createFastVGPRRegisterAllocator);

static cl::opt<bool> EnableSROA(
  "amdgpu-sroa",
  cl::desc("Run SROA after promote alloca pass"),
  cl::ReallyHidden,
  cl::init(true));

static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
                        cl::desc("Run early if-conversion"),
                        cl::init(false));

static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
                 cl::desc("Run pre-RA exec mask optimizations"),
                 cl::init(true));

// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
  "amdgpu-load-store-vectorizer",
  cl::desc("Enable load store vectorizer"),
  cl::init(true),
  cl::Hidden);

// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
  "amdgpu-scalarize-global-loads",
  cl::desc("Enable global load scalarization"),
  cl::init(true),
  cl::Hidden);

// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
  "amdgpu-internalize-symbols",
  cl::desc("Enable elimination of non-kernel functions and unused globals"),
  cl::init(false),
  cl::Hidden);

// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
  "amdgpu-early-inline-all",
  cl::desc("Inline all functions early"),
  cl::init(false),
  cl::Hidden);

static cl::opt<bool> EnableSDWAPeephole(
  "amdgpu-sdwa-peephole",
  cl::desc("Enable SDWA peepholer"),
  cl::init(true));

static cl::opt<bool> EnableDPPCombine(
  "amdgpu-dpp-combine",
  cl::desc("Enable DPP combiner"),
  cl::init(true));

// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
  cl::desc("Enable AMDGPU Alias Analysis"),
  cl::init(true));

// Option to run late CFG structurizer
static cl::opt<bool, true> LateCFGStructurize(
  "amdgpu-late-structurize",
  cl::desc("Enable late CFG structurization"),
  cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG),
  cl::Hidden);

// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
  "amdgpu-simplify-libcall",
  cl::desc("Enable amdgpu library simplifications"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableLowerKernelArguments(
  "amdgpu-ir-lower-kernel-arguments",
  cl::desc("Lower kernel argument loads in IR pass"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableRegReassign(
  "amdgpu-reassign-regs",
  cl::desc("Enable register reassign optimizations on gfx10+"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> OptVGPRLiveRange(
    "amdgpu-opt-vgpr-liverange",
    cl::desc("Enable VGPR liverange optimizations for if-else structure"),
    cl::init(true), cl::Hidden);

// Enable atomic optimization
static cl::opt<bool> EnableAtomicOptimizations(
  "amdgpu-atomic-optimizations",
  cl::desc("Enable atomic optimizations"),
  cl::init(false),
  cl::Hidden);

// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
  "amdgpu-mode-register",
  cl::desc("Enable mode register pass"),
  cl::init(true),
  cl::Hidden);

// Enable GFX11+ s_delay_alu insertion
static cl::opt<bool>
    EnableInsertDelayAlu("amdgpu-enable-delay-alu",
                         cl::desc("Enable s_delay_alu insertion"),
                         cl::init(true), cl::Hidden);

// Enable GFX11+ VOPD
static cl::opt<bool>
    EnableVOPD("amdgpu-enable-vopd",
               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
               cl::init(true), cl::Hidden);

// Option is used in lit tests to prevent deadcoding of patterns inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
    cl::init(true), cl::Hidden,
    cl::desc("Enable machine DCE inside regalloc"));

static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
                                           cl::desc("Adjust wave priority"),
                                           cl::init(false), cl::Hidden);

static cl::opt<bool> EnableScalarIRPasses(
  "amdgpu-scalar-ir-passes",
  cl::desc("Enable scalar IR passes"),
  cl::init(true),
  cl::Hidden);

static cl::opt<bool> EnableStructurizerWorkarounds(
    "amdgpu-enable-structurizer-workarounds",
    cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnableLDSReplaceWithPointer(
    "amdgpu-enable-lds-replace-with-pointer",
    cl::desc("Enable LDS replace with pointer pass"), cl::init(false),
    cl::Hidden);

static cl::opt<bool, true> EnableLowerModuleLDS(
    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePreRAOptimizations(
    "amdgpu-enable-pre-ra-optimizations",
    cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
    cl::Hidden);

static cl::opt<bool> EnablePromoteKernelArguments(
    "amdgpu-enable-promote-kernel-arguments",
    cl::desc("Enable promotion of flat kernel pointer arguments to global"),
    cl::Hidden, cl::init(true));
330 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
331 // Register the target
332 RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
333 RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
335 PassRegistry *PR = PassRegistry::getPassRegistry();
336 initializeR600ClauseMergePassPass(*PR);
337 initializeR600ControlFlowFinalizerPass(*PR);
338 initializeR600PacketizerPass(*PR);
339 initializeR600ExpandSpecialInstrsPassPass(*PR);
340 initializeR600VectorRegMergerPass(*PR);
341 initializeGlobalISel(*PR);
342 initializeAMDGPUDAGToDAGISelPass(*PR);
343 initializeGCNDPPCombinePass(*PR);
344 initializeSILowerI1CopiesPass(*PR);
345 initializeSILowerSGPRSpillsPass(*PR);
346 initializeSIFixSGPRCopiesPass(*PR);
347 initializeSIFixVGPRCopiesPass(*PR);
348 initializeSIFoldOperandsPass(*PR);
349 initializeSIPeepholeSDWAPass(*PR);
350 initializeSIShrinkInstructionsPass(*PR);
351 initializeSIOptimizeExecMaskingPreRAPass(*PR);
352 initializeSIOptimizeVGPRLiveRangePass(*PR);
353 initializeSILoadStoreOptimizerPass(*PR);
354 initializeAMDGPUCtorDtorLoweringPass(*PR);
355 initializeAMDGPUAlwaysInlinePass(*PR);
356 initializeAMDGPUAttributorPass(*PR);
357 initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
358 initializeAMDGPUAnnotateUniformValuesPass(*PR);
359 initializeAMDGPUArgumentUsageInfoPass(*PR);
360 initializeAMDGPUAtomicOptimizerPass(*PR);
361 initializeAMDGPULowerKernelArgumentsPass(*PR);
362 initializeAMDGPUPromoteKernelArgumentsPass(*PR);
363 initializeAMDGPULowerKernelAttributesPass(*PR);
364 initializeAMDGPULowerIntrinsicsPass(*PR);
365 initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
366 initializeAMDGPUPostLegalizerCombinerPass(*PR);
367 initializeAMDGPUPreLegalizerCombinerPass(*PR);
368 initializeAMDGPURegBankCombinerPass(*PR);
369 initializeAMDGPUPromoteAllocaPass(*PR);
370 initializeAMDGPUPromoteAllocaToVectorPass(*PR);
371 initializeAMDGPUCodeGenPreparePass(*PR);
372 initializeAMDGPULateCodeGenPreparePass(*PR);
373 initializeAMDGPUPropagateAttributesEarlyPass(*PR);
374 initializeAMDGPUPropagateAttributesLatePass(*PR);
375 initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
376 initializeAMDGPULowerModuleLDSPass(*PR);
377 initializeAMDGPURewriteOutArgumentsPass(*PR);
378 initializeAMDGPUUnifyMetadataPass(*PR);
379 initializeSIAnnotateControlFlowPass(*PR);
380 initializeAMDGPUReleaseVGPRsPass(*PR);
381 initializeAMDGPUInsertDelayAluPass(*PR);
382 initializeSIInsertHardClausesPass(*PR);
383 initializeSIInsertWaitcntsPass(*PR);
384 initializeSIModeRegisterPass(*PR);
385 initializeSIWholeQuadModePass(*PR);
386 initializeSILowerControlFlowPass(*PR);
387 initializeSIPreEmitPeepholePass(*PR);
388 initializeSILateBranchLoweringPass(*PR);
389 initializeSIMemoryLegalizerPass(*PR);
390 initializeSIOptimizeExecMaskingPass(*PR);
391 initializeSIPreAllocateWWMRegsPass(*PR);
392 initializeSIFormMemoryClausesPass(*PR);
393 initializeSIPostRABundlerPass(*PR);
394 initializeGCNCreateVOPDPass(*PR);
395 initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
396 initializeAMDGPUAAWrapperPassPass(*PR);
397 initializeAMDGPUExternalAAWrapperPass(*PR);
398 initializeAMDGPUUseNativeCallsPass(*PR);
399 initializeAMDGPUSimplifyLibCallsPass(*PR);
400 initializeAMDGPUPrintfRuntimeBindingPass(*PR);
401 initializeAMDGPUResourceUsageAnalysisPass(*PR);
402 initializeGCNNSAReassignPass(*PR);
403 initializeGCNPreRAOptimizationsPass(*PR);

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
  return std::make_unique<AMDGPUTargetObjectFile>();
}

static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
  return new SIScheduleDAGMI(C);
}

static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG =
    new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createIGroupLPDAGMutation());
  DAG->addMutation(createSchedBarrierDAGMutation());
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}
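
// The mutations registered above only constrain scheduling decisions: load
// and store clustering keeps adjacent memory operations together, the
// IGroupLP and sched.barrier mutations enforce the grouping requested by the
// amdgcn.sched.* intrinsics, and macro fusion and export clustering keep
// fusible pairs and export instructions adjacent.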

static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
  return new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
}

static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  auto DAG = new GCNIterativeScheduler(C,
    GCNIterativeScheduler::SCHEDULE_ILP);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
  return DAG;
}

static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
                createSIMachineScheduler);

static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                             "Run GCN scheduler to maximize occupancy",
                             createGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
  "Run GCN scheduler to maximize occupancy (experimental)",
  createIterativeGCNMaxOccupancyMachineScheduler);

static MachineSchedRegistry
GCNMinRegSchedRegistry("gcn-minreg",
  "Run GCN iterative scheduler for minimal register usage (experimental)",
  createMinRegScheduler);

static MachineSchedRegistry
GCNILPSchedRegistry("gcn-ilp",
  "Run GCN iterative scheduler for ILP scheduling (experimental)",
  createIterativeILPMachineScheduler);
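
// Each registry entry above can be selected at run time, e.g. with
// -mllvm -misched=gcn-ilp, overriding the default scheduler chosen in
// GCNPassConfig::createMachineScheduler.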

static StringRef computeDataLayout(const Triple &TT) {
  if (TT.getArch() == Triple::r600) {
    // 32-bit pointers.
    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
  }

  // 32-bit private, local, and region pointers. 64-bit global, constant and
  // flat, non-integral buffer fat pointers.
  return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
         "-ni:7";
}

static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
  if (!GPU.empty())
    return GPU;

  // Need to default to a target with flat support for HSA.
  if (TT.getArch() == Triple::amdgcn)
    return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";

  return "r600";
}

static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
  // The AMDGPU toolchain only supports generating shared objects, so we
  // must always use PIC.
  return Reloc::PIC_;
}

AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
                                         StringRef CPU, StringRef FS,
                                         TargetOptions Options,
                                         Optional<Reloc::Model> RM,
                                         Optional<CodeModel::Model> CM,
                                         CodeGenOpt::Level OptLevel)
    : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                        FS, Options, getEffectiveRelocModel(RM),
                        getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
      TLOF(createTLOF(getTargetTriple())) {
  initAsmInfo();
  if (TT.getArch() == Triple::amdgcn) {
    if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
    else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
      MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
  }
}

bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;

AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;

StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
  Attribute GPUAttr = F.getFnAttribute("target-cpu");
  return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}

StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
  Attribute FSAttr = F.getFnAttribute("target-features");

  return FSAttr.isValid() ? FSAttr.getValueAsString()
                          : getTargetFeatureString();
}

/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
  if (const Function *F = dyn_cast<Function>(&GV))
    return F->isDeclaration() || F->getName().startswith("__asan_") ||
           F->getName().startswith("__sanitizer_") ||
           AMDGPU::isEntryFunctionCC(F->getCallingConv());

  GV.removeDeadConstantUsers();
  return !GV.use_empty();
}

void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
  Builder.DivergentTarget = true;

  bool EnableOpt = getOptLevel() > CodeGenOpt::None;
  bool Internalize = InternalizeSymbols;
  bool EarlyInline = EarlyInlineAll && EnableOpt && !EnableFunctionCalls;
  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && EnableOpt;
  bool LibCallSimplify = EnableLibCallSimplify && EnableOpt;
  bool PromoteKernelArguments =
      EnablePromoteKernelArguments && getOptLevel() > CodeGenOpt::Less;

  if (EnableFunctionCalls) {
    delete Builder.Inliner;
    Builder.Inliner = createFunctionInliningPass();
  }

  Builder.addExtension(
    PassManagerBuilder::EP_ModuleOptimizerEarly,
    [Internalize, EarlyInline, AMDGPUAA, this](const PassManagerBuilder &,
                                               legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(createAMDGPUUnifyMetadataPass());
      PM.add(createAMDGPUPrintfRuntimeBinding());
      if (Internalize)
        PM.add(createInternalizePass(mustPreserveGV));
      PM.add(createAMDGPUPropagateAttributesLatePass(this));
      if (Internalize)
        PM.add(createGlobalDCEPass());
      if (EarlyInline)
        PM.add(createAMDGPUAlwaysInlinePass(false));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_EarlyAsPossible,
    [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
                                      legacy::PassManagerBase &PM) {
      if (AMDGPUAA) {
        PM.add(createAMDGPUAAWrapperPass());
        PM.add(createAMDGPUExternalAAWrapperPass());
      }
      PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
      PM.add(llvm::createAMDGPUUseNativeCallsPass());
      if (LibCallSimplify)
        PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this));
  });

  Builder.addExtension(
    PassManagerBuilder::EP_CGSCCOptimizerLate,
    [EnableOpt, PromoteKernelArguments](const PassManagerBuilder &,
                                        legacy::PassManagerBase &PM) {
      // Add promote kernel arguments pass to the opt pipeline right before
      // infer address spaces which is needed to do actual address space
      // inferring.
      if (PromoteKernelArguments)
        PM.add(createAMDGPUPromoteKernelArgumentsPass());

      // Add infer address spaces pass to the opt pipeline after inlining
      // but before SROA to increase SROA opportunities.
      PM.add(createInferAddressSpacesPass());

      // This should run after inlining to have any chance of doing anything,
      // and before other cleanup optimizations.
      PM.add(createAMDGPULowerKernelAttributesPass());

      // Promote alloca to vector before SROA and loop unroll. If we manage
      // to eliminate allocas before unroll we may choose to unroll less.
      if (EnableOpt)
        PM.add(createAMDGPUPromoteAllocaToVector());
  });
}

void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
  AAM.registerFunctionAnalysis<AMDGPUAA>();
}

void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, ModulePassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-propagate-attributes-late") {
          PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
          return true;
        }
        if (PassName == "amdgpu-unify-metadata") {
          PM.addPass(AMDGPUUnifyMetadataPass());
          return true;
        }
        if (PassName == "amdgpu-printf-runtime-binding") {
          PM.addPass(AMDGPUPrintfRuntimeBindingPass());
          return true;
        }
        if (PassName == "amdgpu-always-inline") {
          PM.addPass(AMDGPUAlwaysInlinePass());
          return true;
        }
        if (PassName == "amdgpu-replace-lds-use-with-pointer") {
          PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
          return true;
        }
        if (PassName == "amdgpu-lower-module-lds") {
          PM.addPass(AMDGPULowerModuleLDSPass());
          return true;
        }
        return false;
      });
  PB.registerPipelineParsingCallback(
      [this](StringRef PassName, FunctionPassManager &PM,
             ArrayRef<PassBuilder::PipelineElement>) {
        if (PassName == "amdgpu-simplifylib") {
          PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
          return true;
        }
        if (PassName == "amdgpu-usenative") {
          PM.addPass(AMDGPUUseNativeCallsPass());
          return true;
        }
        if (PassName == "amdgpu-promote-alloca") {
          PM.addPass(AMDGPUPromoteAllocaPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-alloca-to-vector") {
          PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
          return true;
        }
        if (PassName == "amdgpu-lower-kernel-attributes") {
          PM.addPass(AMDGPULowerKernelAttributesPass());
          return true;
        }
        if (PassName == "amdgpu-propagate-attributes-early") {
          PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
          return true;
        }
        if (PassName == "amdgpu-promote-kernel-arguments") {
          PM.addPass(AMDGPUPromoteKernelArgumentsPass());
          return true;
        }
        return false;
      });

  PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &FAM) {
    FAM.registerPass([&] { return AMDGPUAA(); });
  });

  PB.registerParseAACallback([](StringRef AAName, AAManager &AAM) {
    if (AAName == "amdgpu-aa") {
      AAM.registerFunctionAnalysis<AMDGPUAA>();
      return true;
    }
    return false;
  });

  PB.registerPipelineStartEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        FunctionPassManager FPM;
        FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
        FPM.addPass(AMDGPUUseNativeCallsPass());
        if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
          FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
        PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
      });

  PB.registerPipelineEarlySimplificationEPCallback(
      [this](ModulePassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        PM.addPass(AMDGPUUnifyMetadataPass());
        PM.addPass(AMDGPUPrintfRuntimeBindingPass());

        if (InternalizeSymbols) {
          PM.addPass(InternalizePass(mustPreserveGV));
        }
        PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
        if (InternalizeSymbols) {
          PM.addPass(GlobalDCEPass());
        }
        if (EarlyInlineAll && !EnableFunctionCalls)
          PM.addPass(AMDGPUAlwaysInlinePass());
      });

  PB.registerCGSCCOptimizerLateEPCallback(
      [this](CGSCCPassManager &PM, OptimizationLevel Level) {
        if (Level == OptimizationLevel::O0)
          return;

        FunctionPassManager FPM;

        // Add promote kernel arguments pass to the opt pipeline right before
        // infer address spaces which is needed to do actual address space
        // inferring.
        if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
            EnablePromoteKernelArguments)
          FPM.addPass(AMDGPUPromoteKernelArgumentsPass());

        // Add infer address spaces pass to the opt pipeline after inlining
        // but before SROA to increase SROA opportunities.
        FPM.addPass(InferAddressSpacesPass());

        // This should run after inlining to have any chance of doing
        // anything, and before other cleanup optimizations.
        FPM.addPass(AMDGPULowerKernelAttributesPass());

        if (Level != OptimizationLevel::O0) {
          // Promote alloca to vector before SROA and loop unroll. If we
          // manage to eliminate allocas before unroll we may choose to unroll
          // less.
          FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
        }

        PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
      });
}

int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
  return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
          AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
          AddrSpace == AMDGPUAS::REGION_ADDRESS)
             ? -1
             : 0;
}

bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                              unsigned DestAS) const {
  return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
         AMDGPU::isFlatGlobalAddrSpace(DestAS);
}

unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
  const auto *LD = dyn_cast<LoadInst>(V);
  if (!LD)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;

  // It must be a generic pointer loaded.
  assert(V->getType()->isPointerTy() &&
         V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);

  const auto *Ptr = LD->getPointerOperand();
  if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
    return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
  // For a generic pointer loaded from the constant memory, it could be assumed
  // as a global pointer since the constant memory is only populated on the
  // host side. As implied by the offload programming model, only global
  // pointers could be referenced on the host side.
  return AMDGPUAS::GLOBAL_ADDRESS;
}

std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
  if (auto *II = dyn_cast<IntrinsicInst>(V)) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::amdgcn_is_shared:
      return std::make_pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
    case Intrinsic::amdgcn_is_private:
      return std::make_pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
    default:
      break;
    }
    return std::make_pair(nullptr, -1);
  }

  // Check the global pointer predication based on
  // (!is_shared(p) && !is_private(p)). Note that logic 'and' is commutative and
  // the order of 'is_shared' and 'is_private' is not significant.
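  // For example, this matches IR of the form:
  //   %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %p)
  //   %not.shared = xor i1 %is.shared, true
  //   %is.private = call i1 @llvm.amdgcn.is.private(i8* %p)
  //   %not.private = xor i1 %is.private, true
  //   %cond = and i1 %not.shared, %not.private
  // in which case %p can be treated as a global pointer while %cond holds.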
  Value *Ptr;
  if (match(
          const_cast<Value *>(V),
          m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
                  m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
                      m_Deferred(Ptr))))))
    return std::make_pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);

  return std::make_pair(nullptr, -1);
}

unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
  switch (Kind) {
  case PseudoSourceValue::Stack:
  case PseudoSourceValue::FixedStack:
    return AMDGPUAS::PRIVATE_ADDRESS;
  case PseudoSourceValue::ConstantPool:
  case PseudoSourceValue::GOT:
  case PseudoSourceValue::JumpTable:
  case PseudoSourceValue::GlobalValueCallEntry:
  case PseudoSourceValue::ExternalSymbolCallEntry:
  case PseudoSourceValue::TargetCustom:
    return AMDGPUAS::CONSTANT_ADDRESS;
  }
  return AMDGPUAS::FLAT_ADDRESS;
}

//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//

GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
                                   StringRef CPU, StringRef FS,
                                   TargetOptions Options,
                                   Optional<Reloc::Model> RM,
                                   Optional<CodeModel::Model> CM,
                                   CodeGenOpt::Level OL, bool JIT)
    : AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}

const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef GPU = getGPUName(F);
  StringRef FS = getFeatureString(F);

  SmallString<128> SubtargetKey(GPU);
  SubtargetKey.append(FS);

  auto &I = SubtargetMap[SubtargetKey];
  if (!I) {
    // This needs to be done before we create a new subtarget since any
    // creation will depend on the TM and the code generation flags on the
    // function that reside in TargetOptions.
    resetTargetOptions(F);
    I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
  }

  I->setScalarizeGlobalBehavior(ScalarizeGlobal);

  return I.get();
}

TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
  return TargetTransformInfo(GCNTTIImpl(this, F));
}

//===----------------------------------------------------------------------===//
// AMDGPU Pass Setup
//===----------------------------------------------------------------------===//

std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
  return getStandardCSEConfigForOpt(TM->getOptLevel());
}

namespace {

class GCNPassConfig final : public AMDGPUPassConfig {
public:
  GCNPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : AMDGPUPassConfig(TM, PM) {
    // It is necessary to know the register usage of the entire call graph. We
    // allow calls without EnableAMDGPUFunctionCalls if they are marked
    // noinline, so this is always required.
    setRequiresCodeGenSCCOrder(true);
    substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
  }

  GCNTargetMachine &getGCNTargetMachine() const {
    return getTM<GCNTargetMachine>();
  }

  ScheduleDAGInstrs *
  createMachineScheduler(MachineSchedContext *C) const override;

  ScheduleDAGInstrs *
  createPostMachineScheduler(MachineSchedContext *C) const override {
    ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
    const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
    if (ST.shouldClusterStores())
      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
    DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
    DAG->addMutation(createIGroupLPDAGMutation());
    DAG->addMutation(createSchedBarrierDAGMutation());
    if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
      DAG->addMutation(createVOPDPairingMutation());
    return DAG;
  }

  bool addPreISel() override;
  void addMachineSSAOptimization() override;
  bool addILPOpts() override;
  bool addInstSelector() override;
  bool addIRTranslator() override;
  void addPreLegalizeMachineIR() override;
  bool addLegalizeMachineIR() override;
  void addPreRegBankSelect() override;
  bool addRegBankSelect() override;
  void addPreGlobalInstructionSelect() override;
  bool addGlobalInstructionSelect() override;
  void addFastRegAlloc() override;
  void addOptimizedRegAlloc() override;

  FunctionPass *createSGPRAllocPass(bool Optimized);
  FunctionPass *createVGPRAllocPass(bool Optimized);
  FunctionPass *createRegAllocPass(bool Optimized) override;

  bool addRegAssignAndRewriteFast() override;
  bool addRegAssignAndRewriteOptimized() override;

  void addPreRegAlloc() override;
  bool addPreRewrite() override;
  void addPostRegAlloc() override;
  void addPreSched2() override;
  void addPreEmitPass() override;
};

} // end anonymous namespace

AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
    : TargetPassConfig(TM, PM) {
  // Exceptions and StackMaps are not supported, so these passes will never do
  // anything.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  // Garbage collection is not supported.
  disablePass(&GCLoweringID);
  disablePass(&ShadowStackGCLoweringID);
}

void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
  if (getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createGVNPass());
  else
    addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
  addPass(createLICMPass());
  addPass(createSeparateConstOffsetFromGEPPass());
  addPass(createSpeculativeExecutionPass());
  // ReassociateGEPs exposes more opportunities for SLSR. See
  // the example in reassociate-geps-and-slsr.ll.
  addPass(createStraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
  // EarlyCSE can reuse.
  addEarlyCSEOrGVNPass();
  // Run NaryReassociate after EarlyCSE/GVN to be more effective.
  addPass(createNaryReassociatePass());
  // NaryReassociate on GEPs creates redundant common expressions, so run
  // EarlyCSE after it.
  addPass(createEarlyCSEPass());
}

void AMDGPUPassConfig::addIRPasses() {
  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();

  // There is no reason to run these.
  disablePass(&StackMapLivenessID);
  disablePass(&FuncletLayoutID);
  disablePass(&PatchableFunctionID);

  addPass(createAMDGPUPrintfRuntimeBinding());
  addPass(createAMDGPUCtorDtorLoweringPass());

  // A call to propagate attributes pass in the backend in case opt was not run.
  addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));

  addPass(createAMDGPULowerIntrinsicsPass());

  // Function calls are not supported, so make sure we inline everything.
  addPass(createAMDGPUAlwaysInlinePass());
  addPass(createAlwaysInlinerLegacyPass());
  // We need to add the barrier noop pass, otherwise adding the function
  // inlining pass will cause all of the PassConfigs passes to be run
  // one function at a time, which means if we have a module with two
  // functions, then we will generate code for the first function
  // without ever running any passes on the second.
  addPass(createBarrierNoopPass());

  // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
  if (TM.getTargetTriple().getArch() == Triple::r600)
    addPass(createR600OpenCLImageTypeLoweringPass());

  // Replace OpenCL enqueued block function pointers with global variables.
  addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());

  // Can increase LDS used by kernel so runs before PromoteAlloca
  if (EnableLowerModuleLDS) {
    // The pass "amdgpu-replace-lds-use-with-pointer" needs to run before
    // "amdgpu-lower-module-lds", and it is only meaningful when the
    // "amdgpu-lower-module-lds" pass is enabled.
    if (EnableLDSReplaceWithPointer)
      addPass(createAMDGPUReplaceLDSUseWithPointerPass());

    addPass(createAMDGPULowerModuleLDSPass());
  }

  if (TM.getOptLevel() > CodeGenOpt::None)
    addPass(createInferAddressSpacesPass());

  addPass(createAtomicExpandPass());

  if (TM.getOptLevel() > CodeGenOpt::None) {
    addPass(createAMDGPUPromoteAlloca());

    if (EnableSROA)
      addPass(createSROAPass());
    if (isPassEnabled(EnableScalarIRPasses))
      addStraightLineScalarOptimizationPasses();

    if (EnableAMDGPUAliasAnalysis) {
      addPass(createAMDGPUAAWrapperPass());
      addPass(createExternalAAWrapperPass([](Pass &P, Function &,
                                             AAResults &AAR) {
        if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
          AAR.addAAResult(WrapperPass->getResult());
      }));
    }

    if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
      // TODO: May want to move later or split into an early and late one.
      addPass(createAMDGPUCodeGenPreparePass());
    }
  }

  TargetPassConfig::addIRPasses();

  // EarlyCSE is not always strong enough to clean up what LSR produces. For
  // example, GVN can combine
  //
  //   %0 = add %a, %b
  //   %1 = add %b, %a
  //
  // and
  //
  //   %0 = shl nsw %a, 2
  //   %1 = shl %a, 2
  //
  // but EarlyCSE can do neither of them.
  if (isPassEnabled(EnableScalarIRPasses))
    addEarlyCSEOrGVNPass();
}

void AMDGPUPassConfig::addCodeGenPrepare() {
  if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
    addPass(createAMDGPUAttributorPass());

    // FIXME: This pass adds 2 hacky attributes that can be replaced with an
    // analysis, and should be removed.
    addPass(createAMDGPUAnnotateKernelFeaturesPass());
  }

  if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
      EnableLowerKernelArguments)
    addPass(createAMDGPULowerKernelArgumentsPass());

  TargetPassConfig::addCodeGenPrepare();

  if (isPassEnabled(EnableLoadStoreVectorizer))
    addPass(createLoadStoreVectorizerPass());

  // LowerSwitch pass may introduce unreachable blocks that can
  // cause unexpected behavior for subsequent passes. Placing it
  // here seems better, as these blocks will be cleaned up by
  // UnreachableBlockElim inserted next in the pass flow.
  addPass(createLowerSwitchPass());
}

bool AMDGPUPassConfig::addPreISel() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createFlattenCFGPass());
  return false;
}

bool AMDGPUPassConfig::addInstSelector() {
  addPass(createAMDGPUISelDag(&getAMDGPUTargetMachine(), getOptLevel()));
  return false;
}

bool AMDGPUPassConfig::addGCPasses() {
  // Do nothing. GC is not supported.
  return false;
}

llvm::ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}

//===----------------------------------------------------------------------===//
// GCN Pass Setup
//===----------------------------------------------------------------------===//

ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
  MachineSchedContext *C) const {
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.enableSIScheduler())
    return createSIMachineScheduler(C);
  return createGCNMaxOccupancyMachineScheduler(C);
}

bool GCNPassConfig::addPreISel() {
  AMDGPUPassConfig::addPreISel();

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createAMDGPULateCodeGenPreparePass());

  if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
    addPass(createAMDGPUAtomicOptimizerPass());
  }

  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSinkingPass());

  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
  // regions formed by them.
  addPass(&AMDGPUUnifyDivergentExitNodesID);
  if (!LateCFGStructurize) {
    if (EnableStructurizerWorkarounds) {
      addPass(createFixIrreduciblePass());
      addPass(createUnifyLoopExitsPass());
    }
    addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
  }
  addPass(createAMDGPUAnnotateUniformValues());
  if (!LateCFGStructurize) {
    addPass(createSIAnnotateControlFlowPass());
  }
  addPass(createLCSSAPass());

  if (TM->getOptLevel() > CodeGenOpt::Less)
    addPass(&AMDGPUPerfHintAnalysisID);

  return false;
}

void GCNPassConfig::addMachineSSAOptimization() {
  TargetPassConfig::addMachineSSAOptimization();

  // We want to fold operands after PeepholeOptimizer has run (or as part of
  // it), because it will eliminate extra copies making it easier to fold the
  // real source operand. We want to eliminate dead instructions after, so that
  // we see fewer uses of the copies. We then need to clean up the dead
  // instructions leftover after the operands are folded as well.
  //
  // XXX - Can we get away without running DeadMachineInstructionElim again?
  addPass(&SIFoldOperandsID);
  if (EnableDPPCombine)
    addPass(&GCNDPPCombineID);
  addPass(&SILoadStoreOptimizerID);
  if (isPassEnabled(EnableSDWAPeephole)) {
    addPass(&SIPeepholeSDWAID);
    addPass(&EarlyMachineLICMID);
    addPass(&MachineCSEID);
    addPass(&SIFoldOperandsID);
  }
  addPass(&DeadMachineInstructionElimID);
  addPass(createSIShrinkInstructionsPass());
}

bool GCNPassConfig::addILPOpts() {
  if (EnableEarlyIfConversion)
    addPass(&EarlyIfConverterID);

  TargetPassConfig::addILPOpts();
  return false;
}

bool GCNPassConfig::addInstSelector() {
  AMDGPUPassConfig::addInstSelector();
  addPass(&SIFixSGPRCopiesID);
  addPass(createSILowerI1CopiesPass());
  return false;
}

bool GCNPassConfig::addIRTranslator() {
  addPass(new IRTranslator(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreLegalizeMachineIR() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
  addPass(new Localizer());
}

bool GCNPassConfig::addLegalizeMachineIR() {
  addPass(new Legalizer());
  return false;
}

void GCNPassConfig::addPreRegBankSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
}

bool GCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  return false;
}

void GCNPassConfig::addPreGlobalInstructionSelect() {
  bool IsOptNone = getOptLevel() == CodeGenOpt::None;
  addPass(createAMDGPURegBankCombiner(IsOptNone));
}

bool GCNPassConfig::addGlobalInstructionSelect() {
  addPass(new InstructionSelect(getOptLevel()));
  return false;
}

void GCNPassConfig::addPreRegAlloc() {
  if (LateCFGStructurize) {
    addPass(createAMDGPUMachineCFGStructurizerPass());
  }
}

void GCNPassConfig::addFastRegAlloc() {
  // FIXME: We have to disable the verifier here because of PHIElimination +
  // TwoAddressInstructions disabling it.

  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
  insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);

  TargetPassConfig::addFastRegAlloc();
}
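
// Note: TargetPassConfig::insertPass(X, Y) schedules pass Y to run immediately
// after the existing pass X, so the ordering constraints above and below are
// expressed relative to target-independent passes rather than by absolute
// position in the pipeline.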

void GCNPassConfig::addOptimizedRegAlloc() {
  // Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
  // instructions that cause scheduling barriers.
  insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
  insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);

  if (OptExecMaskPreRA)
    insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);

  if (isPassEnabled(EnablePreRAOptimizations))
    insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);

  // This is not an essential optimization and it has a noticeable impact on
  // compilation time, so we only enable it from O2.
  if (TM->getOptLevel() > CodeGenOpt::Less)
    insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);

  // FIXME: when an instruction has a Killed operand and is inside a bundle,
  // it seems only the BUNDLE instruction appears as the Kill of the register
  // in LiveVariables; this triggers a verifier failure. We should fix it and
  // enable the verifier.
  if (OptVGPRLiveRange)
    insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeID);
  // This must be run immediately after phi elimination and before
  // TwoAddressInstructions, otherwise the processing of the tied operand of
  // SI_ELSE will introduce a copy of the tied operand source after the else.
  insertPass(&PHIEliminationID, &SILowerControlFlowID);

  if (EnableDCEInRA)
    insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);

  TargetPassConfig::addOptimizedRegAlloc();
}

bool GCNPassConfig::addPreRewrite() {
  if (EnableRegReassign)
    addPass(&GCNNSAReassignID);
  return true;
}

FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
                  initializeDefaultSGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyRegisterAllocator(onlyAllocateSGPRs);

  return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}

FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
  // Initialize the global default.
  llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
                  initializeDefaultVGPRRegisterAllocatorOnce);

  RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
  if (Ctor != useDefaultRegisterAllocator)
    return Ctor();

  if (Optimized)
    return createGreedyVGPRRegisterAllocator();

  return createFastVGPRRegisterAllocator();
}

FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
  llvm_unreachable("should not be used");
}

static const char RegAllocOptNotSupportedMessage[] =
    "-regalloc not supported with amdgcn. Use -sgpr-regalloc and -vgpr-regalloc";

bool GCNPassConfig::addRegAssignAndRewriteFast() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(false));
  return true;
}

bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
  if (!usingDefaultRegAlloc())
    report_fatal_error(RegAllocOptNotSupportedMessage);

  addPass(createSGPRAllocPass(true));

  // Commit allocated register changes. This is mostly necessary because too
  // many things rely on the use lists of the physical registers, such as the
  // verifier. This is only necessary with allocators which use LiveIntervals,
  // since FastRegAlloc does the replacements itself.
  addPass(createVirtRegRewriter(false));

  // Equivalent of PEI for SGPRs.
  addPass(&SILowerSGPRSpillsID);

  addPass(createVGPRAllocPass(true));

  addPreRewrite();
  addPass(&VirtRegRewriterID);

  return true;
}

void GCNPassConfig::addPostRegAlloc() {
  addPass(&SIFixVGPRCopiesID);
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIOptimizeExecMaskingID);
  TargetPassConfig::addPostRegAlloc();
}

void GCNPassConfig::addPreSched2() {
  if (TM->getOptLevel() > CodeGenOpt::None)
    addPass(createSIShrinkInstructionsPass());
  addPass(&SIPostRABundlerID);
}

void GCNPassConfig::addPreEmitPass() {
  if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
    addPass(&GCNCreateVOPDID);
  addPass(createSIMemoryLegalizerPass());
  addPass(createSIInsertWaitcntsPass());

  addPass(createSIModeRegisterPass());

  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIInsertHardClausesID);

  addPass(&SILateBranchLoweringPassID);
  if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
    addPass(createAMDGPUSetWavePriorityPass());
  if (getOptLevel() > CodeGenOpt::None)
    addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
  // guarantee to be able to handle all hazards correctly. This is because if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
  //
  // Here we add a stand-alone hazard recognizer pass which can handle all
  // cases.
  addPass(&PostRAHazardRecognizerID);

  if (getOptLevel() > CodeGenOpt::Less)
    addPass(&AMDGPUReleaseVGPRsID);

  if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
    addPass(&AMDGPUInsertDelayAluID);

  addPass(&BranchRelaxationPassID);
}

TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
  return new GCNPassConfig(*this, PM);
}

yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
  return new yaml::SIMachineFunctionInfo();
}

yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return new yaml::SIMachineFunctionInfo(
      *MFI, *MF.getSubtarget().getRegisterInfo(), MF);
}

bool GCNTargetMachine::parseMachineFunctionInfo(
    const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
    SMDiagnostic &Error, SMRange &SourceRange) const {
  const yaml::SIMachineFunctionInfo &YamlMFI =
      static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
  MachineFunction &MF = PFS.MF;
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
    return true;

  if (MFI->Occupancy == 0) {
    // Fixup the subtarget dependent default value.
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
  }

  auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
    Register TempReg;
    if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
      SourceRange = RegName.SourceRange;
      return true;
    }
    RegVal = TempReg;

    return false;
  };

  auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
                                   Register &RegVal) {
    return !RegName.Value.empty() && parseRegister(RegName, RegVal);
  };

  if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
    return true;

  auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
    const MemoryBuffer &Buffer =
        *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
    Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
                         RegName.Value.size(), SourceMgr::DK_Error,
                         "incorrect register class for field", RegName.Value,
                         None, None);
    SourceRange = RegName.SourceRange;
    return true;
  };

  if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
      parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
      parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
    return true;

  if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
      !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
    return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
  }

  if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
  }

  if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
      !AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
    return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
  }

  for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
    Register ParsedReg;
    if (parseRegister(YamlReg, ParsedReg))
      return true;

    MFI->reserveWWMRegister(ParsedReg);
  }

  auto parseAndCheckArgument = [&](const Optional<yaml::SIArgument> &A,
                                   const TargetRegisterClass &RC,
                                   ArgDescriptor &Arg, unsigned UserSGPRs,
                                   unsigned SystemSGPRs) {
    // Skip parsing if it's not present.
    if (!A)
      return false;

    if (A->IsRegister) {
      Register Reg;
      if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
        SourceRange = A->RegisterName.SourceRange;
        return true;
      }
      if (!RC.contains(Reg))
        return diagnoseRegisterClass(A->RegisterName);
      Arg = ArgDescriptor::createRegister(Reg);
    } else
      Arg = ArgDescriptor::createStack(A->StackOffset);
    // Check and apply the optional mask.
    if (A->Mask)
      Arg = ArgDescriptor::createArg(Arg, *A->Mask);

    MFI->NumUserSGPRs += UserSGPRs;
    MFI->NumSystemSGPRs += SystemSGPRs;
    return false;
  };

  if (YamlMFI.ArgInfo &&
      (parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
                             AMDGPU::SGPR_128RegClass,
                             MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.QueuePtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
                             AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
                             2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.FlatScratchInit, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
                             AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
                             0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
                             AMDGPU::SGPR_32RegClass,
                             MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
                             AMDGPU::SReg_64RegClass,
                             MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDX, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDY, 0, 0) ||
       parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
                             AMDGPU::VGPR_32RegClass,
                             MFI->ArgInfo.WorkItemIDZ, 0, 0)))
    return true;

  MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
  MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
  MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
  MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
  MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
  MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;

  return false;
}