1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
12 //===----------------------------------------------------------------------===//
14 #include "AMDGPUSubtarget.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUInstructionSelector.h"
19 #include "AMDGPULegalizerInfo.h"
20 #include "AMDGPURegisterBankInfo.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/MachineScheduler.h"
25 #include "llvm/MC/MCSubtargetInfo.h"
26 #include "llvm/IR/MDBuilder.h"
27 #include "llvm/CodeGen/TargetFrameLowering.h"
32 #define DEBUG_TYPE "amdgpu-subtarget"
// Pull in the TableGen-generated subtarget description and constructor for
// the GCN targets. The temporary '#define AMDGPUSubtarget GCNSubtarget'
// renames the generated AMDGPUSubtarget pieces onto GCNSubtarget.
34 #define GET_SUBTARGETINFO_TARGET_DESC
35 #define GET_SUBTARGETINFO_CTOR
36 #define AMDGPUSubtarget GCNSubtarget
37 #include "AMDGPUGenSubtargetInfo.inc"
// Request the same generated sections for R600; the rename is undone first so
// R600Subtarget keeps its own generated pieces.
38 #define GET_SUBTARGETINFO_TARGET_DESC
39 #define GET_SUBTARGETINFO_CTOR
40 #undef AMDGPUSubtarget
41 #include "R600GenSubtargetInfo.inc"
// Debug knob for FillMFMAShadowMutation below: when set, no artificial SALU
// dependencies are added to cover MFMA latency.
43 static cl::opt<bool> DisablePowerSched(
44 "amdgpu-disable-power-sched",
45 cl::desc("Disable scheduling to minimize mAI power bursts"),
// Opt-in switch consulted by GCNSubtarget::useVGPRIndexMode(); defaults are
// on elided lines — presumably cl::init(false); verify against full file.
48 static cl::opt<bool> EnableVGPRIndexMode(
49 "amdgpu-vgpr-index-mode",
50 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
53 GCNSubtarget::~GCNSubtarget() = default;
// Parse the feature string for an R600 subtarget and resolve feature
// interdependencies. "+promote-alloca" is prepended so it is on by default
// while still overridable by a '-promote-alloca' later in FS.
56 R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
57 StringRef GPU, StringRef FS) {
58 SmallString<256> FullFS("+promote-alloca,");
60 ParseSubtargetFeatures(GPU, FullFS);
// 24-bit multiplies: unsigned variant exists from EVERGREEN on; the signed
// variant is tied to the Cayman ISA.
62 HasMulU24 = getGeneration() >= EVERGREEN;
63 HasMulI24 = hasCaymanISA();
// Parse the feature string for a GCN subtarget and fix up features whose
// defaults depend on the OS, generation, or other features. Several features
// are prepended as "on by default" (rather than made per-target subtarget
// features) so that a user '-feature' in FS can still disable them without
// clobbering everything else.
69 GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
70 StringRef GPU, StringRef FS) {
71 // Determine default and user-specified characteristics
73 // We want to be able to turn these off, but making this a subtarget feature
74 // for SI has the unhelpful behavior that it unsets everything else if you
77 // Similarly we want enable-prt-strict-null to be on by default and not to
78 // unset everything else if it is disabled
80 // Assuming ECC is enabled is the conservative default.
81 SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");
83 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
84 FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
86 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
// Only one wavefront size may be active; turn the others off explicitly so a
// user-specified "+wavefrontsizeN" wins over any target default.
88 // Disable mutually exclusive bits.
89 if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
90 if (FS.find_lower("wavefrontsize16") == StringRef::npos)
91 FullFS += "-wavefrontsize16,";
92 if (FS.find_lower("wavefrontsize32") == StringRef::npos)
93 FullFS += "-wavefrontsize32,";
94 if (FS.find_lower("wavefrontsize64") == StringRef::npos)
95 FullFS += "-wavefrontsize64,";
100 ParseSubtargetFeatures(GPU, FullFS);
102 // We don't support FP64 for EG/NI atm.
103 assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
105 // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
106 // on VI and newer hardware to avoid assertion failures due to missing ADDR64
107 // variants of MUBUF instructions.
108 if (!hasAddr64() && !FS.contains("flat-for-global")) {
109 FlatForGlobal = true;
112 // Set defaults if needed.
113 if (MaxPrivateElementSize == 0)
114 MaxPrivateElementSize = 4;
116 if (LDSBankCount == 0)
119 if (TT.getArch() == Triple::amdgcn) {
// Default LDS size for amdgcn when the CPU model did not set one: 32 KiB.
120 if (LocalMemorySize == 0)
121 LocalMemorySize = 32768;
123 // Do something sensible for unspecified target.
124 if (!HasMovrel && !HasVGPRIndexMode)
128 // Don't crash on invalid devices.
129 if (WavefrontSizeLog2 == 0)
// Fall back to a wave size of 2^5 = 64 for unknown devices.
130 WavefrontSizeLog2 = 5;
132 HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
134 // Disable XNACK on targets where it is not enabled by default unless it is
135 // explicitly requested.
136 if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
137 ToggleFeature(AMDGPU::FeatureXNACK);
141 // ECC is on by default, but turn it off if the hardware doesn't support it
142 // anyway. This matters for the gfx9 targets with d16 loads, but don't support
144 if (DoesNotSupportSRAMECC && EnableSRAMECC) {
145 ToggleFeature(AMDGPU::FeatureSRAMECC);
146 EnableSRAMECC = false;
// Common-base constructor: initializes the feature flags shared between the
// R600 and GCN subtargets to conservative defaults; the derived-class feature
// parsing overwrites them afterwards.
152 AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
154 Has16BitInsts(false),
155 HasMadMixInsts(false),
156 HasMadMacF32Insts(false),
157 HasDsSrc2Insts(false),
159 HasVOP3PInsts(false),
162 HasInv2PiInlineImm(false),
// Legacy fmin/fmax semantics assumed until feature parsing says otherwise.
163 HasFminFmaxLegacy(true),
164 EnablePromoteAlloca(false),
165 HasTrigReducedRange(false),
// GCN subtarget constructor. All feature flags start false/zero; they are set
// by initializeSubtargetDependencies() (invoked from InstrInfo's initializer
// below, so feature parsing completes before the body runs). The body then
// builds the GlobalISel pieces (call lowering, legalizer, register banks,
// instruction selector).
171 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
172 const GCNTargetMachine &TM) :
173 AMDGPUGenSubtargetInfo(TT, GPU, FS),
// HSA OSes imply at least SEA_ISLANDS hardware; otherwise assume the oldest
// supported generation until feature parsing refines it.
176 Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
177 InstrItins(getInstrItineraryForCPU(GPU)),
179 MaxPrivateElementSize(0),
182 FastDenormalF32(false),
183 HalfRate64Ops(false),
185 FlatForGlobal(false),
186 AutoWaitcntBeforeBarrier(false),
188 UnalignedScratchAccess(false),
189 UnalignedBufferAccess(false),
191 HasApertureRegs(false),
193 DoesNotSupportXNACK(false),
197 EnableLoadStoreOpt(false),
198 EnableUnsafeDSOffsetFolding(false),
199 EnableSIScheduler(false),
201 EnablePRTStrictNull(false),
211 GFX7GFX8GFX9Insts(false),
213 HasSMemRealTime(false),
215 HasFmaMixInsts(false),
217 HasVGPRIndexMode(false),
218 HasScalarStores(false),
219 HasScalarAtomics(false),
221 HasSDWAScalar(false),
224 HasSDWAOutModsVOPC(false),
230 HasNSAEncoding(false),
231 GFX10_BEncoding(false),
240 HasPkFmacF16Inst(false),
241 HasAtomicFaddInsts(false),
242 EnableSRAMECC(false),
243 DoesNotSupportSRAMECC(false),
244 HasNoSdstCMPX(false),
246 HasGetWaveIdInst(false),
247 HasSMemTimeInst(false),
248 HasRegisterBanking(false),
249 HasVOP3Literal(false),
250 HasNoDataDepHazard(false),
251 FlatAddressSpace(false),
252 FlatInstOffsets(false),
253 FlatGlobalInsts(false),
254 FlatScratchInsts(false),
255 ScalarFlatScratchInsts(false),
256 AddNoCarryInsts(false),
257 HasUnpackedD16VMem(false),
258 LDSMisalignedBug(false),
259 HasMFMAInlineLiteralBug(false),
261 ScalarizeGlobal(false),
263 HasVcmpxPermlaneHazard(false),
264 HasVMEMtoScalarWriteHazard(false),
265 HasSMEMtoVectorWriteHazard(false),
266 HasInstFwdPrefetchBug(false),
267 HasVcmpxExecWARHazard(false),
268 HasLdsBranchVmemWARHazard(false),
269 HasNSAtoVMEMBug(false),
270 HasOffset3fBug(false),
271 HasFlatSegmentOffsetBug(false),
273 FeatureDisable(false),
// Side-effecting initializer: runs feature parsing before later members and
// the constructor body observe the subtarget state.
274 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
276 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
277 MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
278 CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
279 InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
280 Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
281 RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
282 InstSelector.reset(new AMDGPUInstructionSelector(
283 *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
// Return how many constant-bus (SGPR/literal) operands Opcode may read.
// Pre-GFX10 the limit is uniform; on GFX10 the 64-bit shift opcodes listed
// below are special-cased (return values are on elided lines).
286 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
287 if (getGeneration() < GFX10)
291 case AMDGPU::V_LSHLREV_B64:
292 case AMDGPU::V_LSHLREV_B64_gfx10:
293 case AMDGPU::V_LSHL_B64:
294 case AMDGPU::V_LSHRREV_B64:
295 case AMDGPU::V_LSHRREV_B64_gfx10:
296 case AMDGPU::V_LSHR_B64:
297 case AMDGPU::V_ASHRREV_I64:
298 case AMDGPU::V_ASHRREV_I64_gfx10:
299 case AMDGPU::V_ASHR_I64:
// Return the maximum LDS bytes a workgroup of F may use while still allowing
// NWaves waves to be resident, scaling total LDS by waves-per-EU over the
// number of workgroups per CU.
306 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
307 const Function &F) const {
// Early-out (guard condition is on an elided line) returns all of LDS.
309 return getLocalMemorySize();
310 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
311 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
// Avoid dividing by zero when no workgroup fits.
312 if (!WorkGroupsPerCu)
314 unsigned MaxWaves = getMaxWavesPerEU();
315 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
318 // FIXME: Should return min,max range.
// Compute the achievable occupancy (waves per EU) for function F given that
// each workgroup uses Bytes of LDS.
319 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
320 const Function &F) const {
321 const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
322 const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
323 if (!MaxWorkGroupsPerCu)
326 const unsigned WaveSize = getWavefrontSize();
328 // FIXME: Do we need to account for alignment requirement of LDS rounding the
330 // Compute restriction based on LDS usage
// Guard against division by zero when Bytes is 0 (no LDS used).
331 unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
333 // This can be queried with more LDS than is possible, so just assume the
338 NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
340 // Round to the number of waves.
341 const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
342 unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
344 // Clamp to the maximum possible number of waves.
345 MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
347 // FIXME: Needs to be a multiple of the group size?
348 //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
350 assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
351 "computed invalid occupancy");
// Convenience overload: query occupancy using the LDS size recorded in the
// machine function's SIMachineFunctionInfo.
356 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
357 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
358 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
// Default {min, max} flat workgroup size for a calling convention: graphics
// shader stages default to a single wavefront; everything else may use the
// subtarget's full maximum.
361 std::pair<unsigned, unsigned>
362 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
364 case CallingConv::AMDGPU_VS:
365 case CallingConv::AMDGPU_LS:
366 case CallingConv::AMDGPU_HS:
367 case CallingConv::AMDGPU_ES:
368 case CallingConv::AMDGPU_GS:
369 case CallingConv::AMDGPU_PS:
370 return std::make_pair(1, getWavefrontSize());
372 return std::make_pair(1u, getMaxFlatWorkGroupSize());
// Return the {min, max} flat workgroup size for F: the calling-convention
// default, possibly overridden by the "amdgpu-flat-work-group-size" function
// attribute, validated against the subtarget limits (fallback returns are on
// elided lines).
376 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
377 const Function &F) const {
378 // Default minimum/maximum flat work group sizes.
379 std::pair<unsigned, unsigned> Default =
380 getDefaultFlatWorkGroupSize(F.getCallingConv());
382 // Requested minimum/maximum flat work group sizes.
383 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
384 F, "amdgpu-flat-work-group-size", Default);
386 // Make sure requested minimum is less than requested maximum.
387 if (Requested.first > Requested.second)
390 // Make sure requested values do not violate subtarget's specifications.
391 if (Requested.first < getMinFlatWorkGroupSize())
393 if (Requested.second > getMaxFlatWorkGroupSize())
// Return the {min, max} waves-per-EU for F. The default minimum is raised to
// what the (possibly attribute-requested) flat workgroup size implies; an
// "amdgpu-waves-per-eu" attribute may then narrow it further, subject to
// validation (rejection paths are on elided lines).
399 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
400 const Function &F) const {
401 // Default minimum/maximum number of waves per execution unit.
402 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
404 // Default/requested minimum/maximum flat work group sizes.
405 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
407 // If minimum/maximum flat work group sizes were explicitly requested using
408 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
409 // number of waves per execution unit to values implied by requested
410 // minimum/maximum flat work group sizes.
411 unsigned MinImpliedByFlatWorkGroupSize =
412 getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
413 Default.first = MinImpliedByFlatWorkGroupSize;
414 bool RequestedFlatWorkGroupSize =
415 F.hasFnAttribute("amdgpu-flat-work-group-size");
417 // Requested minimum/maximum number of waves per execution unit.
418 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
419 F, "amdgpu-waves-per-eu", Default, true);
421 // Make sure requested minimum is less than requested maximum.
// A zero maximum means "unbounded", so only check ordering when it is set.
422 if (Requested.second && Requested.first > Requested.second)
425 // Make sure requested values do not violate subtarget's specifications.
426 if (Requested.first < getMinWavesPerEU() ||
427 Requested.second > getMaxWavesPerEU())
430 // Make sure requested values are compatible with values implied by requested
431 // minimum/maximum flat work group sizes.
432 if (RequestedFlatWorkGroupSize &&
433 Requested.first < MinImpliedByFlatWorkGroupSize)
// Attach !range metadata to a local-ID / local-size intrinsic call based on
// the kernel's flat workgroup size (and reqd_work_group_size metadata when
// present). Returns whether metadata was added (return statements are on
// elided lines).
439 bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
440 Function *Kernel = I->getParent()->getParent();
441 unsigned MinSize = 0;
442 unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
// IdQuery distinguishes workitem-ID queries (range [0, size)) from size
// queries (range extends one past the size).
443 bool IdQuery = false;
445 // If reqd_work_group_size is present it narrows value down.
446 if (auto *CI = dyn_cast<CallInst>(I)) {
447 const Function *F = CI->getCalledFunction();
// Map the intrinsic to a dimension index (x=0, y=1, z=2); case bodies that
// set Dim/IdQuery are on elided lines.
449 unsigned Dim = UINT_MAX;
450 switch (F->getIntrinsicID()) {
451 case Intrinsic::amdgcn_workitem_id_x:
452 case Intrinsic::r600_read_tidig_x:
455 case Intrinsic::r600_read_local_size_x:
458 case Intrinsic::amdgcn_workitem_id_y:
459 case Intrinsic::r600_read_tidig_y:
462 case Intrinsic::r600_read_local_size_y:
465 case Intrinsic::amdgcn_workitem_id_z:
466 case Intrinsic::r600_read_tidig_z:
469 case Intrinsic::r600_read_local_size_z:
// reqd_work_group_size pins the size exactly for the queried dimension.
476 if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
477 if (Node->getNumOperands() == 3)
478 MinSize = MaxSize = mdconst::extract<ConstantInt>(
479 Node->getOperand(Dim))->getZExtValue();
487 // Range metadata is [Lo, Hi). For ID query we need to pass max size
488 // as Hi. For size query we need to pass Hi + 1.
494 MDBuilder MDB(I->getContext());
495 MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
497 I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
// Compute the total byte size of F's explicit kernel arguments, laying each
// argument out at its ABI alignment. MaxAlign is updated (out-parameter) to
// the largest argument alignment seen.
501 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
502 Align &MaxAlign) const {
// Only kernel calling conventions have an explicit kernarg segment.
503 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
504 F.getCallingConv() == CallingConv::SPIR_KERNEL);
506 const DataLayout &DL = F.getParent()->getDataLayout();
507 uint64_t ExplicitArgBytes = 0;
510 for (const Argument &Arg : F.args()) {
511 Type *ArgTy = Arg.getType();
513 const Align Alignment = DL.getABITypeAlign(ArgTy);
514 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
// Pad up to this argument's alignment, then place it.
515 ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
516 MaxAlign = std::max(MaxAlign, Alignment);
519 return ExplicitArgBytes;
// Compute the total kernarg segment size for F: explicit args at their base
// offset, plus any implicit arguments appended at the implicit-arg alignment.
522 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
523 Align &MaxAlign) const {
524 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
526 unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
528 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
529 unsigned ImplicitBytes = getImplicitArgNumBytes(F);
530 if (ImplicitBytes != 0) {
531 const Align Alignment = getAlignmentForImplicitArgPtr();
// NOTE(review): implicit args are appended after ExplicitArgBytes, not after
// TotalSize (i.e. ExplicitOffset is dropped here) — confirm intended.
532 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
535 // Being able to dereference past the end is useful for emitting scalar loads.
536 return alignTo(TotalSize, 4);
// R600 subtarget constructor. initializeSubtargetDependencies() runs inside
// TLInfo's initializer, so feature parsing completes before the lowering info
// is constructed.
539 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
540 const TargetMachine &TM) :
541 R600GenSubtargetInfo(TT, GPU, FS),
544 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
548 HasVertexCache(false),
553 TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
554 InstrItins(getInstrItineraryForCPU(GPU)) { }
// Tune the generic MachineScheduler policy for GCN: always track register
// pressure, schedule bidirectionally, and track lane masks except under the
// SI scheduler.
556 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
557 unsigned NumRegionInstrs) const {
558 // Track register pressure so the scheduler can try to decrease
559 // pressure once register usage is above the threshold defined by
560 // SIRegisterInfo::getRegPressureSetLimit()
561 Policy.ShouldTrackPressure = true;
563 // Enabling both top down and bottom up scheduling seems to give us less
564 // register spills than just using one of these approaches on its own.
565 Policy.OnlyTopDown = false;
566 Policy.OnlyBottomUp = false;
568 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
569 if (!enableSIScheduler())
570 Policy.ShouldTrackLaneMasks = true;
// True when V_MAD_F16 has a real MC encoding on this subtarget (-1 means the
// pseudo has no mapping for this target).
573 bool GCNSubtarget::hasMadF16() const {
574 return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
// Use VGPR indexing mode when movrel is unavailable, or when the user opted
// in via -amdgpu-vgpr-index-mode and the hardware supports it.
577 bool GCNSubtarget::useVGPRIndexMode() const {
578 return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
// Occupancy (waves per EU) achievable with SGPRs SGPR registers in use.
// GFX10+ has no SGPR-based occupancy limit; older generations use tiered
// thresholds (bodies on elided lines).
581 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
582 if (getGeneration() >= AMDGPUSubtarget::GFX10)
583 return getMaxWavesPerEU();
585 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// Occupancy (waves per EU) achievable with VGPRs registers in use, after
// rounding the request up to the allocation granule.
607 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
608 unsigned MaxWaves = getMaxWavesPerEU();
609 unsigned Granule = getVGPRAllocGranule();
612 unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
// At least one wave, at most the hardware maximum.
613 return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
// Number of SGPRs reserved for VCC / FLAT_SCRATCH / XNACK, which varies by
// generation and by whether the function initializes flat scratch.
616 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
617 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
618 if (getGeneration() >= AMDGPUSubtarget::GFX10)
619 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
621 if (MFI.hasFlatScratchInit()) {
622 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
623 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
624 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
625 return 4; // FLAT_SCRATCH, VCC (in that order).
628 if (isXNACKEnabled())
629 return 4; // XNACK, VCC (in that order).
// Combined occupancy estimate: the minimum of the LDS-, SGPR-, and
// VGPR-limited occupancies (guards on the register counts are on elided
// lines).
633 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
635 unsigned NumVGPRs) const {
637 std::min(getMaxWavesPerEU(),
638 getOccupancyWithLocalMemSize(LDSSize, F));
640 Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
642 Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
// Maximum SGPRs the function may allocate: derived from the waves-per-EU
// target, optionally overridden by the "amdgpu-num-sgpr" attribute after
// validation, then reduced by reserved SGPRs and capped at the addressable
// limit.
646 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
647 const Function &F = MF.getFunction();
648 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
650 // Compute maximum number of SGPRs function can use using default/requested
651 // minimum number of waves per execution unit.
652 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
653 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
654 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
656 // Check if maximum number of SGPRs was explicitly requested using
657 // "amdgpu-num-sgpr" attribute.
658 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
659 unsigned Requested = AMDGPU::getIntegerAttribute(
660 F, "amdgpu-num-sgpr", MaxNumSGPRs);
662 // Make sure requested value does not violate subtarget's specifications.
// A request that doesn't even cover the reserved SGPRs is rejected
// (rejection action is on an elided line).
663 if (Requested && (Requested <= getReservedNumSGPRs(MF)))
666 // If more SGPRs are required to support the input user/system SGPRs,
667 // increase to accommodate them.
669 // FIXME: This really ends up using the requested number of SGPRs + number
670 // of reserved special registers in total. Theoretically you could re-use
671 // the last input registers for these special registers, but this would
672 // require a lot of complexity to deal with the weird aliasing.
673 unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
674 if (Requested && Requested < InputNumSGPRs)
675 Requested = InputNumSGPRs;
677 // Make sure requested value is compatible with values implied by
678 // default/requested minimum/maximum number of waves per execution unit.
679 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
681 if (WavesPerEU.second &&
682 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
// A validated, non-zero request overrides the computed maximum.
686 MaxNumSGPRs = Requested;
// Hardware bug workaround: clamp to a fixed SGPR count on affected targets.
689 if (hasSGPRInitBug())
690 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
692 return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
693 MaxAddressableNumSGPRs);
// Maximum VGPRs the function may allocate: derived from the waves-per-EU
// target, optionally overridden by a validated "amdgpu-num-vgpr" attribute.
696 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
697 const Function &F = MF.getFunction();
698 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
700 // Compute maximum number of VGPRs function can use using default/requested
701 // minimum number of waves per execution unit.
702 std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
703 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
705 // Check if maximum number of VGPRs was explicitly requested using
706 // "amdgpu-num-vgpr" attribute.
707 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
708 unsigned Requested = AMDGPU::getIntegerAttribute(
709 F, "amdgpu-num-vgpr", MaxNumVGPRs);
711 // Make sure requested value is compatible with values implied by
712 // default/requested minimum/maximum number of waves per execution unit.
// Rejection actions for out-of-range requests are on elided lines.
713 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
715 if (WavesPerEU.second &&
716 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
// A validated, non-zero request overrides the computed maximum.
720 MaxNumVGPRs = Requested;
// Refine the latency of a data dependency when either endpoint is a bundle:
// find the instruction inside the bundle that actually defines (or reads) the
// dependent register and use its latency instead of the bundle's.
726 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
727 int UseOpIdx, SDep &Dep) const {
// Only physical-register data dependencies between real instructions are
// adjusted.
728 if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
729 !Def->isInstr() || !Use->isInstr())
732 MachineInstr *DefI = Def->getInstr();
733 MachineInstr *UseI = Use->getInstr();
735 if (DefI->isBundle()) {
736 const SIRegisterInfo *TRI = getRegisterInfo();
737 auto Reg = Dep.getReg();
738 MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
739 MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
// Walk the bundled instructions looking for the defining instruction of Reg.
741 for (++I; I != E && I->isBundledWithPred(); ++I) {
742 if (I->modifiesRegister(Reg, TRI))
743 Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
748 } else if (UseI->isBundle()) {
749 const SIRegisterInfo *TRI = getRegisterInfo();
750 auto Reg = Dep.getReg();
751 MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
752 MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
// Start from the def's own latency; scan the use bundle for the reader.
753 unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
754 for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
755 if (I->readsRegister(Reg, TRI))
// Post-RA DAG mutation that pulls independent SALU instructions into the
// latency shadow of long-latency MFMA instructions, so the shadow is filled
// with scalar work rather than VALU work (avoiding power bursts; see the
// -amdgpu-disable-power-sched flag above).
764 struct FillMFMAShadowMutation : ScheduleDAGMutation {
765 const SIInstrInfo *TII;
769 FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
// SALU candidate: real SALU instruction that is not a terminator.
771 bool isSALU(const SUnit *SU) const {
772 const MachineInstr *MI = SU->getInstr();
773 return MI && TII->isSALU(*MI) && !MI->isTerminator();
776 bool isVALU(const SUnit *SU) const {
777 const MachineInstr *MI = SU->getInstr();
778 return MI && TII->isVALU(*MI);
// Conservatively check whether a Pred->Succ artificial edge can be added
// without creating a cycle, by intersecting Succ's successor closure with
// Pred's predecessor closure.
781 bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
// Node numbering gives a cheap early answer (result on elided line).
782 if (Pred->NodeNum < Succ->NodeNum)
785 SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
// Collect the transitive successors of Succ (worklist grows as we iterate).
787 for (unsigned I = 0; I < Succs.size(); ++I) {
788 for (const SDep &SI : Succs[I]->Succs) {
789 const SUnit *SU = SI.getSUnit();
790 if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
// Walk Pred's predecessors; meeting a member of Succs means a path exists
// and the edge would form a cycle.
795 SmallPtrSet<const SUnit*, 32> Visited;
796 while (!Preds.empty()) {
797 const SUnit *SU = Preds.pop_back_val();
798 if (llvm::find(Succs, SU) != Succs.end())
801 for (const SDep &SI : SU->Preds)
802 if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
803 Preds.push_back(SI.getSUnit());
809 // Link as many SALU instructions in a chain as possible. Return the size
810 // of the chain. Links up to MaxChain instructions.
811 unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
812 SmallPtrSetImpl<SUnit *> &Visited) const {
813 SmallVector<SUnit *, 8> Worklist({To});
816 while (!Worklist.empty() && MaxChain-- > 0) {
817 SUnit *SU = Worklist.pop_back_val();
// Skip units already chained behind some MFMA.
818 if (!Visited.insert(SU).second)
821 LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
822 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
// Make SU depend on the MFMA (artificial edge, no required latency).
824 if (SU->addPred(SDep(From, SDep::Artificial), false))
// Also push the MFMA's VALU successors behind SU so the SALU work stays in
// front of them inside the shadow.
827 for (SDep &SI : From->Succs) {
828 SUnit *SUv = SI.getSUnit();
829 if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
830 SUv->addPred(SDep(SU, SDep::Artificial), false);
// Continue the chain through SU's SALU successors.
833 for (SDep &SI : SU->Succs) {
834 SUnit *Succ = SI.getSUnit();
835 if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
836 Worklist.push_back(Succ);
843 void apply(ScheduleDAGInstrs *DAGInstrs) override {
844 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
// Only relevant for targets with MAI (MFMA) instructions; honor the
// -amdgpu-disable-power-sched escape hatch.
845 if (!ST.hasMAIInsts() || DisablePowerSched)
847 DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
848 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
849 if (!TSchedModel || DAG->SUnits.empty())
852 // Scan for MFMA long latency instructions and try to add a dependency
853 // of available SALU instructions to give them a chance to fill MFMA
854 // shadow. That is desirable to fill MFMA shadow with SALU instructions
855 // rather than VALU to prevent power consumption bursts and throttle.
856 auto LastSALU = DAG->SUnits.begin();
857 auto E = DAG->SUnits.end();
858 SmallPtrSet<SUnit*, 32> Visited;
859 for (SUnit &SU : DAG->SUnits) {
860 MachineInstr &MAI = *SU.getInstr();
// ACCVGPR copies are MAI-classified but short; skip them.
861 if (!TII->isMAI(MAI) ||
862 MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
863 MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
866 unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
868 LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
869 dbgs() << "Need " << Lat
870 << " instructions to cover latency.\n");
872 // Find up to Lat independent scalar instructions as early as
873 // possible such that they can be scheduled after this MFMA.
874 for ( ; Lat && LastSALU != E; ++LastSALU) {
875 if (Visited.count(&*LastSALU))
878 if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
881 Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
// Register the MFMA shadow-filling mutation with the post-RA scheduler.
888 void GCNSubtarget::getPostRAMutations(
889 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
890 Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
// Fetch the common AMDGPUSubtarget base for MF, dispatching on the triple to
// the concrete GCN or R600 subtarget.
893 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
894 if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
895 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
897 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
// Same as above, but resolves the per-function subtarget from a
// TargetMachine (used before a MachineFunction exists).
900 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
901 if (TM.getTargetTriple().getArch() == Triple::amdgcn)
902 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
904 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));